diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 301f571f5fb..8bb8ab92a48 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -14,16 +14,14 @@ from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns -def hash_partition(source_table, object columns_to_hash, - int num_partitions, bool keep_index=True): +def hash_partition(list source_columns, object columns_to_hash, + int num_partitions): cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_table( - source_table, not keep_index - ) + cdef table_view c_source_view = table_view_from_columns(source_columns) cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result with nogil: @@ -36,27 +34,17 @@ def hash_partition(source_table, object columns_to_hash, ) # Note that the offsets (`c_result.second`) may be empty when - # the original table (`source_table`) is empty. We need to + # the original table (`source_columns`) is empty. We need to # return a list of zeros in this case. 
return ( - *data_from_unique_ptr( - move(c_result.first), - column_names=source_table._column_names, - index_names=( - source_table._index_names - if keep_index is True - else None - ) - - ), - list(c_result.second) if c_result.second.size() - else [0] * num_partitions + columns_from_unique_ptr(move(c_result.first)), + list(c_result.second) + if c_result.second.size() else [0] * num_partitions ) -def hash(source_table, str method, int seed=0): - cdef table_view c_source_view = table_view_from_table( - source_table, ignore_index=True) +def hash(list source_columns, str method, int seed=0): + cdef table_view c_source_view = table_view_from_columns(source_columns) cdef unique_ptr[column] c_result cdef cpp_hash_id c_hash_function if method == "murmur3": diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 06e287ee670..88c8b19ded0 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import cudf @@ -20,12 +20,12 @@ from cudf._lib.cpp.interop cimport ( ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def from_dlpack(dlpack_capsule): """ - Converts a DLPack Tensor PyCapsule into a cudf Frame object. + Converts a DLPack Tensor PyCapsule into a list of columns. DLPack Tensor PyCapsule is expected to have the name "dltensor". 
""" @@ -40,31 +40,25 @@ def from_dlpack(dlpack_capsule): cpp_from_dlpack(dlpack_tensor) ) - res = data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) - ) + res = columns_from_unique_ptr(move(c_result)) dlpack_tensor.deleter(dlpack_tensor) return res -def to_dlpack(source_table): +def to_dlpack(list source_columns): """ - Converts a cudf Frame into a DLPack Tensor PyCapsule. + Converts a list of columns into a DLPack Tensor PyCapsule. DLPack Tensor PyCapsule will have the name "dltensor". """ - for column in source_table._columns: - if column.null_count: - raise ValueError( - "Cannot create a DLPack tensor with null values. \ - Input is required to have null count as zero." - ) + if any(column.null_count for column in source_columns): + raise ValueError( + "Cannot create a DLPack tensor with null values. \ + Input is required to have null count as zero." + ) cdef DLManagedTensor *dlpack_tensor - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True - ) + cdef table_view source_table_view = table_view_from_columns(source_columns) with nogil: dlpack_tensor = cpp_to_dlpack( @@ -110,17 +104,14 @@ cdef vector[column_metadata] gather_metadata(object metadata) except *: raise ValueError("Malformed metadata has been encountered") -def to_arrow(input_table, - object metadata, - bool keep_index=True): - """Convert from cudf Frame to PyArrow Table. +def to_arrow(list source_columns, object metadata): + """Convert a list of columns from + cudf Frame to a PyArrow Table. 
Parameters ---------- - input_table : cudf table - column_names : names for the pyarrow arrays - field_names : field names for nested type arrays - keep_index : whether index needs to be part of arrow table + source_columns : a list of columns to convert + metadata : a list of metadata, see `gather_metadata` for layout Returns ------- @@ -128,9 +119,7 @@ def to_arrow(input_table, """ cdef vector[column_metadata] cpp_metadata = gather_metadata(metadata) - cdef table_view input_table_view = ( - table_view_from_table(input_table, not keep_index) - ) + cdef table_view input_table_view = table_view_from_columns(source_columns) cdef shared_ptr[CTable] cpp_arrow_table with nogil: @@ -141,22 +130,16 @@ def to_arrow(input_table, return pyarrow_wrap_table(cpp_arrow_table) -def from_arrow( - object input_table, - object column_names=None, - object index_names=None -): - """Convert from PyArrow Table to cudf Frame. +def from_arrow(object input_table): + """Convert from PyArrow Table to a list of columns. Parameters ---------- input_table : PyArrow table - column_names : names for the cudf table data columns - index_names : names for the cudf table index columns Returns ------- - cudf Frame + A list of columns to construct Frame object """ cdef shared_ptr[CTable] cpp_arrow_table = ( pyarrow_unwrap_table(input_table) @@ -166,8 +149,4 @@ def from_arrow( with nogil: c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) - return data_from_unique_ptr( - move(c_result), - column_names=column_names, - index_names=index_names - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 5921f06d36e..1baef266dab 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from itertools import chain @@ -16,31 +16,25 @@ from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport table_view_from_columns # The functions below return the *gathermaps* that represent # the join result when joining on the keys `lhs` and `rhs`. -cpdef join(lhs, rhs, how=None): +cpdef join(list lhs, list rhs, how=None): cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result - cdef table_view c_lhs = table_view_from_table(lhs) - cdef table_view c_rhs = table_view_from_table(rhs) + cdef table_view c_lhs = table_view_from_columns(lhs) + cdef table_view c_rhs = table_view_from_columns(rhs) if how == "inner": - c_result = move(cpp_join.inner_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.inner_join(c_lhs, c_rhs)) elif how == "left": - c_result = move(cpp_join.left_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.left_join(c_lhs, c_rhs)) elif how == "outer": - c_result = move(cpp_join.full_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.full_join(c_lhs, c_rhs)) else: raise ValueError(f"Invalid join type {how}") @@ -49,30 +43,23 @@ cpdef join(lhs, rhs, how=None): return left_rows, right_rows -cpdef semi_join(lhs, rhs, how=None): +cpdef semi_join(list lhs, list rhs, how=None): # left-semi and left-anti joins cdef cpp_join.gather_map_type c_result - cdef table_view c_lhs = table_view_from_table(lhs) - cdef table_view c_rhs = table_view_from_table(rhs) + cdef table_view c_lhs = table_view_from_columns(lhs) + cdef table_view c_rhs = table_view_from_columns(rhs) if how == "leftsemi": - c_result = move(cpp_join.left_semi_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.left_semi_join(c_lhs, c_rhs)) elif how == "leftanti": - c_result = 
move(cpp_join.left_anti_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.left_anti_join(c_lhs, c_rhs)) else: raise ValueError(f"Invalid join type {how}") cdef Column left_rows = _gather_map_as_column(move(c_result)) - return ( - left_rows, - None - ) + return left_rows, None cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map): diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 523686fafe6..e5a705ab603 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -42,7 +42,7 @@ from cudf.core.dtypes import ListDtype from cudf._lib.cpp.lists.contains cimport contains, index_of as cpp_index_of from cudf._lib.cpp.lists.extract cimport extract_list_element -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def count_elements(Column col): @@ -61,8 +61,10 @@ def count_elements(Column col): return result -def explode_outer(tbl, int explode_column_idx, bool ignore_index=False): - cdef table_view c_table_view = table_view_from_table(tbl, ignore_index) +def explode_outer( + list source_columns, int explode_column_idx +): + cdef table_view c_table_view = table_view_from_columns(source_columns) cdef size_type c_explode_column_idx = explode_column_idx cdef unique_ptr[table] c_result @@ -70,11 +72,7 @@ def explode_outer(tbl, int explode_column_idx, bool ignore_index=False): with nogil: c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx)) - return data_from_unique_ptr( - move(c_result), - column_names=tbl._column_names, - index_names=None if ignore_index else tbl._index_names - ) + return columns_from_unique_ptr(move(c_result)) def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): @@ -197,18 +195,17 @@ def index_of(Column col, object py_search_key): return Column.from_unique_ptr(move(c_result)) -def concatenate_rows(tbl): +def 
concatenate_rows(list source_columns): cdef unique_ptr[column] c_result - cdef table_view c_table_view = table_view_from_table(tbl) + cdef table_view c_table_view = table_view_from_columns(source_columns) with nogil: c_result = move(cpp_concatenate_rows( c_table_view, )) - result = Column.from_unique_ptr(move(c_result)) - return result + return Column.from_unique_ptr(move(c_result)) def concatenate_list_elements(Column input_column, dropna=False): diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index e53667e7589..f2f5a92aca1 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -11,21 +11,19 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.partitioning cimport partition as cpp_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count cimport cudf._lib.cpp.types as libcudf_types -def partition(source_table, Column partition_map, - object num_partitions, bool keep_index=True): +def partition(list source_columns, Column partition_map, + object num_partitions): if num_partitions is None: num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True) cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_table( - source_table, not keep_index - ) + cdef table_view c_source_view = table_view_from_columns(source_columns) cdef column_view c_partition_map_view = partition_map.view() @@ -40,13 +38,5 @@ def partition(source_table, Column partition_map, ) return ( - 
*data_from_unique_ptr( - move(c_result.first), - column_names=source_table._column_names, - index_names=source_table._index_names if( - keep_index is True) - else None - - ), - list(c_result.second) + columns_from_unique_ptr(move(c_result.first)), list(c_result.second) ) diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 497a71df89d..f65c29a55a8 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -31,7 +31,7 @@ from cudf._lib.cpp.types cimport ( order_info, sorted, ) -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def quantile( @@ -74,14 +74,13 @@ def quantile( return Column.from_unique_ptr(move(c_result)) -def quantiles(source_table, +def quantiles(list source_columns, vector[double] q, object interp, object is_input_sorted, list column_order, list null_precedence): - cdef table_view c_input = table_view_from_table( - source_table, ignore_index=True) + cdef table_view c_input = table_view_from_columns(source_columns) cdef vector[double] c_q = q cdef interpolation c_interp = ( interp @@ -119,7 +118,4 @@ def quantiles(source_table, ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx index d64d0543892..29223947eea 100644 --- a/python/cudf/cudf/_lib/reshape.pyx +++ b/python/cudf/cudf/_lib/reshape.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -13,32 +13,25 @@ from cudf._lib.cpp.reshape cimport ( from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns -def interleave_columns(source_table): - cdef table_view c_view = table_view_from_table( - source_table, ignore_index=True) +def interleave_columns(list source_columns): + cdef table_view c_view = table_view_from_columns(source_columns) cdef unique_ptr[column] c_result with nogil: c_result = move(cpp_interleave_columns(c_view)) - return Column.from_unique_ptr( - move(c_result) - ) + return Column.from_unique_ptr(move(c_result)) -def tile(source_table, size_type count): +def tile(list source_columns, size_type count): cdef size_type c_count = count - cdef table_view c_view = table_view_from_table(source_table) + cdef table_view c_view = table_view_from_columns(source_columns) cdef unique_ptr[table] c_result with nogil: c_result = move(cpp_tile(c_view, c_count)) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=source_table._index_names - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 32d6cb2ea6d..a7acfa8f906 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -65,6 +65,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_us, ) from cudf._lib.utils cimport ( + columns_from_table_view, data_from_table_view, table_view_from_columns, table_view_from_table, @@ -361,8 +362,8 @@ cdef _set_struct_from_pydict(unique_ptr[scalar]& s, names=columns ) - data, _ = from_arrow(pyarrow_table, column_names=columns) - cdef table_view struct_view = 
table_view_from_columns(data.values()) + data = from_arrow(pyarrow_table) + cdef table_view struct_view = table_view_from_columns(data) s.reset( new struct_scalar(struct_view, valid) @@ -373,18 +374,10 @@ cdef _get_py_dict_from_struct(unique_ptr[scalar]& s): return cudf.NA cdef table_view struct_table_view = (s.get()).view() - columns = [str(i) for i in range(struct_table_view.num_columns())] + column_names = [str(i) for i in range(struct_table_view.num_columns())] - data, _ = data_from_table_view( - struct_table_view, - None, - column_names=columns - ) - to_arrow_table = cudf.core.frame.Frame( - cudf.core.column_accessor.ColumnAccessor(data) - ) - - python_dict = to_arrow(to_arrow_table, columns).to_pydict() + columns = columns_from_table_view(struct_table_view, None) + python_dict = to_arrow(columns, column_names).to_pydict() return {k: _nested_na_replace(python_dict[k])[0] for k in python_dict} @@ -415,9 +408,8 @@ cdef _get_py_list_from_list(unique_ptr[scalar]& s): cdef column_view list_col_view = (s.get()).view() cdef Column list_col = Column.from_column_view(list_col_view, None) - to_arrow_table = cudf.core.frame.Frame({"col": list_col}) - arrow_table = to_arrow(to_arrow_table, [["col", []]]) + arrow_table = to_arrow([list_col], [["col", []]]) result = arrow_table['col'].to_pylist() return _nested_na_replace(result) diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx index f92ef753fc2..d5568f53231 100644 --- a/python/cudf/cudf/_lib/search.pyx +++ b/python/cudf/cudf/_lib/search.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -10,20 +10,20 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport table_view_from_columns def search_sorted( - table, values, side, ascending=True, na_position="last" + list source, list values, side, ascending=True, na_position="last" ): """Find indices where elements should be inserted to maintain order Parameters ---------- - table : Frame - Frame to search in - values : Frame - Frame of values to search for + source : list of columns + List of columns to search in + values : List of columns + List of value columns to search for side : str {‘left’, ‘right’} optional If ‘left’, the index of the first suitable location is given. If ‘right’, return the last such index @@ -33,10 +33,8 @@ def search_sorted( cdef vector[libcudf_types.null_order] c_null_precedence cdef libcudf_types.order c_order cdef libcudf_types.null_order c_null_order - cdef table_view c_table_data = table_view_from_table( - table, ignore_index=True) - cdef table_view c_values_data = table_view_from_table( - values, ignore_index=True) + cdef table_view c_table_data = table_view_from_columns(source) + cdef table_view c_values_data = table_view_from_columns(values) # Note: We are ignoring index columns here c_order = (libcudf_types.order.ASCENDING @@ -47,9 +45,9 @@ def search_sorted( if na_position=="last" else libcudf_types.null_order.BEFORE ) - c_column_order = vector[libcudf_types.order](table._num_columns, c_order) + c_column_order = vector[libcudf_types.order](len(source), c_order) c_null_precedence = vector[libcudf_types.null_order]( - table._num_columns, c_null_order + len(source), c_null_order ) if side == 'left': diff --git a/python/cudf/cudf/_lib/strings/combine.pyx 
b/python/cudf/cudf/_lib/strings/combine.pyx index 3b5ef33a668..eeb39f70728 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -18,10 +18,10 @@ from cudf._lib.cpp.strings.combine cimport ( from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport table_view_from_columns -def concatenate(source_strings, +def concatenate(list source_strings, object sep, object na_rep): """ @@ -33,8 +33,7 @@ def concatenate(source_strings, cdef DeviceScalar narep = na_rep.device_value cdef unique_ptr[column] c_result - cdef table_view source_view = table_view_from_table( - source_strings, ignore_index=True) + cdef table_view source_view = table_view_from_columns(source_strings) cdef const string_scalar* scalar_separator = \ (separator.get_raw_ptr()) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 96d25cb92c9..175150b6865 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import numpy as np from numba.np import numpy_support @@ -25,9 +25,9 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id from cudf._lib.types cimport underlying_type_t_type_id from cudf._lib.utils cimport ( + columns_from_unique_ptr, data_from_table_view, - data_from_unique_ptr, - table_view_from_table, + table_view_from_columns, ) @@ -123,21 +123,15 @@ def transform(Column input, op): return Column.from_unique_ptr(move(c_output)) -def table_encode(input): - cdef table_view c_input = table_view_from_table( - input, ignore_index=True) +def table_encode(list source_columns): + cdef table_view c_input = table_view_from_columns(source_columns) cdef pair[unique_ptr[table], unique_ptr[column]] c_result with nogil: c_result = move(libcudf_transform.encode(c_input)) - return ( - *data_from_unique_ptr( - move(c_result.first), - column_names=input._column_names, - ), - Column.from_unique_ptr(move(c_result.second)) - ) + return columns_from_unique_ptr( + move(c_result.first)), Column.from_unique_ptr(move(c_result.second)) def one_hot_encode(Column input_column, Column categories): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b2e3e42531b..5c9d8535798 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -229,13 +229,9 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow( - cudf.core.frame.Frame( - cudf.core.column_accessor.ColumnAccessor({"None": self}) - ), - [["None"]], - keep_index=False, - )["None"].chunk(0) + return libcudf.interop.to_arrow([self], [["None"]],)[ + "None" + ].chunk(0) @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: @@ -280,12 +276,8 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: } ) - codes = libcudf.interop.from_arrow( - indices_table, indices_table.column_names - )[0]["None"] - categories = libcudf.interop.from_arrow( - 
dictionaries_table, dictionaries_table.column_names - )[0]["None"] + codes = libcudf.interop.from_arrow(indices_table)[0] + categories = libcudf.interop.from_arrow(dictionaries_table)[0] return build_categorical_column( categories=categories, @@ -301,7 +293,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: ): return cudf.core.column.IntervalColumn.from_arrow(array) - result = libcudf.interop.from_arrow(data, data.column_names)[0]["None"] + result = libcudf.interop.from_arrow(data)[0] return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 8578bfe8147..b383f7bc321 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -113,9 +113,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: return NotImplemented if isinstance(other.dtype, ListDtype): if op == "__add__": - return concatenate_rows( - cudf.core.frame.Frame({0: self, 1: other}) - ) + return concatenate_rows([self, other]) else: raise NotImplementedError( "Lists concatenation for this operation is not yet" diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d5d45c341d5..6f4a6334a1d 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -365,9 +365,7 @@ def cat(self, others=None, sep=None, na_rep=None): other_cols = _get_cols_list(self._parent, others) all_cols = [self._column] + other_cols data = libstrings.concatenate( - cudf.DataFrame( - {index: value for index, value in enumerate(all_cols)} - ), + all_cols, cudf.Scalar(sep), cudf.Scalar(na_rep, "str"), ) @@ -5531,7 +5529,7 @@ def _binaryop( return cast( "column.ColumnBase", libstrings.concatenate( - cudf.DataFrame._from_data(data={0: lhs, 1: rhs}), + [lhs, rhs], sep=cudf.Scalar(""), na_rep=cudf.Scalar(None, "str"), ), diff --git a/python/cudf/cudf/core/dataframe.py 
b/python/cudf/cudf/core/dataframe.py index ae60cd91fac..17b46c0e34e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3937,19 +3937,16 @@ def partition_by_hash(self, columns, nparts, keep_index=True): ------- partitioned: list of DataFrame """ - idx = ( - 0 - if (self._index is None or keep_index is False) - else self._index._num_columns - ) - key_indices = [self._data.names.index(k) + idx for k in columns] - output_data, output_index, offsets = libcudf.hash.hash_partition( - self, key_indices, nparts, keep_index + key_indices = [self._column_names.index(k) for k in columns] + output_columns, offsets = libcudf.hash.hash_partition( + [*self._columns], key_indices, nparts + ) + outdf = self._from_columns_like_self( + [*(self._index._columns if keep_index else ()), *output_columns], + self._column_names, + self._index_names if keep_index else None, ) - outdf = self.__class__._from_data(output_data, output_index) - outdf._copy_type_metadata(self, include_index=keep_index) - # Slice into partition return [outdf[s:e] for s, e in zip(offsets, offsets[1:] + [None])] @@ -5682,22 +5679,24 @@ def stack(self, level=-1, dropna=True): """ assert level in (None, -1) repeated_index = self.index.repeat(self.shape[1]) - name_index = cudf.DataFrame._from_data({0: self._column_names}).tile( - self.shape[0] + name_index = libcudf.reshape.tile( + [as_column(self._column_names)], self.shape[0] ) - new_index = list(repeated_index._columns) + [name_index._columns[0]] + new_index_columns = [*repeated_index._columns, *name_index] if isinstance(self._index, MultiIndex): index_names = self._index.names + [None] else: - index_names = [None] * len(new_index) + index_names = [None] * len(new_index_columns) new_index = MultiIndex.from_frame( - DataFrame(dict(zip(range(0, len(new_index)), new_index))), + DataFrame._from_data( + dict(zip(range(0, len(new_index_columns)), new_index_columns)) + ), names=index_names, ) # Collect datatypes and cast columns as 
that type common_type = np.result_type(*self.dtypes) - homogenized = DataFrame( + homogenized = DataFrame._from_data( { c: ( self._data[c].astype(common_type) @@ -5708,9 +5707,15 @@ def stack(self, level=-1, dropna=True): } ) - data_col = libcudf.reshape.interleave_columns(homogenized) + result = Series._from_data( + { + None: libcudf.reshape.interleave_columns( + [*homogenized._columns] + ) + }, + index=new_index, + ) - result = Series(data=data_col, index=new_index) if dropna: return result.dropna() else: @@ -6171,6 +6176,48 @@ def _from_columns_like_self( result._set_column_names_like(self) return result + @_cudf_nvtx_annotate + def interleave_columns(self): + """ + Interleave Series columns of a table into a single column. + + Converts the column major table `cols` into a row major column. + + Parameters + ---------- + cols : input Table containing columns to interleave. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({0: ['A1', 'A2', 'A3'], 1: ['B1', 'B2', 'B3']}) + >>> df + 0 1 + 0 A1 B1 + 1 A2 B2 + 2 A3 B3 + >>> df.interleave_columns() + 0 A1 + 1 B1 + 2 A2 + 3 B2 + 4 A3 + 5 B3 + dtype: object + + Returns + ------- + The interleaved columns as a single column + """ + if ("category" == self.dtypes).any(): + raise ValueError( + "interleave_columns does not support 'category' dtype." + ) + + return self._constructor_sliced._from_data( + {None: libcudf.reshape.interleave_columns([*self._columns])} + ) + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 806cdf14c71..d10f7c690bf 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -960,10 +960,16 @@ def scatter_by_map( f"ERROR: map_size must be >= {count} (got {map_size})." 
) - data, index, output_offsets = libcudf.partitioning.partition( - self, map_index, map_size, keep_index + partitioned_columns, output_offsets = libcudf.partitioning.partition( + [*(self._index._columns if keep_index else ()), *self._columns], + map_index, + map_size, + ) + partitioned = self._from_columns_like_self( + partitioned_columns, + column_names=self._column_names, + index_names=self._index_names if keep_index else None, ) - partitioned = self.__class__._from_data(data, index) # due to the split limitation mentioned # here: https://github.com/rapidsai/cudf/issues/4607 @@ -973,9 +979,6 @@ def scatter_by_map( result = partitioned._split(output_offsets, keep_index=keep_index) - for frame in result: - frame._copy_type_metadata(self, include_index=keep_index) - if map_size: result += [ self._empty_like(keep_index) @@ -1274,20 +1277,18 @@ def _quantiles( libcudf.types.NullOrder[key] for key in null_precedence ] - result = self.__class__._from_data( - *libcudf.quantiles.quantiles( - self, + return self._from_columns_like_self( + libcudf.quantiles.quantiles( + [*self._columns], q, interpolation, is_sorted, column_order, null_precedence, - ) + ), + column_names=self._column_names, ) - result._copy_type_metadata(self) - return result - @_cudf_nvtx_annotate def rank( self, @@ -1466,30 +1467,33 @@ def from_arrow(cls, data): dict_indices_table = pa.table(dict_indices) data = data.drop(dict_indices_table.column_names) - cudf_indices_frame, _ = libcudf.interop.from_arrow( - dict_indices_table, dict_indices_table.column_names - ) + indices_columns = libcudf.interop.from_arrow(dict_indices_table) # as dictionary size can vary, it can't be a single table cudf_dictionaries_columns = { name: ColumnBase.from_arrow(dict_dictionaries[name]) for name in dict_dictionaries.keys() } - for name, codes in cudf_indices_frame.items(): - cudf_category_frame[name] = build_categorical_column( + cudf_category_frame = { + name: build_categorical_column( 
cudf_dictionaries_columns[name], codes, mask=codes.base_mask, size=codes.size, ordered=dict_ordered[name], ) + for name, codes in zip( + dict_indices_table.column_names, indices_columns + ) + } # Handle non-dict arrays - cudf_non_category_frame = ( - {} - if data.num_columns == 0 - else libcudf.interop.from_arrow(data, data.column_names)[0] - ) + cudf_non_category_frame = { + name: col + for name, col in zip( + data.column_names, libcudf.interop.from_arrow(data) + ) + } result = {**cudf_non_category_frame, **cudf_category_frame} @@ -2027,76 +2031,6 @@ def notnull(self): # Alias for notnull notna = notnull - @_cudf_nvtx_annotate - def interleave_columns(self): - """ - Interleave Series columns of a table into a single column. - - Converts the column major table `cols` into a row major column. - - Parameters - ---------- - cols : input Table containing columns to interleave. - - Examples - -------- - >>> df = DataFrame([['A1', 'A2', 'A3'], ['B1', 'B2', 'B3']]) - >>> df - 0 [A1, A2, A3] - 1 [B1, B2, B3] - >>> df.interleave_columns() - 0 A1 - 1 B1 - 2 A2 - 3 B2 - 4 A3 - 5 B3 - - Returns - ------- - The interleaved columns as a single column - """ - if ("category" == self.dtypes).any(): - raise ValueError( - "interleave_columns does not support 'category' dtype." - ) - - result = self._constructor_sliced( - libcudf.reshape.interleave_columns(self) - ) - - return result - - @_cudf_nvtx_annotate - def tile(self, count): - """ - Repeats the rows from `self` DataFrame `count` times to form a - new DataFrame. - - Parameters - ---------- - self : input Table containing columns to interleave. - count : Number of times to tile "rows". Must be non-negative. - - Examples - -------- - >>> df = Dataframe([[8, 4, 7], [5, 2, 3]]) - >>> count = 2 - >>> df.tile(df, count) - 0 1 2 - 0 8 4 7 - 1 5 2 3 - 0 8 4 7 - 1 5 2 3 - - Returns - ------- - The table containing the tiled "rows". 
- """ - result = self.__class__._from_data(*libcudf.reshape.tile(self, count)) - result._copy_type_metadata(self) - return result - @_cudf_nvtx_annotate def searchsorted( self, values, side="left", ascending=True, na_position="last" @@ -2166,12 +2100,24 @@ def searchsorted( scalar_flag = True if not isinstance(values, Frame): - values = as_column(values) - if values.dtype != self.dtype: - self = self.astype(values.dtype) - values = values.as_frame() + values = [as_column(values)] + else: + values = [*values._columns] + if len(values) != len(self._data): + raise ValueError("Mismatch number of columns to search for.") + + sources = [ + col + if is_dtype_equal(col.dtype, val.dtype) + else col.astype(val.dtype) + for col, val in zip(self._columns, values) + ] outcol = libcudf.search.search_sorted( - self, values, side, ascending=ascending, na_position=na_position + sources, + values, + side, + ascending=ascending, + na_position=na_position, ) # Retrun result as cupy array if the values is non-scalar @@ -2462,10 +2408,8 @@ def _split(self, splits): @_cudf_nvtx_annotate def _encode(self): - data, index, indices = libcudf.transform.table_encode(self) - for name, col in data.items(): - data[name] = col._with_type_metadata(self._data[name].dtype) - keys = self.__class__._from_data(data, index) + columns, indices = libcudf.transform.table_encode([*self._columns]) + keys = self._from_columns_like_self(columns) return keys, indices @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6b98e82d553..390af3d076c 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1104,16 +1104,11 @@ def _cov_or_corr(self, func, method_name): for i in range(0, len(cols_list), num_cols) ] - def combine_columns(gb_cov_corr, ys): - list_of_columns = [gb_cov_corr._data[y] for y in ys] - frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys) - return 
interleave_columns(frame) - # interleave: combines the correlation or covariance results for each # column-pair into a single column res = cudf.DataFrame._from_data( { - x: combine_columns(gb_cov_corr, ys) + x: interleave_columns([gb_cov_corr._data[y] for y in ys]) for ys, x in zip(cols_split, column_names) } ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index aff13025e72..fd918f723fe 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -76,10 +76,10 @@ def _lexsorted_equal_range( sort_inds = None sort_vals = idx lower_bound = search_sorted( - sort_vals, key_as_table, side="left" + [*sort_vals._data.columns], [*key_as_table._columns], side="left" ).element_indexing(0) upper_bound = search_sorted( - sort_vals, key_as_table, side="right" + [*sort_vals._data.columns], [*key_as_table._columns], side="right" ).element_indexing(0) return lower_bound, upper_bound, sort_inds diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 10736948b57..68c9a429227 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -818,7 +818,8 @@ def hash_values(self, method="murmur3"): # calculation, necessitating the unfortunate circular reference to the # child class here. 
return cudf.Series._from_data(
-            {None: libcudf.hash.hash(self, method)}, index=self.index
+            {None: libcudf.hash.hash([*self._columns], method)},
+            index=self.index,
         )
 
     def _gather(
@@ -2688,21 +2689,52 @@ def _explode(self, explode_column: Any, ignore_index: bool):
         if not ignore_index and self._index is not None:
             explode_column_num += self._index.nlevels
 
-        data, index = libcudf.lists.explode_outer(
-            self, explode_column_num, ignore_index
+        exploded = libcudf.lists.explode_outer(
+            [
+                *(self._index._data.columns if not ignore_index else ()),
+                *self._columns,
+            ],
+            explode_column_num,
         )
-        res = self.__class__._from_data(
-            ColumnAccessor(
-                data,
-                multiindex=self._data.multiindex,
-                level_names=self._data._level_names,
-            ),
-            index=index,
+
+        return self._from_columns_like_self(
+            exploded,
+            self._column_names,
+            self._index_names if not ignore_index else None,
         )
-        if not ignore_index and self._index is not None:
-            res.index.names = self._index.names
-        return res
+
+    @_cudf_nvtx_annotate
+    def tile(self, count):
+        """Repeats the rows `count` times to form a new Frame.
+
+        Parameters
+        ----------
+        self : input Table containing columns whose rows are repeated.
+        count : Number of times to tile "rows". Must be non-negative.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame([[8, 4, 7], [5, 2, 3]])
+        >>> count = 2
+        >>> df.tile(count)
+             0  1  2
+        0    8  4  7
+        1    5  2  3
+        0    8  4  7
+        1    5  2  3
+
+        Returns
+        -------
+        The indexed frame containing the tiled "rows".
+ """ + return self._from_columns_like_self( + libcudf.reshape.tile( + [*self._index._columns, *self._columns], count + ), + column_names=self._column_names, + index_names=self._index_names, + ) @_cudf_nvtx_annotate @docutils.doc_apply( diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index c7e46cf0165..6a495ef8d9a 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -177,15 +177,15 @@ def __init__( ) def perform_merge(self) -> Frame: - left_join_cols = {} - right_join_cols = {} + left_join_cols = [] + right_join_cols = [] for left_key, right_key in zip(self._left_keys, self._right_keys): lcol = left_key.get(self.lhs) rcol = right_key.get(self.rhs) lcol_casted, rcol_casted = _match_join_keys(lcol, rcol, self.how) - left_join_cols[left_key.name] = lcol_casted - right_join_cols[left_key.name] = rcol_casted + left_join_cols.append(lcol_casted) + right_join_cols.append(rcol_casted) # Categorical dtypes must be cast back from the underlying codes # type that was returned by _match_join_keys. @@ -201,8 +201,8 @@ def perform_merge(self) -> Frame: right_key.set(self.rhs, rcol_casted, validate=False) left_rows, right_rows = self._joiner( - cudf.core.frame.Frame(left_join_cols), - cudf.core.frame.Frame(right_join_cols), + left_join_cols, + right_join_cols, how=self.how, ) diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 00a2cb4cee2..644643db83c 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import cudf @@ -34,12 +34,13 @@ def from_dlpack(pycapsule_obj): tensor is row-major, transpose it before passing it to this function. 
""" - data, _ = libdlpack.from_dlpack(pycapsule_obj) + columns = libdlpack.from_dlpack(pycapsule_obj) + column_names = range(len(columns)) - if len(data) == 1: - return cudf.Series._from_data(data) + if len(columns) == 1: + return cudf.Series._from_columns(columns, column_names=column_names) else: - return cudf.DataFrame._from_data(data) + return cudf.DataFrame._from_columns(columns, column_names=column_names) @ioutils.doc_to_dlpack() @@ -91,4 +92,4 @@ def to_dlpack(cudf_obj): ) gdf = gdf.astype(dtype) - return libdlpack.to_dlpack(gdf) + return libdlpack.to_dlpack([*gdf._columns]) diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index cd029d02d79..d3433a589a7 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import cupy import numpy as np import pandas as pd @@ -73,6 +73,14 @@ def test_searchsorted_dataframe(side, multiindex): assert result == [2, 0, 4, 1] +def test_search_sorted_dataframe_unequal_number_of_columns(): + values = cudf.DataFrame({"a": [1, 0, 5, 1]}) + base = cudf.DataFrame({"a": [1, 0, 5, 1], "b": ["x", "z", "w", "a"]}) + + with pytest.raises(ValueError, match="Mismatch number of columns"): + base.searchsorted(values) + + @pytest.mark.parametrize("side", ["left", "right"]) def test_searchsorted_categorical(side):