From 01a4e749abd744a6920919c6ce13bff0c009bfce Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 18 Mar 2022 12:54:19 -0700 Subject: [PATCH 01/21] Rolling out to hash.pyx --- python/cudf/cudf/_lib/hash.pyx | 32 ++++++++------------------ python/cudf/cudf/core/dataframe.py | 19 +++++++-------- python/cudf/cudf/core/indexed_frame.py | 3 ++- 3 files changed, 20 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 301f571f5fb..8bb8ab92a48 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -14,16 +14,14 @@ from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns -def hash_partition(source_table, object columns_to_hash, - int num_partitions, bool keep_index=True): +def hash_partition(list source_columns, object columns_to_hash, + int num_partitions): cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_table( - source_table, not keep_index - ) + cdef table_view c_source_view = table_view_from_columns(source_columns) cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result with nogil: @@ -36,27 +34,17 @@ def hash_partition(source_table, object columns_to_hash, ) # Note that the offsets (`c_result.second`) may be empty when - # the original table (`source_table`) is empty. We need to + # the original table (`source_columns`) is empty. We need to # return a list of zeros in this case. 
return ( - *data_from_unique_ptr( - move(c_result.first), - column_names=source_table._column_names, - index_names=( - source_table._index_names - if keep_index is True - else None - ) - - ), - list(c_result.second) if c_result.second.size() - else [0] * num_partitions + columns_from_unique_ptr(move(c_result.first)), + list(c_result.second) + if c_result.second.size() else [0] * num_partitions ) -def hash(source_table, str method, int seed=0): - cdef table_view c_source_view = table_view_from_table( - source_table, ignore_index=True) +def hash(list source_columns, str method, int seed=0): + cdef table_view c_source_view = table_view_from_columns(source_columns) cdef unique_ptr[column] c_result cdef cpp_hash_id c_hash_function if method == "murmur3": diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 57d591dd3e7..006d09366ea 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3877,19 +3877,16 @@ def partition_by_hash(self, columns, nparts, keep_index=True): ------- partitioned: list of DataFrame """ - idx = ( - 0 - if (self._index is None or keep_index is False) - else self._index._num_columns - ) - key_indices = [self._data.names.index(k) + idx for k in columns] - output_data, output_index, offsets = libcudf.hash.hash_partition( - self, key_indices, nparts, keep_index + key_indices = [self._column_names.index(k) for k in columns] + output_columns, offsets = libcudf.hash.hash_partition( + [*self._columns], key_indices, nparts + ) + outdf = self._from_columns_like_self( + [*(self._index._columns if keep_index else ()), *output_columns], + self._column_names, + self._index_names if keep_index else None, ) - outdf = self.__class__._from_data(output_data, output_index) - outdf._copy_type_metadata(self, include_index=keep_index) - # Slice into partition return [outdf[s:e] for s, e in zip(offsets, offsets[1:] + [None])] diff --git a/python/cudf/cudf/core/indexed_frame.py 
b/python/cudf/cudf/core/indexed_frame.py index 3fa951241f7..dced49016fd 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -617,7 +617,8 @@ def hash_values(self, method="murmur3"): # calculation, necessitating the unfortunate circular reference to the # child class here. return cudf.Series._from_data( - {None: libcudf.hash.hash(self, method)}, index=self.index + {None: libcudf.hash.hash([*self._columns], method)}, + index=self.index, ) def _gather( From e514c10bf9015dea2db1370ba726113f6665068b Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 18 Mar 2022 14:00:49 -0700 Subject: [PATCH 02/21] Rolling out to interop.pyx --- python/cudf/cudf/_lib/interop.pyx | 54 ++++++++------------------ python/cudf/cudf/core/column/column.py | 18 ++------- python/cudf/cudf/core/frame.py | 23 ++++++----- python/cudf/cudf/io/dlpack.py | 13 ++++--- 4 files changed, 40 insertions(+), 68 deletions(-) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 06e287ee670..26bb2e868f5 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import cudf @@ -20,7 +20,7 @@ from cudf._lib.cpp.interop cimport ( ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def from_dlpack(dlpack_capsule): @@ -40,31 +40,25 @@ def from_dlpack(dlpack_capsule): cpp_from_dlpack(dlpack_tensor) ) - res = data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) - ) + res = columns_from_unique_ptr(move(c_result)) dlpack_tensor.deleter(dlpack_tensor) return res -def to_dlpack(source_table): +def to_dlpack(list source_columns): """ Converts a cudf Frame into a DLPack Tensor PyCapsule. DLPack Tensor PyCapsule will have the name "dltensor". """ - for column in source_table._columns: - if column.null_count: - raise ValueError( - "Cannot create a DLPack tensor with null values. \ - Input is required to have null count as zero." - ) + if any(column.null_count for column in source_columns): + raise ValueError( + "Cannot create a DLPack tensor with null values. \ + Input is required to have null count as zero." + ) cdef DLManagedTensor *dlpack_tensor - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True - ) + cdef table_view source_table_view = table_view_from_columns(source_columns) with nogil: dlpack_tensor = cpp_to_dlpack( @@ -110,17 +104,13 @@ cdef vector[column_metadata] gather_metadata(object metadata) except *: raise ValueError("Malformed metadata has been encountered") -def to_arrow(input_table, - object metadata, - bool keep_index=True): +def to_arrow(list source_columns, object metadata): """Convert from cudf Frame to PyArrow Table. 
Parameters ---------- - input_table : cudf table - column_names : names for the pyarrow arrays - field_names : field names for nested type arrays - keep_index : whether index needs to be part of arrow table + source_columns : a list of columns to convert + metadata : a list of metadata, see `gather_metadata` for layout Returns ------- @@ -128,9 +118,7 @@ def to_arrow(input_table, """ cdef vector[column_metadata] cpp_metadata = gather_metadata(metadata) - cdef table_view input_table_view = ( - table_view_from_table(input_table, not keep_index) - ) + cdef table_view input_table_view = table_view_from_columns(source_columns) cdef shared_ptr[CTable] cpp_arrow_table with nogil: @@ -141,18 +129,12 @@ def to_arrow(input_table, return pyarrow_wrap_table(cpp_arrow_table) -def from_arrow( - object input_table, - object column_names=None, - object index_names=None -): +def from_arrow(object input_table): """Convert from PyArrow Table to cudf Frame. Parameters ---------- input_table : PyArrow table - column_names : names for the cudf table data columns - index_names : names for the cudf table index columns Returns ------- @@ -166,8 +148,4 @@ def from_arrow( with nogil: c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) - return data_from_unique_ptr( - move(c_result), - column_names=column_names, - index_names=index_names - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1c1845373e1..08f460da48e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -227,13 +227,7 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow( - cudf.core.frame.Frame( - cudf.core.column_accessor.ColumnAccessor({"None": self}) - ), - [["None"]], - keep_index=False, - )["None"].chunk(0) + return libcudf.interop.to_arrow([self], [["None"]],)["None"].chunk(0) @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: @@ -278,12 
+272,8 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: } ) - codes = libcudf.interop.from_arrow( - indices_table, indices_table.column_names - )[0]["None"] - categories = libcudf.interop.from_arrow( - dictionaries_table, dictionaries_table.column_names - )[0]["None"] + codes = libcudf.interop.from_arrow(indices_table)[0] + categories = libcudf.interop.from_arrow(dictionaries_table)[0] return build_categorical_column( categories=categories, @@ -299,7 +289,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: ): return cudf.core.column.IntervalColumn.from_arrow(array) - result = libcudf.interop.from_arrow(data, data.column_names)[0]["None"] + result = libcudf.interop.from_arrow(data)[0] return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 07cc3ea71cd..98c052bbc34 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1727,30 +1727,33 @@ def from_arrow(cls, data): dict_indices_table = pa.table(dict_indices) data = data.drop(dict_indices_table.column_names) - cudf_indices_frame, _ = libcudf.interop.from_arrow( - dict_indices_table, dict_indices_table.column_names - ) + indices_columns = libcudf.interop.from_arrow(dict_indices_table) # as dictionary size can vary, it can't be a single table cudf_dictionaries_columns = { name: ColumnBase.from_arrow(dict_dictionaries[name]) for name in dict_dictionaries.keys() } - for name, codes in cudf_indices_frame.items(): - cudf_category_frame[name] = build_categorical_column( + cudf_category_frame = { + name: build_categorical_column( cudf_dictionaries_columns[name], codes, mask=codes.base_mask, size=codes.size, ordered=dict_ordered[name], ) + for name, codes in zip( + dict_indices_table.column_names, indices_columns + ) + } # Handle non-dict arrays - cudf_non_category_frame = ( - {} - if data.num_columns == 0 - else libcudf.interop.from_arrow(data, data.column_names)[0] - ) + 
cudf_non_category_frame = { + name: col + for name, col in zip( + data.column_names, libcudf.interop.from_arrow(data) + ) + } result = {**cudf_non_category_frame, **cudf_category_frame} diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 00a2cb4cee2..644643db83c 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import cudf @@ -34,12 +34,13 @@ def from_dlpack(pycapsule_obj): tensor is row-major, transpose it before passing it to this function. """ - data, _ = libdlpack.from_dlpack(pycapsule_obj) + columns = libdlpack.from_dlpack(pycapsule_obj) + column_names = range(len(columns)) - if len(data) == 1: - return cudf.Series._from_data(data) + if len(columns) == 1: + return cudf.Series._from_columns(columns, column_names=column_names) else: - return cudf.DataFrame._from_data(data) + return cudf.DataFrame._from_columns(columns, column_names=column_names) @ioutils.doc_to_dlpack() @@ -91,4 +92,4 @@ def to_dlpack(cudf_obj): ) gdf = gdf.astype(dtype) - return libdlpack.to_dlpack(gdf) + return libdlpack.to_dlpack([*gdf._columns]) From cb1a1a94c8e24e39636151abd13e78d056cda99a Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 18 Mar 2022 14:16:04 -0700 Subject: [PATCH 03/21] Rolling out to join.pyx --- python/cudf/cudf/_lib/join.pyx | 46 +++++++++--------------------- python/cudf/cudf/core/join/join.py | 12 ++++---- 2 files changed, 19 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 5921f06d36e..1ff6daaae52 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from itertools import chain @@ -16,31 +16,22 @@ from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport table_view_from_columns # The functions below return the *gathermaps* that represent # the join result when joining on the keys `lhs` and `rhs`. -cpdef join(lhs, rhs, how=None): +cpdef join(list lhs, list rhs, how=None): cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result - cdef table_view c_lhs = table_view_from_table(lhs) - cdef table_view c_rhs = table_view_from_table(rhs) + cdef table_view c_lhs = table_view_from_columns(lhs) + cdef table_view c_rhs = table_view_from_columns(rhs) if how == "inner": - c_result = move(cpp_join.inner_join( - c_lhs, - c_rhs - )) + c_result = move(cpp_join.inner_join(c_lhs, c_rhs)) elif how == "left": - c_result = move(cpp_join.left_join( - c_lhs, - c_rhs - )) + c_result = move(cpp_join.left_join(c_lhs, c_rhs)) elif how == "outer": - c_result = move(cpp_join.full_join( - c_lhs, - c_rhs - )) + c_result = move(cpp_join.full_join(c_lhs, c_rhs)) else: raise ValueError(f"Invalid join type {how}") @@ -49,30 +40,21 @@ cpdef join(lhs, rhs, how=None): return left_rows, right_rows -cpdef semi_join(lhs, rhs, how=None): +cpdef semi_join(list lhs, list rhs, how=None): # left-semi and left-anti joins cdef cpp_join.gather_map_type c_result - cdef table_view c_lhs = table_view_from_table(lhs) - cdef table_view c_rhs = table_view_from_table(rhs) + cdef table_view c_lhs = table_view_from_columns(lhs) + cdef table_view c_rhs = table_view_from_columns(rhs) if how == "leftsemi": - c_result = move(cpp_join.left_semi_join( - c_lhs, - c_rhs - )) + c_result = move(cpp_join.left_semi_join(c_lhs, c_rhs)) elif how == "leftanti": - c_result = move(cpp_join.left_anti_join( - c_lhs, - c_rhs - )) + c_result = 
move(cpp_join.left_anti_join(c_lhs, c_rhs)) else: raise ValueError(f"Invalid join type {how}") cdef Column left_rows = _gather_map_as_column(move(c_result)) - return ( - left_rows, - None - ) + return left_rows, None cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map): diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index c7e46cf0165..c3ccf91efe7 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -177,15 +177,15 @@ def __init__( ) def perform_merge(self) -> Frame: - left_join_cols = {} - right_join_cols = {} + left_join_cols = [] + right_join_cols = [] for left_key, right_key in zip(self._left_keys, self._right_keys): lcol = left_key.get(self.lhs) rcol = right_key.get(self.rhs) lcol_casted, rcol_casted = _match_join_keys(lcol, rcol, self.how) - left_join_cols[left_key.name] = lcol_casted - right_join_cols[left_key.name] = rcol_casted + left_join_cols.append(lcol_casted) + right_join_cols.append(rcol_casted) # Categorical dtypes must be cast back from the underlying codes # type that was returned by _match_join_keys. 
@@ -201,9 +201,7 @@ def perform_merge(self) -> Frame: right_key.set(self.rhs, rcol_casted, validate=False) left_rows, right_rows = self._joiner( - cudf.core.frame.Frame(left_join_cols), - cudf.core.frame.Frame(right_join_cols), - how=self.how, + left_join_cols, right_join_cols, how=self.how, ) gather_index = self._using_left_index or self._using_right_index From 30d6b78e2c2bf157f2b2b4891350749a8c5ea33e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 18 Mar 2022 14:42:12 -0700 Subject: [PATCH 04/21] Rolling out to partitioning.pyx --- python/cudf/cudf/_lib/partitioning.pyx | 22 ++++++---------------- python/cudf/cudf/core/frame.py | 15 +++++++++------ 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index e53667e7589..f2f5a92aca1 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -11,21 +11,19 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.partitioning cimport partition as cpp_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count cimport cudf._lib.cpp.types as libcudf_types -def partition(source_table, Column partition_map, - object num_partitions, bool keep_index=True): +def partition(list source_columns, Column partition_map, + object num_partitions): if num_partitions is None: num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True) cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_table( - source_table, not keep_index - ) + cdef table_view c_source_view = table_view_from_columns(source_columns) cdef column_view c_partition_map_view = partition_map.view() @@ -40,13 +38,5 @@ def partition(source_table, Column partition_map, ) return ( - *data_from_unique_ptr( - move(c_result.first), - column_names=source_table._column_names, - index_names=source_table._index_names if( - keep_index is True) - else None - - ), - list(c_result.second) + columns_from_unique_ptr(move(c_result.first)), list(c_result.second) ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 98c052bbc34..ff30fb24963 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1150,10 +1150,16 @@ def scatter_by_map( f"ERROR: map_size must be >= {count} (got {map_size})." 
) - data, index, output_offsets = libcudf.partitioning.partition( - self, map_index, map_size, keep_index + partitioned_columns, output_offsets = libcudf.partitioning.partition( + [*(self._index._columns if keep_index else ()), *self._columns], + map_index, + map_size, + ) + partitioned = self._from_columns_like_self( + partitioned_columns, + column_names=self._column_names, + index_names=self._index_names if keep_index else None, ) - partitioned = self.__class__._from_data(data, index) # due to the split limitation mentioned # here: https://github.com/rapidsai/cudf/issues/4607 @@ -1163,9 +1169,6 @@ def scatter_by_map( result = partitioned._split(output_offsets, keep_index=keep_index) - for frame in result: - frame._copy_type_metadata(self, include_index=keep_index) - if map_size: result += [ self._empty_like(keep_index) From 1a23fd6a5ef82144565dd9f0d8a36f2ff5084c10 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 18 Mar 2022 14:47:28 -0700 Subject: [PATCH 05/21] Include gil releases for joins --- python/cudf/cudf/_lib/join.pyx | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 1ff6daaae52..1baef266dab 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -27,11 +27,14 @@ cpdef join(list lhs, list rhs, how=None): cdef table_view c_rhs = table_view_from_columns(rhs) if how == "inner": - c_result = move(cpp_join.inner_join(c_lhs, c_rhs)) + with nogil: + c_result = move(cpp_join.inner_join(c_lhs, c_rhs)) elif how == "left": - c_result = move(cpp_join.left_join(c_lhs, c_rhs)) + with nogil: + c_result = move(cpp_join.left_join(c_lhs, c_rhs)) elif how == "outer": - c_result = move(cpp_join.full_join(c_lhs, c_rhs)) + with nogil: + c_result = move(cpp_join.full_join(c_lhs, c_rhs)) else: raise ValueError(f"Invalid join type {how}") @@ -47,9 +50,11 @@ cpdef semi_join(list lhs, list rhs, how=None): cdef table_view c_rhs = 
table_view_from_columns(rhs) if how == "leftsemi": - c_result = move(cpp_join.left_semi_join(c_lhs, c_rhs)) + with nogil: + c_result = move(cpp_join.left_semi_join(c_lhs, c_rhs)) elif how == "leftanti": - c_result = move(cpp_join.left_anti_join(c_lhs, c_rhs)) + with nogil: + c_result = move(cpp_join.left_anti_join(c_lhs, c_rhs)) else: raise ValueError(f"Invalid join type {how}") From 42737ab4f397aba6f3974587fe9664c1ebeb79a1 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 18 Mar 2022 15:04:20 -0700 Subject: [PATCH 06/21] Rolling out to quantiles.pyx --- python/cudf/cudf/_lib/quantiles.pyx | 14 +++++--------- python/cudf/cudf/core/frame.py | 10 ++++------ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 497a71df89d..f65c29a55a8 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -31,7 +31,7 @@ from cudf._lib.cpp.types cimport ( order_info, sorted, ) -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def quantile( @@ -74,14 +74,13 @@ def quantile( return Column.from_unique_ptr(move(c_result)) -def quantiles(source_table, +def quantiles(list source_columns, vector[double] q, object interp, object is_input_sorted, list column_order, list null_precedence): - cdef table_view c_input = table_view_from_table( - source_table, ignore_index=True) + cdef table_view c_input = table_view_from_columns(source_columns) cdef vector[double] c_q = q cdef interpolation c_interp = ( interp @@ -119,7 +118,4 @@ def quantiles(source_table, ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ff30fb24963..51170a9c2ee 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1460,20 +1460,18 @@ def _quantiles( libcudf.types.NullOrder[key] for key in null_precedence ] - result = self.__class__._from_data( + return self._from_columns_like_self( *libcudf.quantiles.quantiles( - self, + [*self._columns], q, interpolation, is_sorted, column_order, null_precedence, - ) + ), + column_names=self._column_names, ) - result._copy_type_metadata(self) - return result - @_cudf_nvtx_annotate def rank( self, From 6226f32473ec94b2e1fb235def6367e729fd4498 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 18 Mar 2022 16:03:07 -0700 Subject: [PATCH 07/21] Rolling out to reshape.pyx --- python/cudf/cudf/_lib/reshape.pyx | 23 +++----- python/cudf/cudf/core/dataframe.py | 56 +++++++++++++++++-- python/cudf/cudf/core/frame.py | 70 ------------------------ python/cudf/cudf/core/groupby/groupby.py | 7 +-- 
python/cudf/cudf/core/indexed_frame.py | 32 +++++++++++ 5 files changed, 93 insertions(+), 95 deletions(-) diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx index d64d0543892..29223947eea 100644 --- a/python/cudf/cudf/_lib/reshape.pyx +++ b/python/cudf/cudf/_lib/reshape.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -13,32 +13,25 @@ from cudf._lib.cpp.reshape cimport ( from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns -def interleave_columns(source_table): - cdef table_view c_view = table_view_from_table( - source_table, ignore_index=True) +def interleave_columns(list source_columns): + cdef table_view c_view = table_view_from_columns(source_columns) cdef unique_ptr[column] c_result with nogil: c_result = move(cpp_interleave_columns(c_view)) - return Column.from_unique_ptr( - move(c_result) - ) + return Column.from_unique_ptr(move(c_result)) -def tile(source_table, size_type count): +def tile(list source_columns, size_type count): cdef size_type c_count = count - cdef table_view c_view = table_view_from_table(source_table) + cdef table_view c_view = table_view_from_columns(source_columns) cdef unique_ptr[table] c_result with nogil: c_result = move(cpp_tile(c_view, c_count)) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=source_table._index_names - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 006d09366ea..4b5e7cc3441 100644 --- a/python/cudf/cudf/core/dataframe.py +++ 
b/python/cudf/cudf/core/dataframe.py @@ -5597,8 +5597,10 @@ def stack(self, level=-1, dropna=True): """ assert level in (None, -1) repeated_index = self.index.repeat(self.shape[1]) - name_index = Frame({0: self._column_names}).tile(self.shape[0]) - new_index = list(repeated_index._columns) + [name_index._columns[0]] + name_index = libcudf.reshape.tile( + [as_column(self._column_names)], self.shape[0] + ) + new_index = list(repeated_index._columns) + name_index if isinstance(self._index, MultiIndex): index_names = self._index.names + [None] else: @@ -5621,9 +5623,15 @@ def stack(self, level=-1, dropna=True): } ) - data_col = libcudf.reshape.interleave_columns(homogenized) + result = Series._from_data( + { + None: libcudf.reshape.interleave_columns( + [*homogenized._columns] + ) + }, + index=new_index, + ) - result = Series(data=data_col, index=new_index) if dropna: return result.dropna() else: @@ -6057,6 +6065,46 @@ def _from_columns_like_self( result._set_column_names_like(self) return result + @_cudf_nvtx_annotate + def interleave_columns(self): + """ + Interleave Series columns of a table into a single column. + + Converts the column major table `cols` into a row major column. + + Parameters + ---------- + cols : input Table containing columns to interleave. + + Examples + -------- + >>> df = DataFrame([['A1', 'A2', 'A3'], ['B1', 'B2', 'B3']]) + >>> df + 0 [A1, A2, A3] + 1 [B1, B2, B3] + >>> df.interleave_columns() + 0 A1 + 1 B1 + 2 A2 + 3 B2 + 4 A3 + 5 B3 + + Returns + ------- + The interleaved columns as a single column + """ + if ("category" == self.dtypes).any(): + raise ValueError( + "interleave_columns does not support 'category' dtype." 
+ ) + + result = self._constructor_sliced._from_data( + {None: libcudf.reshape.interleave_columns([*self._columns])} + ) + + return result + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 51170a9c2ee..5ddc74308f1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2286,76 +2286,6 @@ def notnull(self): # Alias for notnull notna = notnull - @_cudf_nvtx_annotate - def interleave_columns(self): - """ - Interleave Series columns of a table into a single column. - - Converts the column major table `cols` into a row major column. - - Parameters - ---------- - cols : input Table containing columns to interleave. - - Examples - -------- - >>> df = DataFrame([['A1', 'A2', 'A3'], ['B1', 'B2', 'B3']]) - >>> df - 0 [A1, A2, A3] - 1 [B1, B2, B3] - >>> df.interleave_columns() - 0 A1 - 1 B1 - 2 A2 - 3 B2 - 4 A3 - 5 B3 - - Returns - ------- - The interleaved columns as a single column - """ - if ("category" == self.dtypes).any(): - raise ValueError( - "interleave_columns does not support 'category' dtype." - ) - - result = self._constructor_sliced( - libcudf.reshape.interleave_columns(self) - ) - - return result - - @_cudf_nvtx_annotate - def tile(self, count): - """ - Repeats the rows from `self` DataFrame `count` times to form a - new DataFrame. - - Parameters - ---------- - self : input Table containing columns to interleave. - count : Number of times to tile "rows". Must be non-negative. - - Examples - -------- - >>> df = Dataframe([[8, 4, 7], [5, 2, 3]]) - >>> count = 2 - >>> df.tile(df, count) - 0 1 2 - 0 8 4 7 - 1 5 2 3 - 0 8 4 7 - 1 5 2 3 - - Returns - ------- - The table containing the tiled "rows". 
- """ - result = self.__class__._from_data(*libcudf.reshape.tile(self, count)) - result._copy_type_metadata(self) - return result - @_cudf_nvtx_annotate def searchsorted( self, values, side="left", ascending=True, na_position="last" diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a1a4596ba45..9d53558d73a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1097,16 +1097,11 @@ def _cov_or_corr(self, func, method_name): for i in range(0, len(cols_list), num_cols) ] - def combine_columns(gb_cov_corr, ys): - list_of_columns = [gb_cov_corr._data[y] for y in ys] - frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys) - return interleave_columns(frame) - # interleave: combines the correlation or covariance results for each # column-pair into a single column res = cudf.DataFrame._from_data( { - x: combine_columns(gb_cov_corr, ys) + x: interleave_columns([gb_cov_corr._data[y] for y in ys]) for ys, x in zip(cols_split, column_names) } ) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index dced49016fd..7d9b354e69c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2425,6 +2425,38 @@ def _explode(self, explode_column: Any, ignore_index: bool): res.index.names = self._index.names return res + @_cudf_nvtx_annotate + def tile(self, count): + """Repeats the rows `count` times to form a new Frame. + + Parameters + ---------- + self : input Table containing columns to interleave. + count : Number of times to tile "rows". Must be non-negative. + + Examples + -------- + >>> df = Dataframe([[8, 4, 7], [5, 2, 3]]) + >>> count = 2 + >>> df.tile(df, count) + 0 1 2 + 0 8 4 7 + 1 5 2 3 + 0 8 4 7 + 1 5 2 3 + + Returns + ------- + The indexed frame containing the tiled "rows". 
+ """ + return self._from_columns_like_self( + libcudf.reshape.tile( + [*self._index._columns, *self._columns], count + ), + column_names=self._column_names, + index_names=self._index_names, + ) + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" From f00b9e0594309dceb71c49c7149cdce3d207f334 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 18 Mar 2022 17:05:32 -0700 Subject: [PATCH 08/21] Rolling out to search.pyx --- python/cudf/cudf/_lib/search.pyx | 24 +++++++++++------------- python/cudf/cudf/core/frame.py | 22 +++++++++++++++++----- python/cudf/cudf/tests/test_search.py | 8 ++++++++ 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx index f92ef753fc2..d5568f53231 100644 --- a/python/cudf/cudf/_lib/search.pyx +++ b/python/cudf/cudf/_lib/search.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -10,20 +10,20 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport table_view_from_columns def search_sorted( - table, values, side, ascending=True, na_position="last" + list source, list values, side, ascending=True, na_position="last" ): """Find indices where elements should be inserted to maintain order Parameters ---------- - table : Frame - Frame to search in - values : Frame - Frame of values to search for + source : list of columns + List of columns to search in + values : List of columns + List of value columns to search for side : str {‘left’, ‘right’} optional If ‘left’, the index of the first suitable location is given. 
If ‘right’, return the last such index @@ -33,10 +33,8 @@ def search_sorted( cdef vector[libcudf_types.null_order] c_null_precedence cdef libcudf_types.order c_order cdef libcudf_types.null_order c_null_order - cdef table_view c_table_data = table_view_from_table( - table, ignore_index=True) - cdef table_view c_values_data = table_view_from_table( - values, ignore_index=True) + cdef table_view c_table_data = table_view_from_columns(source) + cdef table_view c_values_data = table_view_from_columns(values) # Note: We are ignoring index columns here c_order = (libcudf_types.order.ASCENDING @@ -47,9 +45,9 @@ def search_sorted( if na_position=="last" else libcudf_types.null_order.BEFORE ) - c_column_order = vector[libcudf_types.order](table._num_columns, c_order) + c_column_order = vector[libcudf_types.order](len(source), c_order) c_null_precedence = vector[libcudf_types.null_order]( - table._num_columns, c_null_order + len(source), c_null_order ) if side == 'left': diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5ddc74308f1..af4eee0eb16 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2355,12 +2355,24 @@ def searchsorted( scalar_flag = True if not isinstance(values, Frame): - values = as_column(values) - if values.dtype != self.dtype: - self = self.astype(values.dtype) - values = values.as_frame() + values = [as_column(values)] + else: + values = [*values._columns] + if len(values) != len(self._data): + raise ValueError("Mismatch number of columns to search for.") + + sources = [ + col + if is_dtype_equal(col.dtype, val.dtype) + else col.astype(val.dtype) + for col, val in zip(self._columns, values) + ] outcol = libcudf.search.search_sorted( - self, values, side, ascending=ascending, na_position=na_position + sources, + values, + side, + ascending=ascending, + na_position=na_position, ) # Retrun result as cupy array if the values is non-scalar diff --git a/python/cudf/cudf/tests/test_search.py 
b/python/cudf/cudf/tests/test_search.py index cd029d02d79..103e40dbeda 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -73,6 +73,14 @@ def test_searchsorted_dataframe(side, multiindex): assert result == [2, 0, 4, 1] +def test_search_sorted_dataframe_unequal_number_of_columns(): + values = cudf.DataFrame({"a": [1, 0, 5, 1]}) + base = cudf.DataFrame({"a": [1, 0, 5, 1], "b": ["x", "z", "w", "a"]}) + + with pytest.raises(ValueError, match="Mismatch number of columns"): + base.searchsorted(values) + + @pytest.mark.parametrize("side", ["left", "right"]) def test_searchsorted_categorical(side): From e86363c52d54edb6f509e624dcde22d16ac0d530 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 18 Mar 2022 17:13:20 -0700 Subject: [PATCH 09/21] Fix copyrights --- python/cudf/cudf/tests/test_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index 103e40dbeda..d3433a589a7 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import cupy import numpy as np import pandas as pd From 881edbaba94de366c3e510269a260fd4fe99b94d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Apr 2022 15:30:07 -0700 Subject: [PATCH 10/21] Refactor table_encode in `transform.pyx` --- python/cudf/cudf/_lib/transform.pyx | 20 +++++++------------- python/cudf/cudf/core/frame.py | 6 ++---- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 96d25cb92c9..175150b6865 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import numpy as np from numba.np import numpy_support @@ -25,9 +25,9 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id from cudf._lib.types cimport underlying_type_t_type_id from cudf._lib.utils cimport ( + columns_from_unique_ptr, data_from_table_view, - data_from_unique_ptr, - table_view_from_table, + table_view_from_columns, ) @@ -123,21 +123,15 @@ def transform(Column input, op): return Column.from_unique_ptr(move(c_output)) -def table_encode(input): - cdef table_view c_input = table_view_from_table( - input, ignore_index=True) +def table_encode(list source_columns): + cdef table_view c_input = table_view_from_columns(source_columns) cdef pair[unique_ptr[table], unique_ptr[column]] c_result with nogil: c_result = move(libcudf_transform.encode(c_input)) - return ( - *data_from_unique_ptr( - move(c_result.first), - column_names=input._column_names, - ), - Column.from_unique_ptr(move(c_result.second)) - ) + return columns_from_unique_ptr( + move(c_result.first)), Column.from_unique_ptr(move(c_result.second)) def one_hot_encode(Column input_column, Column categories): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index deef069f80e..c6a6d21df0e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2408,10 +2408,8 @@ def _split(self, splits): @_cudf_nvtx_annotate def _encode(self): - data, index, indices = libcudf.transform.table_encode(self) - for name, col in data.items(): - data[name] = col._with_type_metadata(self._data[name].dtype) - keys = self.__class__._from_data(data, index) + columns, indices = libcudf.transform.table_encode([*self._columns]) + keys = self._from_columns_like_self(columns) return keys, indices @_cudf_nvtx_annotate From 63ec965c35f8b51fa18b82017a26d5fc3b1fab09 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Apr 2022 15:53:59 -0700 Subject: [PATCH 11/21] Refactor lists.explode_outer --- 
python/cudf/cudf/_lib/lists.pyx | 19 +++++++++++-------- python/cudf/cudf/core/indexed_frame.py | 24 +++++++++++------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 523686fafe6..af938761e71 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -42,7 +42,12 @@ from cudf.core.dtypes import ListDtype from cudf._lib.cpp.lists.contains cimport contains, index_of as cpp_index_of from cudf._lib.cpp.lists.extract cimport extract_list_element -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport ( + columns_from_unique_ptr, + data_from_unique_ptr, + table_view_from_columns, + table_view_from_table, +) def count_elements(Column col): @@ -61,8 +66,10 @@ def count_elements(Column col): return result -def explode_outer(tbl, int explode_column_idx, bool ignore_index=False): - cdef table_view c_table_view = table_view_from_table(tbl, ignore_index) +def explode_outer( + list source_columns, int explode_column_idx +): + cdef table_view c_table_view = table_view_from_columns(source_columns) cdef size_type c_explode_column_idx = explode_column_idx cdef unique_ptr[table] c_result @@ -70,11 +77,7 @@ def explode_outer(tbl, int explode_column_idx, bool ignore_index=False): with nogil: c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx)) - return data_from_unique_ptr( - move(c_result), - column_names=tbl._column_names, - index_names=None if ignore_index else tbl._index_names - ) + return columns_from_unique_ptr(move(c_result)) def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6c29c3bb8d4..ad7b2f8458a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2689,21 +2689,19 @@ def _explode(self, explode_column: Any, ignore_index: 
bool): if not ignore_index and self._index is not None: explode_column_num += self._index.nlevels - data, index = libcudf.lists.explode_outer( - self, explode_column_num, ignore_index - ) - res = self.__class__._from_data( - ColumnAccessor( - data, - multiindex=self._data.multiindex, - level_names=self._data._level_names, - ), - index=index, + exploded = libcudf.lists.explode_outer( + [ + *(self._index._data.columns if not ignore_index else ()), + *self._columns, + ], + explode_column_num, ) - if not ignore_index and self._index is not None: - res.index.names = self._index.names - return res + return self._from_columns_like_self( + exploded, + self._column_names, + self._index_names if not ignore_index else None, + ) @_cudf_nvtx_annotate def tile(self, count): From 96913a14a59b2840486f212e8fd3081c21ecb53e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Apr 2022 16:12:25 -0700 Subject: [PATCH 12/21] Refactor lists.pyx frame APIs --- python/cudf/cudf/_lib/lists.pyx | 14 ++++---------- python/cudf/cudf/core/column/lists.py | 4 +--- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index af938761e71..e5a705ab603 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -42,12 +42,7 @@ from cudf.core.dtypes import ListDtype from cudf._lib.cpp.lists.contains cimport contains, index_of as cpp_index_of from cudf._lib.cpp.lists.extract cimport extract_list_element -from cudf._lib.utils cimport ( - columns_from_unique_ptr, - data_from_unique_ptr, - table_view_from_columns, - table_view_from_table, -) +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def count_elements(Column col): @@ -200,18 +195,17 @@ def index_of(Column col, object py_search_key): return Column.from_unique_ptr(move(c_result)) -def concatenate_rows(tbl): +def concatenate_rows(list source_columns): cdef unique_ptr[column] c_result - cdef table_view c_table_view = 
table_view_from_table(tbl) + cdef table_view c_table_view = table_view_from_columns(source_columns) with nogil: c_result = move(cpp_concatenate_rows( c_table_view, )) - result = Column.from_unique_ptr(move(c_result)) - return result + return Column.from_unique_ptr(move(c_result)) def concatenate_list_elements(Column input_column, dropna=False): diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 8578bfe8147..b383f7bc321 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -113,9 +113,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: return NotImplemented if isinstance(other.dtype, ListDtype): if op == "__add__": - return concatenate_rows( - cudf.core.frame.Frame({0: self, 1: other}) - ) + return concatenate_rows([self, other]) else: raise NotImplementedError( "Lists concatenation for this operation is not yet" From 5f8122c5cdbd64a011d20172c04038032a21b959 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Apr 2022 16:20:31 -0700 Subject: [PATCH 13/21] Refactor `string/combine.concatenate` --- python/cudf/cudf/_lib/strings/combine.pyx | 9 ++++----- python/cudf/cudf/core/column/string.py | 6 ++---- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 3b5ef33a668..eeb39f70728 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -18,10 +18,10 @@ from cudf._lib.cpp.strings.combine cimport ( from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport table_view_from_columns -def concatenate(source_strings, +def concatenate(list source_strings, object sep, object na_rep): """ @@ -33,8 +33,7 @@ def concatenate(source_strings, cdef DeviceScalar narep = na_rep.device_value cdef unique_ptr[column] c_result - cdef table_view source_view = table_view_from_table( - source_strings, ignore_index=True) + cdef table_view source_view = table_view_from_columns(source_strings) cdef const string_scalar* scalar_separator = \ (separator.get_raw_ptr()) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d5d45c341d5..6f4a6334a1d 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -365,9 +365,7 @@ def cat(self, others=None, sep=None, na_rep=None): other_cols = _get_cols_list(self._parent, others) all_cols = [self._column] + other_cols data = libstrings.concatenate( - cudf.DataFrame( - {index: value for index, value in enumerate(all_cols)} - ), + all_cols, cudf.Scalar(sep), cudf.Scalar(na_rep, "str"), ) @@ -5531,7 +5529,7 @@ def _binaryop( return cast( "column.ColumnBase", libstrings.concatenate( - cudf.DataFrame._from_data(data={0: lhs, 1: rhs}), + [lhs, rhs], sep=cudf.Scalar(""), na_rep=cudf.Scalar(None, "str"), ), From 24f61fe402a570b0924d82dbbc4bfbbfba02dcf5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Apr 2022 09:22:55 -0700 Subject: [PATCH 14/21] black style fix --- python/cudf/cudf/core/column/column.py | 4 +++- python/cudf/cudf/core/join/join.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py 
b/python/cudf/cudf/core/column/column.py index 462516e2da4..5c9d8535798 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -229,7 +229,9 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow([self], [["None"]],)["None"].chunk(0) + return libcudf.interop.to_arrow([self], [["None"]],)[ + "None" + ].chunk(0) @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index c3ccf91efe7..6a495ef8d9a 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -201,7 +201,9 @@ def perform_merge(self) -> Frame: right_key.set(self.rhs, rcol_casted, validate=False) left_rows, right_rows = self._joiner( - left_join_cols, right_join_cols, how=self.how, + left_join_cols, + right_join_cols, + how=self.how, ) gather_index = self._using_left_index or self._using_right_index From 349f45f566c744ca18af2ed5faf81b1d8d860059 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Apr 2022 16:36:07 -0700 Subject: [PATCH 15/21] Various docstring updates Co-authored-by: GALI PREM SAGAR --- python/cudf/cudf/_lib/interop.pyx | 7 ++++--- python/cudf/cudf/core/dataframe.py | 3 ++- python/cudf/cudf/core/indexed_frame.py | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 26bb2e868f5..dd9b4a87f5a 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -47,7 +47,7 @@ def from_dlpack(dlpack_capsule): def to_dlpack(list source_columns): """ - Converts a cudf Frame into a DLPack Tensor PyCapsule. + Converts a list of columns into a DLPack Tensor PyCapsule. DLPack Tensor PyCapsule will have the name "dltensor".
""" @@ -105,7 +105,8 @@ cdef vector[column_metadata] gather_metadata(object metadata) except *: def to_arrow(list source_columns, object metadata): - """Convert from cudf Frame to PyArrow Table. + """Convert a list of columns from + cudf Frame to a PyArrow Table. Parameters ---------- @@ -130,7 +131,7 @@ def to_arrow(list source_columns, object metadata): def from_arrow(object input_table): - """Convert from PyArrow Table to cudf Frame. + """Convert from PyArrow Table to a list of columns. Parameters ---------- diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9fa9b9231d8..569a148388e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6189,7 +6189,8 @@ def interleave_columns(self): Examples -------- - >>> df = DataFrame([['A1', 'A2', 'A3'], ['B1', 'B2', 'B3']]) + >>> import cudf + >>> df = cudf.DataFrame([['A1', 'A2', 'A3'], ['B1', 'B2', 'B3']]) >>> df 0 [A1, A2, A3] 1 [B1, B2, B3] diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ad7b2f8458a..68c9a429227 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2714,7 +2714,8 @@ def tile(self, count): Examples -------- - >>> df = Dataframe([[8, 4, 7], [5, 2, 3]]) + >>> import cudf + >>> df = cudf.DataFrame([[8, 4, 7], [5, 2, 3]]) >>> count = 2 >>> df.tile(df, count) 0 1 2 From 2ba24358fe250cf133be66168f1dd8862ffe079b Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Apr 2022 16:41:41 -0700 Subject: [PATCH 16/21] More docstring changes Co-authored-by: GALI PREM SAGAR --- python/cudf/cudf/_lib/interop.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index dd9b4a87f5a..88c8b19ded0 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -25,7 +25,7 @@ from cudf._lib.utils cimport columns_from_unique_ptr,
table_view_from_columns def from_dlpack(dlpack_capsule): """ - Converts a DLPack Tensor PyCapsule into a cudf Frame object. + Converts a DLPack Tensor PyCapsule into a list of columns. DLPack Tensor PyCapsule is expected to have the name "dltensor". """ @@ -139,7 +139,7 @@ def from_arrow(object input_table): Returns ------- - cudf Frame + A list of columns to construct Frame object """ cdef shared_ptr[CTable] cpp_arrow_table = ( pyarrow_unwrap_table(input_table) From c0e1a1bc62bad5650ff229ca2e621aa4a0874e77 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Apr 2022 16:52:10 -0700 Subject: [PATCH 17/21] Use _from_data factory --- python/cudf/cudf/core/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 569a148388e..d02096d3ef5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5688,7 +5688,7 @@ def stack(self, level=-1, dropna=True): else: index_names = [None] * len(new_index_columns) new_index = MultiIndex.from_frame( - DataFrame( + DataFrame._from_data( dict(zip(range(0, len(new_index_columns)), new_index_columns)) ), names=index_names, @@ -5696,7 +5696,7 @@ def stack(self, level=-1, dropna=True): # Collect datatypes and cast columns as that type common_type = np.result_type(*self.dtypes) - homogenized = DataFrame( + homogenized = DataFrame._from_data( { c: ( self._data[c].astype(common_type) From 665f79b2f8b9baa99f9575a2a13e3f208e137d45 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Apr 2022 16:53:18 -0700 Subject: [PATCH 18/21] Update python/cudf/cudf/core/dataframe.py Co-authored-by: GALI PREM SAGAR --- python/cudf/cudf/core/dataframe.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d02096d3ef5..e69f44042b3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py 
@@ -6211,12 +6211,10 @@ def interleave_columns(self): "interleave_columns does not support 'category' dtype." ) - result = self._constructor_sliced._from_data( + return self._constructor_sliced._from_data( {None: libcudf.reshape.interleave_columns([*self._columns])} ) - return result - def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) From adbd3477d69b85c23e384a55a0f66cc5cf6801c2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Apr 2022 23:13:02 -0700 Subject: [PATCH 19/21] Fix interleave_columns docstring --- python/cudf/cudf/core/dataframe.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e69f44042b3..17b46c0e34e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6190,10 +6190,12 @@ def interleave_columns(self): Examples -------- >>> import cudf - >>> df = cudf.DataFrame([['A1', 'A2', 'A3'], ['B1', 'B2', 'B3']]) + >>> df = cudf.DataFrame({0: ['A1', 'A2', 'A3'], 1: ['B1', 'B2', 'B3']}) >>> df - 0 [A1, A2, A3] - 1 [B1, B2, B3] + 0 1 + 0 A1 B1 + 1 A2 B2 + 2 A3 B3 >>> df.interleave_columns() 0 A1 1 B1 @@ -6201,6 +6203,7 @@ def interleave_columns(self): 3 B2 4 A3 5 B3 + dtype: object Returns ------- From ca7f99a104d7e4430d82d826bccd2995353baa2b Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Apr 2022 00:12:56 -0700 Subject: [PATCH 20/21] Fixing all failed tests --- python/cudf/cudf/_lib/scalar.pyx | 22 +++++++--------------- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/index.py | 4 ++-- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 32d6cb2ea6d..8138b6c65d0 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -68,6 +68,7 @@ from cudf._lib.utils cimport ( data_from_table_view, table_view_from_columns, 
table_view_from_table, + columns_from_table_view ) @@ -361,8 +362,8 @@ cdef _set_struct_from_pydict(unique_ptr[scalar]& s, names=columns ) - data, _ = from_arrow(pyarrow_table, column_names=columns) - cdef table_view struct_view = table_view_from_columns(data.values()) + data = from_arrow(pyarrow_table) + cdef table_view struct_view = table_view_from_columns(data) s.reset( new struct_scalar(struct_view, valid) @@ -373,18 +374,10 @@ cdef _get_py_dict_from_struct(unique_ptr[scalar]& s): return cudf.NA cdef table_view struct_table_view = (s.get()).view() - columns = [str(i) for i in range(struct_table_view.num_columns())] + column_names = [str(i) for i in range(struct_table_view.num_columns())] - data, _ = data_from_table_view( - struct_table_view, - None, - column_names=columns - ) - to_arrow_table = cudf.core.frame.Frame( - cudf.core.column_accessor.ColumnAccessor(data) - ) - - python_dict = to_arrow(to_arrow_table, columns).to_pydict() + columns = columns_from_table_view(struct_table_view, None) + python_dict = to_arrow(columns, column_names).to_pydict() return {k: _nested_na_replace(python_dict[k])[0] for k in python_dict} @@ -415,9 +408,8 @@ cdef _get_py_list_from_list(unique_ptr[scalar]& s): cdef column_view list_col_view = (s.get()).view() cdef Column list_col = Column.from_column_view(list_col_view, None) - to_arrow_table = cudf.core.frame.Frame({"col": list_col}) - arrow_table = to_arrow(to_arrow_table, [["col", []]]) + arrow_table = to_arrow([list_col], [["col", []]]) result = arrow_table['col'].to_pylist() return _nested_na_replace(result) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c6a6d21df0e..d10f7c690bf 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1278,7 +1278,7 @@ def _quantiles( ] return self._from_columns_like_self( - *libcudf.quantiles.quantiles( + libcudf.quantiles.quantiles( [*self._columns], q, interpolation, diff --git a/python/cudf/cudf/core/index.py 
b/python/cudf/cudf/core/index.py index aff13025e72..fd918f723fe 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -76,10 +76,10 @@ def _lexsorted_equal_range( sort_inds = None sort_vals = idx lower_bound = search_sorted( - sort_vals, key_as_table, side="left" + [*sort_vals._data.columns], [*key_as_table._columns], side="left" ).element_indexing(0) upper_bound = search_sorted( - sort_vals, key_as_table, side="right" + [*sort_vals._data.columns], [*key_as_table._columns], side="right" ).element_indexing(0) return lower_bound, upper_bound, sort_inds From 906da0186f97f567163b1f17d66c7444913d2845 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Apr 2022 00:13:07 -0700 Subject: [PATCH 21/21] style fix --- python/cudf/cudf/_lib/scalar.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 8138b6c65d0..a7acfa8f906 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -65,10 +65,10 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_us, ) from cudf._lib.utils cimport ( + columns_from_table_view, data_from_table_view, table_view_from_columns, table_view_from_table, - columns_from_table_view )