diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 6cc52d046af..900be721c9a 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -1,22 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.vector cimport vector - -from pylibcudf.libcudf.column.column cimport column_view -from pylibcudf.libcudf.table.table cimport table, table_view - - -cdef data_from_unique_ptr( - unique_ptr[table] c_tbl, column_names, index_names=*) cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*) cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *) -cdef data_from_table_view( - table_view tv, object owner, object column_names, object index_names=*) -cdef table_view table_view_from_columns(columns) except * -cdef table_view table_view_from_table(tbl, ignore_index=*) except* -cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) -cdef columns_from_table_view(table_view tv, object owners) cpdef columns_from_pylibcudf_table(tbl) cpdef _data_from_columns(columns, column_names, index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index ff032656f80..975c9eb741c 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -1,233 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import numpy as np -import pyarrow as pa - import cudf -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from pylibcudf.libcudf.column.column cimport column, column_view -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from pylibcudf cimport Column as plc_Column -try: - import ujson as json -except ImportError: - import json - -from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype - -PARQUET_META_TYPE_MAP = { - str(cudf_dtype): str(pandas_dtype) - for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items() -} - -cdef table_view table_view_from_columns(columns) except*: - """Create a cudf::table_view from an iterable of Columns.""" - cdef vector[column_view] column_views - - cdef Column col - for col in columns: - column_views.push_back(col.view()) - - return table_view(column_views) - - -cdef table_view table_view_from_table(tbl, ignore_index=False) except*: - """Create a cudf::table_view from a Table. - - Parameters - ---------- - ignore_index : bool, default False - If True, don't include the index in the columns. - """ - return table_view_from_columns( - tbl._index._columns + tbl._columns - if not ignore_index and tbl._index is not None - else tbl._columns - ) - - -cpdef generate_pandas_metadata(table, index): - col_names = [] - types = [] - index_levels = [] - index_descriptors = [] - columns_to_convert = list(table._columns) - # Columns - for name, col in table._column_labels_and_values: - if cudf.get_option("mode.pandas_compatible"): - # in pandas-compat mode, non-string column names are stringified. - col_names.append(str(name)) - else: - col_names.append(name) - - if isinstance(col.dtype, cudf.CategoricalDtype): - raise ValueError( - "'category' column dtypes are currently not " - + "supported by the gpu accelerated parquet writer" - ) - elif isinstance(col.dtype, ( - cudf.ListDtype, - cudf.StructDtype, - cudf.core.dtypes.DecimalDtype - )): - types.append(col.dtype.to_arrow()) - else: - # A boolean element takes 8 bits in cudf and 1 bit in - # pyarrow. To make sure the cudf format is interperable - # in arrow, we use `int8` type when converting from a - # cudf boolean array. - if col.dtype.type == np.bool_: - types.append(pa.int8()) - else: - types.append(np_to_pa_dtype(col.dtype)) - - # Indexes - materialize_index = False - if index is not False: - for level, name in enumerate(table._index.names): - if isinstance(table._index, cudf.MultiIndex): - idx = table.index.get_level_values(level) - else: - idx = table.index - - if isinstance(idx, cudf.RangeIndex): - if index is None: - descr = { - "kind": "range", - "name": table.index.name, - "start": table.index.start, - "stop": table.index.stop, - "step": table.index.step, - } - else: - materialize_index = True - # When `index=True`, RangeIndex needs to be materialized. - materialized_idx = idx._as_int_index() - descr = _index_level_name( - index_name=materialized_idx.name, - level=level, - column_names=col_names - ) - index_levels.append(materialized_idx) - columns_to_convert.append(materialized_idx._values) - col_names.append(descr) - types.append(np_to_pa_dtype(materialized_idx.dtype)) - else: - descr = _index_level_name( - index_name=idx.name, - level=level, - column_names=col_names - ) - columns_to_convert.append(idx._values) - col_names.append(descr) - if isinstance(idx.dtype, cudf.CategoricalDtype): - raise ValueError( - "'category' column dtypes are currently not " - + "supported by the gpu accelerated parquet writer" - ) - elif isinstance(idx.dtype, cudf.ListDtype): - types.append(col.dtype.to_arrow()) - else: - # A boolean element takes 8 bits in cudf and 1 bit in - # pyarrow. To make sure the cudf format is interperable - # in arrow, we use `int8` type when converting from a - # cudf boolean array. - if idx.dtype.type == np.bool_: - types.append(pa.int8()) - else: - types.append(np_to_pa_dtype(idx.dtype)) - - index_levels.append(idx) - index_descriptors.append(descr) - - df_meta = table.head(0) - if materialize_index: - df_meta.index = df_meta.index._as_int_index() - metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=columns_to_convert, - # It is OKAY to do `.head(0).to_pandas()` because - # this method will extract `.columns` metadata only - df=df_meta.to_pandas(), - column_names=col_names, - index_levels=index_levels, - index_descriptors=index_descriptors, - preserve_index=index, - types=types, - ) - - md_dict = json.loads(metadata[b"pandas"]) - - # correct metadata for list and struct and nullable numeric types - for col_meta in md_dict["columns"]: - if ( - col_meta["name"] in table._column_names - and table._data[col_meta["name"]].nullable - and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP - and col_meta["pandas_type"] != "decimal" - ): - col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[ - col_meta["numpy_type"] - ] - if col_meta["numpy_type"] in ("list", "struct"): - col_meta["numpy_type"] = "object" - - return json.dumps(md_dict) - - -def _index_level_name(index_name, level, column_names): - """ - Return the name of an index level or a default name - if `index_name` is None or is already a column name. - - Parameters - ---------- - index_name : name of an Index object - level : level of the Index object - - Returns - ------- - name : str - """ - if index_name is not None and index_name not in column_names: - return index_name - else: - return f"__index_level_{level}__" - - -cdef columns_from_unique_ptr( - unique_ptr[table] c_tbl -): - """Convert a libcudf table into list of columns. - - Parameters - ---------- - c_tbl : unique_ptr[cudf::table] - The libcudf table whose columns will be extracted - - Returns - ------- - list[Column] - A list of columns. - """ - cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) - cdef vector[unique_ptr[column]].iterator it = c_columns.begin() - - cdef size_t i - - return [ - Column.from_pylibcudf( - plc_Column.from_libcudf(move(dereference(it+i))) - ) for i in range(c_columns.size()) - ] cpdef columns_from_pylibcudf_table(tbl): @@ -281,8 +55,7 @@ cpdef _data_from_columns(columns, column_names, index_names=None): # the data while actually constructing the Index object here (instead # of just returning a dict for that as well). As we clean up the # Frame factories we may want to look for a less dissonant approach - # that does not impose performance penalties. The same applies to - # data_from_table_view below. + # that does not impose performance penalties. cudf.core.index._index_from_data( { name: columns[i] @@ -300,16 +73,6 @@ cpdef _data_from_columns(columns, column_names, index_names=None): return data, index -cdef data_from_unique_ptr( - unique_ptr[table] c_tbl, column_names, index_names=None -): - return _data_from_columns( - columns_from_unique_ptr(move(c_tbl)), - column_names, - index_names - ) - - cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None): return _data_from_columns( columns_from_pylibcudf_table(tbl), @@ -329,73 +92,3 @@ cpdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None) column_names=column_names, index_names=index_names ) - -cdef columns_from_table_view( - table_view tv, - object owners, -): - """ - Given a ``cudf::table_view``, constructs a list of columns from it, - along with referencing an owner Python object that owns the memory - lifetime. owner must be either None or a list of column. If owner - is a list of columns, the owner of the `i`th ``cudf::column_view`` - in the table view is ``owners[i]``. For more about memory ownership, - see ``Column.from_column_view``. - """ - - return [ - Column.from_column_view( - tv.column(i), owners[i] if isinstance(owners, list) else None - ) for i in range(tv.num_columns()) - ] - -cdef data_from_table_view( - table_view tv, - object owner, - object column_names, - object index_names=None -): - """ - Given a ``cudf::table_view``, constructs a Frame from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. If ``owner`` is a Frame we reach inside of it and - reach inside of each ``cudf.Column`` to make the owner of each newly - created ``Buffer`` underneath the ``cudf.Column`` objects of the - created Frame the respective ``Buffer`` from the relevant - ``cudf.Column`` of the ``owner`` Frame - """ - cdef size_type column_idx = 0 - table_owner = isinstance(owner, cudf.core.frame.Frame) - - # First construct the index, if any - index = None - if index_names is not None: - index_columns = [] - for _ in index_names: - column_owner = owner - if table_owner: - column_owner = owner._index._columns[column_idx] - index_columns.append( - Column.from_column_view( - tv.column(column_idx), - column_owner - ) - ) - column_idx += 1 - index = cudf.core.index._index_from_data( - dict(zip(index_names, index_columns))) - - # Construct the data dict - cdef size_type source_column_idx = 0 - data_columns = [] - for _ in column_names: - column_owner = owner - if table_owner: - column_owner = owner._columns[source_column_idx] - data_columns.append( - Column.from_column_view(tv.column(column_idx), column_owner) - ) - column_idx += 1 - source_column_idx += 1 - - return dict(zip(column_names, data_columns)), index diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b74128a8a61..1df6ac67f29 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1809,13 +1809,37 @@ def _concat( ) for table in tables ] - - concatted = libcudf.utils.data_from_pylibcudf_table( - plc.concatenate.concatenate(plc_tables), - column_names=column_names, - index_names=index_names, - ) - out = cls._from_data(*concatted) + plc_result = plc.concatenate.concatenate(plc_tables) + if ignore: + index = None + data = { + col_name: ColumnBase.from_pylibcudf(col) + for col_name, col in zip( + column_names, plc_result.columns(), strict=True + ) + } + else: + result_columns = [ + ColumnBase.from_pylibcudf(col) + for col in plc_result.columns() + ] + index = _index_from_data( + dict( + zip( + index_names, + result_columns[: len(index_names)], + strict=True, + ) + ) + ) + data = dict( + zip( + column_names, + result_columns[len(index_names) :], + strict=True, + ) + ) + out = cls._from_data(data=data, index=index) # If ignore_index is True, all input frames are empty, and at # least one input frame has an index, assign a new RangeIndex diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 66095d4a155..d92ac94e333 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -25,9 +25,7 @@ from cudf._lib.column import Column from cudf._lib.utils import ( _data_from_columns, - _index_level_name, data_from_pylibcudf_io, - generate_pandas_metadata, ) from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock @@ -128,7 +126,7 @@ def _plc_write_parquet( tbl_meta = plc.io.types.TableInputMetadata(plc_table) for level, idx_name in enumerate(table.index.names): tbl_meta.column_metadata[level].set_name( - _index_level_name(idx_name, level, table._column_names) + ioutils._index_level_name(idx_name, level, table._column_names) ) num_index_cols_meta = len(table.index.names) else: @@ -162,7 +160,7 @@ def _plc_write_parquet( if partitions_info is not None: user_data = [ { - "pandas": generate_pandas_metadata( + "pandas": ioutils.generate_pandas_metadata( table.iloc[start_row : start_row + num_row].copy( deep=False ), @@ -172,7 +170,9 @@ def _plc_write_parquet( for start_row, num_row in partitions_info ] else: - user_data = [{"pandas": generate_pandas_metadata(table, index)}] + user_data = [ + {"pandas": ioutils.generate_pandas_metadata(table, index)} + ] if header_version not in ("1.0", "2.0"): raise ValueError( @@ -1738,7 +1738,7 @@ def _initialize_chunked_state( False if isinstance(table.index, cudf.RangeIndex) else self.index ) user_data = [ - {"pandas": generate_pandas_metadata(table, index)} + {"pandas": ioutils.generate_pandas_metadata(table, index)} ] * num_partitions comp_type = _get_comp_type(self.compression) stat_freq = _get_stat_freq(self.statistics)