From 935648b01220deac0dc306014ecd82f5191fc6f5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 13:48:34 -0700 Subject: [PATCH 01/31] Move validation directly into set_by_label and use a raw dict to store the columns in the accessor. --- python/cudf/cudf/core/column_accessor.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index ad1a0c80ef5..a1de373eb37 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -3,7 +3,6 @@ from __future__ import annotations import itertools -from collections import OrderedDict from collections.abc import MutableMapping from typing import ( TYPE_CHECKING, @@ -18,8 +17,8 @@ import pandas as pd import cudf +from cudf.core import column from cudf.utils.utils import ( - OrderedColumnDict, cached_property, to_flat_dict, to_nested_dict, @@ -31,7 +30,7 @@ class ColumnAccessor(MutableMapping): - _data: "OrderedDict[Any, ColumnBase]" + _data: "dict[Any, ColumnBase]" multiindex: bool _level_names: Tuple[Any, ...] @@ -64,7 +63,7 @@ def __init__( self.multiindex = multiindex self._level_names = level_names - self._data = OrderedColumnDict(data) + self._data = dict(data) self.multiindex = multiindex self._level_names = level_names @@ -280,6 +279,15 @@ def set_by_label(self, key: Any, value: Any): value : column-like """ key = self._pad_key(key) + + # Convert all types to columns and ensure that values are of equal + # length. + value = column.as_column(value) + if len(self._data) > 0: + first = next(iter(self._data.values())) + if len(value) != len(first): + raise ValueError("All columns must be of equal length") + self._data[key] = value self._clear_cache() From 806a3ef7740414bf16e21ea8b112982537a6f5ad Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 13:54:27 -0700 Subject: [PATCH 02/31] Remove all references to OrderedColumnDict. --- python/cudf/cudf/_lib/table.pyx | 8 ++++---- python/cudf/cudf/core/dataframe.py | 3 +-- python/cudf/cudf/core/frame.py | 4 ++-- python/cudf/cudf/utils/utils.py | 30 ------------------------------ 4 files changed, 7 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index dba0abb9cf0..f97b45d8abf 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -34,8 +34,8 @@ cdef class Table: Parameters ---------- - data : OrderedColumnDict - An OrderedColumnDict mapping column names to Columns + data : dict + An dict mapping column names to Columns index : Table A Table representing the (optional) index columns. """ @@ -109,7 +109,7 @@ cdef class Table: it += 1 index = Table(dict(zip(index_names, index_columns))) - # Construct the data OrderedColumnDict + # Construct the data dict data_columns = [] for _ in column_names: data_columns.append(Column.from_unique_ptr(move(dereference(it)))) @@ -154,7 +154,7 @@ cdef class Table: column_idx += 1 index = Table(dict(zip(index_names, index_columns))) - # Construct the data OrderedColumnDict + # Construct the data dict cdef size_type source_column_idx = 0 data_columns = [] for _ in column_names: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 18a7f052d62..a04dbb826a8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -52,7 +52,6 @@ is_struct_dtype, numeric_normalize_types, ) -from cudf.utils.utils import OrderedColumnDict T = TypeVar("T", bound="DataFrame") @@ -4599,7 +4598,7 @@ def hash_columns(self, columns=None): table_to_hash = self else: cols = [self[k]._column for k in columns] - table_to_hash = Frame(data=OrderedColumnDict(zip(columns, cols))) + table_to_hash = Frame(data=dict(zip(columns, cols))) return Series(table_to_hash._hash()).values diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 926aad368b0..e33fda3ee09 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -39,8 +39,8 @@ class Frame(libcudf.table.Table): Parameters ---------- - data : OrderedColumnDict - An OrderedColumnDict mapping column names to Columns + data : dict + An dict mapping column names to Columns index : Table A Frame representing the (optional) index columns. """ diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 03a39f6fb4b..ba9fa734248 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -280,36 +280,6 @@ def __get__(self, instance, cls): return value -class ColumnValuesMappingMixin: - """ - Coerce provided values for the mapping to Columns. - """ - - def __setitem__(self, key, value): - - value = column.as_column(value) - super().__setitem__(key, value) - - -class EqualLengthValuesMappingMixin: - """ - Require all values in the mapping to have the same length. - """ - - def __setitem__(self, key, value): - if len(self) > 0: - first = next(iter(self.values())) - if len(value) != len(first): - raise ValueError("All values must be of equal length") - super().__setitem__(key, value) - - -class OrderedColumnDict( - ColumnValuesMappingMixin, EqualLengthValuesMappingMixin, OrderedDict -): - pass - - class NestedMappingMixin: """ Make missing values of a mapping empty instances From 40a7b173bb3a86bd8d2473121aa519765c442e7f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 13:58:35 -0700 Subject: [PATCH 03/31] Move validation to separate method and use in both set_by_label and constructor. --- python/cudf/cudf/core/column_accessor.py | 25 +++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index a1de373eb37..c6b9236f0d0 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -63,7 +63,7 @@ def __init__( self.multiindex = multiindex self._level_names = level_names - self._data = dict(data) + self._data = {k: self._convert_and_validate(v) for k, v in data.items()} self.multiindex = multiindex self._level_names = level_names @@ -269,6 +269,18 @@ def select_by_index(self, index: Any) -> ColumnAccessor: data, multiindex=self.multiindex, level_names=self.level_names, ) + def _convert_and_validate(self, value: Any): + # Make sure that the provided value can be stored as a column. This + # method will convert the column to an appropriate type and make sure + # that it is the same type as other columns in the accessor. + + value = column.as_column(value) + if len(self._data) > 0: + first = next(iter(self._data.values())) + if len(value) != len(first): + raise ValueError("All columns must be of equal length") + return value + def set_by_label(self, key: Any, value: Any): """ Add (or modify) column by name. @@ -279,16 +291,7 @@ def set_by_label(self, key: Any, value: Any): value : column-like """ key = self._pad_key(key) - - # Convert all types to columns and ensure that values are of equal - # length. - value = column.as_column(value) - if len(self._data) > 0: - first = next(iter(self._data.values())) - if len(value) != len(first): - raise ValueError("All columns must be of equal length") - - self._data[key] = value + self._data[key] = self._convert_and_validate(value) self._clear_cache() def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: From a1c576ebe2cb9e5e344a9aaa00a7c2ef4044c5c6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 14:02:25 -0700 Subject: [PATCH 04/31] Format with black. --- python/cudf/cudf/core/column_accessor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index c6b9236f0d0..fe8058c31ce 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -63,7 +63,9 @@ def __init__( self.multiindex = multiindex self._level_names = level_names - self._data = {k: self._convert_and_validate(v) for k, v in data.items()} + self._data = { + k: self._convert_and_validate(v) for k, v in data.items() + } self.multiindex = multiindex self._level_names = level_names From 788d9d6a0bd1e8254dba31b3e085ed56abec0160 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 14:08:21 -0700 Subject: [PATCH 05/31] Expose parameter to make validation optional. --- python/cudf/cudf/core/column_accessor.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index fe8058c31ce..38832396f1f 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -283,17 +283,24 @@ def _convert_and_validate(self, value: Any): raise ValueError("All columns must be of equal length") return value - def set_by_label(self, key: Any, value: Any): + def set_by_label(self, key: Any, value: Any, validate: bool = True): """ Add (or modify) column by name. Parameters ---------- - key : name of the column + key + name of the column value : column-like + The value to insert into the column. + validate : bool + If True, the provided value will be coerced to a column and + validated before setting (Default value = True). """ key = self._pad_key(key) - self._data[key] = self._convert_and_validate(value) + if validate: + value = self._convert_and_validate(value) + self._data[key] = value self._clear_cache() def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: From 6a64285f4e36a7437c550cf57ccd30dcf694f2e0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 14:13:55 -0700 Subject: [PATCH 06/31] Coerce constructor input to dict before calling items. --- python/cudf/cudf/core/column_accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 38832396f1f..034c74393b1 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -64,7 +64,7 @@ def __init__( self._level_names = level_names self._data = { - k: self._convert_and_validate(v) for k, v in data.items() + k: self._convert_and_validate(v) for k, v in dict(data).items() } self.multiindex = multiindex self._level_names = level_names From e7d09812a3ee18a3ce8cbe3208c9a526d353a38d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 14:21:47 -0700 Subject: [PATCH 07/31] Make construction safe. --- python/cudf/cudf/core/column_accessor.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 034c74393b1..2b5ed21b010 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -63,9 +63,12 @@ def __init__( self.multiindex = multiindex self._level_names = level_names - self._data = { - k: self._convert_and_validate(v) for k, v in dict(data).items() - } + # Explicitly initialize an empty data dict so that we can validate each + # new column. + self._data = {} + for k, v in dict(data).items(): + self._data[k] = self._convert_and_validate(v) + self.multiindex = multiindex self._level_names = level_names @@ -277,7 +280,7 @@ def _convert_and_validate(self, value: Any): # that it is the same type as other columns in the accessor. value = column.as_column(value) - if len(self._data) > 0: + if hasattr(self, '_data') and len(self._data) > 0: first = next(iter(self._data.values())) if len(value) != len(first): raise ValueError("All columns must be of equal length") From c39932c4dc46e35c262440b76d16d0ed3733e8c3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 15:39:01 -0700 Subject: [PATCH 08/31] Final cleanup and documentation. --- python/cudf/cudf/core/column_accessor.py | 52 ++++++++++++++---------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 2b5ed21b010..c175a6d9da7 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -33,6 +33,7 @@ class ColumnAccessor(MutableMapping): _data: "dict[Any, ColumnBase]" multiindex: bool _level_names: Tuple[Any, ...] + _column_length: int def __init__( self, @@ -62,15 +63,30 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names + self._column_length = column_length + else: + # This code path is performance-critical for copies and should be + # modified with care. + self._data = {} + if data: + data = dict(data) + # Faster than next(iter(data.values())) + column_length = len(data[next(iter(data))]) + for k, v in data.items(): + # Much faster to avoid the function call if possible; the + # extra isinstance is negligible if we do have to make a + # column from something else. + if not isinstance(v, column.ColumnBase): + v = column.as_column(v) + if len(v) != column_length: + raise ValueError("All columns must be of equal length") + self._data[k] = v + self._column_length = column_length + else: + self._column_length = None - # Explicitly initialize an empty data dict so that we can validate each - # new column. - self._data = {} - for k, v in dict(data).items(): - self._data[k] = self._convert_and_validate(v) - - self.multiindex = multiindex - self._level_names = level_names + self.multiindex = multiindex + self._level_names = level_names def __iter__(self): return self._data.__iter__() @@ -274,18 +290,6 @@ def select_by_index(self, index: Any) -> ColumnAccessor: data, multiindex=self.multiindex, level_names=self.level_names, ) - def _convert_and_validate(self, value: Any): - # Make sure that the provided value can be stored as a column. This - # method will convert the column to an appropriate type and make sure - # that it is the same type as other columns in the accessor. - - value = column.as_column(value) - if hasattr(self, '_data') and len(self._data) > 0: - first = next(iter(self._data.values())) - if len(value) != len(first): - raise ValueError("All columns must be of equal length") - return value - def set_by_label(self, key: Any, value: Any, validate: bool = True): """ Add (or modify) column by name. @@ -302,7 +306,13 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True): """ key = self._pad_key(key) if validate: - value = self._convert_and_validate(value) + value = column.as_column(value) + if len(self._data) > 0: + if len(value) != self._column_length: + raise ValueError("All columns must be of equal length") + else: + self._column_length = len(value) + self._data[key] = value self._clear_cache() From 4ff09fcf66566ac9aaf3b6df75cf2c60e96c060e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 15:57:35 -0700 Subject: [PATCH 09/31] Address style issues. --- python/cudf/cudf/core/column_accessor.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index c175a6d9da7..6988efeafa7 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -8,6 +8,7 @@ TYPE_CHECKING, Any, Callable, + Dict, Mapping, Optional, Tuple, @@ -30,7 +31,7 @@ class ColumnAccessor(MutableMapping): - _data: "dict[Any, ColumnBase]" + _data: "Dict[Any, ColumnBase]" multiindex: bool _level_names: Tuple[Any, ...] _column_length: int @@ -63,7 +64,7 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names - self._column_length = column_length + self._column_length = data._column_length else: # This code path is performance-critical for copies and should be # modified with care. @@ -82,8 +83,6 @@ def __init__( raise ValueError("All columns must be of equal length") self._data[k] = v self._column_length = column_length - else: - self._column_length = None self.multiindex = multiindex self._level_names = level_names From 0178127205b44383bf4d6a2c3d424512aa80b033 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 17:28:57 -0400 Subject: [PATCH 10/31] CA fix --- python/cudf/cudf/core/column_accessor.py | 29 ++++++++++++++---------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 84a21d78266..bd3e801fbec 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -19,11 +19,7 @@ import cudf from cudf.core import column -from cudf.utils.utils import ( - cached_property, - to_flat_dict, - to_nested_dict, -) +from cudf.utils.utils import cached_property, to_flat_dict, to_nested_dict if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -34,7 +30,6 @@ class ColumnAccessor(MutableMapping): _data: "Dict[Any, ColumnBase]" multiindex: bool _level_names: Tuple[Any, ...] - _column_length: int def __init__( self, @@ -64,15 +59,13 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names - self._column_length = data._column_length else: # This code path is performance-critical for copies and should be # modified with care. self._data = {} if data: data = dict(data) - # Faster than next(iter(data.values())) - column_length = len(data[next(iter(data))]) + column_length = _length_of_first_value(data) for k, v in data.items(): # Much faster to avoid the function call if possible; the # extra isinstance is negligible if we do have to make a @@ -82,8 +75,6 @@ def __init__( if len(v) != column_length: raise ValueError("All columns must be of equal length") self._data[k] = v - self._column_length = column_length - self.multiindex = multiindex self._level_names = level_names @@ -144,6 +135,10 @@ def nrows(self) -> int: else: return len(next(iter(self.values()))) + @cached_property + def _column_length(self) -> int: + return _length_of_first_value(self._data) + @cached_property def names(self) -> Tuple[Any, ...]: return tuple(self.keys()) @@ -164,7 +159,12 @@ def _grouped_data(self) -> MutableMapping: return self._data def _clear_cache(self): - cached_properties = "columns", "names", "_grouped_data" + cached_properties = ( + "columns", + "names", + "_grouped_data", + "_column_length", + ) for attr in cached_properties: try: self.__delattr__(attr) @@ -473,3 +473,8 @@ def _compare_keys(target: Any, key: Any) -> bool: if k1 != k2: return False return True + + +def _length_of_first_value(data: Dict[Any, Any]) -> int: + # faster than next(iter(data.values())): + return 0 if not data else len(data[next(iter(data))]) From efea63dd02a6d143a72bc8e76d012d24a27a8af6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 17:53:07 -0400 Subject: [PATCH 11/31] Prioritize numeric columns --- python/cudf/cudf/core/column/column.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b2b2874eeb4..dd06d97d105 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1017,7 +1017,9 @@ def distinct_count( return cpp_distinct_count(self, ignore_nulls=dropna) def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: - if is_categorical_dtype(dtype): + if is_numerical_dtype(dtype): + return self.as_numerical_column(dtype) + elif is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) elif pd.api.types.pandas_dtype(dtype).type in { np.str_, @@ -1548,6 +1550,16 @@ def build_column( """ dtype = pd.api.types.pandas_dtype(dtype) + if is_numerical_dtype(dtype): + assert data is not None + return cudf.core.column.NumericalColumn( + data=data, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + ) if is_categorical_dtype(dtype): if not len(children) == 1: raise ValueError( @@ -1634,15 +1646,7 @@ def build_column( children=children, ) else: - assert data is not None - return cudf.core.column.NumericalColumn( - data=data, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - ) + raise TypeError(f"Unrecognized dtype: {dtype}") def build_categorical_column( From c3b6444787e29a4536104adad1dc3508b7e5a9dd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 22 Mar 2021 14:56:08 -0700 Subject: [PATCH 12/31] Lazily compute and delete column length on demand. --- python/cudf/cudf/core/column_accessor.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 84a21d78266..f0677618d76 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -64,7 +64,6 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names - self._column_length = data._column_length else: # This code path is performance-critical for copies and should be # modified with care. @@ -82,7 +81,6 @@ def __init__( if len(v) != column_length: raise ValueError("All columns must be of equal length") self._data[k] = v - self._column_length = column_length self.multiindex = multiindex self._level_names = level_names @@ -163,6 +161,13 @@ def _grouped_data(self) -> MutableMapping: else: return self._data + @cached_property + def _column_length(self): + try: + return len(self._data[next(iter(self._data))]) + except StopIteration: + return 0 + def _clear_cache(self): cached_properties = "columns", "names", "_grouped_data" for attr in cached_properties: @@ -171,6 +176,10 @@ def _clear_cache(self): except AttributeError: pass + # Column length should only be cleared if no data is present. + if len(self._data) == 0 and hasattr(self, "_column_length"): + del self._column_length + def to_pandas_index(self) -> pd.Index: """" Convert the keys of the ColumnAccessor to a Pandas Index object. From 01b2cf572d596da54423fdff36beefe1da382bb3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 22 Mar 2021 14:59:42 -0700 Subject: [PATCH 13/31] Remove redundant clear cache in setitem. --- python/cudf/cudf/core/column_accessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index f0677618d76..77445dae3c7 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -93,7 +93,6 @@ def __getitem__(self, key: Any) -> ColumnBase: def __setitem__(self, key: Any, value: Any): self.set_by_label(key, value) - self._clear_cache() def __delitem__(self, key: Any): self._data.__delitem__(key) From 88992581ec04b4092e9ef02edbd03350c31af0fa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 22 Mar 2021 15:06:38 -0700 Subject: [PATCH 14/31] Remove mypy annotation for column length. --- python/cudf/cudf/core/column_accessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 77445dae3c7..44484927985 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -34,7 +34,6 @@ class ColumnAccessor(MutableMapping): _data: "Dict[Any, ColumnBase]" multiindex: bool _level_names: Tuple[Any, ...] - _column_length: int def __init__( self, From 7f8e1cd60525f3a06e064f8fa4bc4d93bb383700 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:01:50 -0400 Subject: [PATCH 15/31] Undo --- python/cudf/cudf/core/column_accessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 2582f7be287..a527713099f 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -167,7 +167,6 @@ def _clear_cache(self): "columns", "names", "_grouped_data", - "_column_length", ) for attr in cached_properties: try: From f2e4609f63389ba44b65284feaba155d4ba9721a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:04:48 -0400 Subject: [PATCH 16/31] Don't validate when copying type metadata --- python/cudf/cudf/core/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e6898b8c606..ecff3dee573 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2408,7 +2408,9 @@ def _copy_type_metadata( for name, col, other_col in zip( self._data.keys(), self._data.values(), other._data.values() ): - self._data[name] = other_col._copy_type_metadata(col) + self._data.set_by_label( + name, other_col._copy_type_metadata(col), validate=False + ) if include_index: if self._index is not None and other._index is not None: From 72598fbcacb046d7485b74f1e801772f4006a526 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:33:43 -0400 Subject: [PATCH 17/31] Prioritize numeric dtypes in is_numerical_dtype --- python/cudf/cudf/utils/dtypes.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1438421bb12..375eccce310 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,16 +144,16 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): - if is_categorical_dtype(obj): - return False - if is_list_dtype(obj): + if np.issubdtype(obj, np.bool_): + return True + elif np.issubdtype(obj, np.floating): + return True + elif np.issubdtype(obj, np.signedinteger): + return True + elif np.issubdtype(obj, np.unsignedinteger): + return True + else: return False - return ( - np.issubdtype(obj, np.bool_) - or np.issubdtype(obj, np.floating) - or np.issubdtype(obj, np.signedinteger) - or np.issubdtype(obj, np.unsignedinteger) - ) def is_string_dtype(obj): From fa220b6d86801ed98c415d48305eb74c0afc9d2e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:47:20 -0400 Subject: [PATCH 18/31] Add unsafe CA ctor --- python/cudf/cudf/_lib/table.pyx | 4 +++- python/cudf/cudf/core/column_accessor.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index f97b45d8abf..8b83de1e31c 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -114,7 +114,9 @@ cdef class Table: for _ in column_names: data_columns.append(Column.from_unique_ptr(move(dereference(it)))) it += 1 - data = dict(zip(column_names, data_columns)) + data = ColumnAccessor._init_unsafe( + dict(zip(column_names, data_columns)) + ) return Table(data=data, index=index) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index a527713099f..50c7dbd8812 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -80,6 +80,19 @@ def __init__( self.multiindex = multiindex self._level_names = level_names + @classmethod + def _init_unsafe( + cls, + data: Dict[Any, ColumnBase], + multiindex: bool = False, + level_names=None, + ) -> ColumnAccessor: + obj = cls() + obj._data = data + obj.multiindex = multiindex + obj._level_names = level_names + return obj + def __iter__(self): return self._data.__iter__() From 3760077f4f004129ca099b9d0ce8861bf3d87520 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:51:47 -0400 Subject: [PATCH 19/31] Revert "Prioritize numeric dtypes in is_numerical_dtype" This reverts commit 72598fbcacb046d7485b74f1e801772f4006a526. --- python/cudf/cudf/utils/dtypes.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 375eccce310..1438421bb12 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,16 +144,16 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): - if np.issubdtype(obj, np.bool_): - return True - elif np.issubdtype(obj, np.floating): - return True - elif np.issubdtype(obj, np.signedinteger): - return True - elif np.issubdtype(obj, np.unsignedinteger): - return True - else: + if is_categorical_dtype(obj): + return False + if is_list_dtype(obj): return False + return ( + np.issubdtype(obj, np.bool_) + or np.issubdtype(obj, np.floating) + or np.issubdtype(obj, np.signedinteger) + or np.issubdtype(obj, np.unsignedinteger) + ) def is_string_dtype(obj): From de9ca28f86b46a6fe9cd93be58e865cd6a8afd96 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 22 Mar 2021 19:49:47 -0700 Subject: [PATCH 20/31] Change error message back so that tests pass. --- python/cudf/cudf/core/column_accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 44484927985..d2bab50a8ba 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -78,7 +78,7 @@ def __init__( if not isinstance(v, column.ColumnBase): v = column.as_column(v) if len(v) != column_length: - raise ValueError("All columns must be of equal length") + raise ValueError("All values must be of equal length") self._data[k] = v self.multiindex = multiindex From e35d03b339dc008e8f264dec9d86a8417f3a77db Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 10:47:05 -0400 Subject: [PATCH 21/31] Faster is_numerical_dtype --- python/cudf/cudf/utils/dtypes.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1438421bb12..8aa0e05bb07 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -148,11 +148,18 @@ def is_numerical_dtype(obj): return False if is_list_dtype(obj): return False + # convert to an np.dtype object first, + # otherwise each of the np.issubdtype() calls + # below will be slow. + try: + dtype = np.dtype(obj) + except TypeError: + return False return ( - np.issubdtype(obj, np.bool_) - or np.issubdtype(obj, np.floating) - or np.issubdtype(obj, np.signedinteger) - or np.issubdtype(obj, np.unsignedinteger) + np.issubdtype(dtype, np.bool_) + or np.issubdtype(dtype, np.floating) + or np.issubdtype(dtype, np.signedinteger) + or np.issubdtype(dtype, np.unsignedinteger) ) From e2fd53369a554cfa887a137261baafdd94854bcd Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 10:51:45 -0400 Subject: [PATCH 22/31] Faster is_numerical_dtype --- python/cudf/cudf/utils/dtypes.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 8aa0e05bb07..225450d84b3 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -148,19 +148,11 @@ def is_numerical_dtype(obj): return False if is_list_dtype(obj): return False - # convert to an np.dtype object first, - # otherwise each of the np.issubdtype() calls - # below will be slow. try: dtype = np.dtype(obj) except TypeError: return False - return ( - np.issubdtype(dtype, np.bool_) - or np.issubdtype(dtype, np.floating) - or np.issubdtype(dtype, np.signedinteger) - or np.issubdtype(dtype, np.unsignedinteger) - ) + return dtype.kind in "biuf" def is_string_dtype(obj): From 64ca702d44d2c75463c19bfd2e7a762e1b7d7717 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 11:12:39 -0400 Subject: [PATCH 23/31] Even faster is_numerical_dtype --- python/cudf/cudf/utils/dtypes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 225450d84b3..4080d9cff9c 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,10 +144,6 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): - if is_categorical_dtype(obj): - return False - if is_list_dtype(obj): - return False try: dtype = np.dtype(obj) except TypeError: From 749edf18897ae1667eb6fa34972f76c25e2bec5f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 11:23:03 -0400 Subject: [PATCH 24/31] Enable fast path for constructing a Buffer from a DeviceBuffer --- python/cudf/cudf/core/buffer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 350346a87f9..9fc5570e35a 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -42,6 +42,10 @@ def __init__( self.ptr = data.ptr self.size = data.size self._owner = owner or data._owner + elif isinstance(data, rmm.DeviceBuffer): + self.ptr = data.ptr + self.size = data.size + self._owner = data elif hasattr(data, "__array_interface__") or hasattr( data, "__cuda_array_interface__" ): From 739ec57975ae1fa2817633da577989162c01ef93 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 23 Mar 2021 09:49:20 -0700 Subject: [PATCH 25/31] Add validation option to insert and standardize error message. --- python/cudf/cudf/core/column_accessor.py | 11 +++++++++-- python/cudf/cudf/tests/test_dataframe.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index d2bab50a8ba..add0570fc8f 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -78,7 +78,7 @@ def __init__( if not isinstance(v, column.ColumnBase): v = column.as_column(v) if len(v) != column_length: - raise ValueError("All values must be of equal length") + raise ValueError("All columns must be of equal length") self._data[k] = v self.multiindex = multiindex @@ -195,7 +195,7 @@ def to_pandas_index(self) -> pd.Index: result = pd.Index(self.names, name=self.name, tupleize_cols=False) return result - def insert(self, name: Any, value: Any, loc: int = -1): + def insert(self, name: Any, value: Any, loc: int = -1, validate: bool = True): """ Insert column into the ColumnAccessor at the specified location. @@ -225,6 +225,13 @@ def insert(self, name: Any, value: Any, loc: int = -1): if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") if loc == len(self._data): + if validate: + value = column.as_column(value) + if len(self._data) > 0: + if len(value) != self._column_length: + raise ValueError("All columns must be of equal length") + else: + self._column_length = len(value) self._data[name] = value else: new_keys = self.names[:loc] + (name,) + self.names[loc:] diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b3ba439cb15..76a02d5e74a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5222,7 +5222,7 @@ def test_memory_usage_multi(): def test_setitem_diff_size_list(list_input, key): gdf = cudf.datasets.randomdata(5) with pytest.raises( - ValueError, match=("All values must be of equal length") + ValueError, match=("All columns must be of equal length") ): gdf[key] = list_input From 498b70ed8b337fd412759504770d85acfe57094b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 23 Mar 2021 10:22:15 -0700 Subject: [PATCH 26/31] Fix style. --- python/cudf/cudf/core/column_accessor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index add0570fc8f..0c580132290 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -195,7 +195,9 @@ def to_pandas_index(self) -> pd.Index: result = pd.Index(self.names, name=self.name, tupleize_cols=False) return result - def insert(self, name: Any, value: Any, loc: int = -1, validate: bool = True): + def insert( + self, name: Any, value: Any, loc: int = -1, validate: bool = True + ): """ Insert column into the ColumnAccessor at the specified location. From 01e13fa62bba2ad0b4b34e5574c2152291d65ee2 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 17:08:04 -0400 Subject: [PATCH 27/31] Undo formatting change --- python/cudf/cudf/core/column_accessor.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index e2233423db4..68ce4c4c070 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -176,11 +176,7 @@ def _column_length(self): return 0 def _clear_cache(self): - cached_properties = ( - "columns", - "names", - "_grouped_data", - ) + cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: try: self.__delattr__(attr) From 89a03013ef99452e7b12bb6380d98f3cde0635ba Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 17:10:16 -0400 Subject: [PATCH 28/31] Add TODO --- python/cudf/cudf/utils/dtypes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 4080d9cff9c..8875a36dba8 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,6 +144,8 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): + # TODO: we should handle objects with a `.dtype` attribute, + # e.g., arrays, here. try: dtype = np.dtype(obj) except TypeError: From 5e73de76451740ce5b52694c1c501f92e7429d25 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 20:21:36 -0400 Subject: [PATCH 29/31] init->create + doc --- python/cudf/cudf/_lib/table.pyx | 2 +- python/cudf/cudf/core/column_accessor.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index 8b83de1e31c..0d6e9c16e8c 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -114,7 +114,7 @@ cdef class Table: for _ in column_names: data_columns.append(Column.from_unique_ptr(move(dereference(it)))) it += 1 - data = ColumnAccessor._init_unsafe( + data = ColumnAccessor._create_unsafe( dict(zip(column_names, data_columns)) ) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 68ce4c4c070..33bae5c1328 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -81,12 +81,14 @@ def __init__( self._level_names = level_names @classmethod - def _init_unsafe( + def _create_unsafe( cls, data: Dict[Any, ColumnBase], multiindex: bool = False, level_names=None, ) -> ColumnAccessor: + # create a ColumnAccessor without verifying column + # type or size obj = cls() obj._data = data obj.multiindex = multiindex From a4fe7b4db5c1356af9c78414035d1611191543bc Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 24 Mar 2021 10:28:53 -0400 Subject: [PATCH 30/31] Use dict comprehension instead of building list --- python/cudf/cudf/_lib/table.pyx | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index 0d6e9c16e8c..83ad9865d08 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -101,21 +101,25 @@ cdef class Table: # First construct the index, if any index = None if index_names is not None: - index_columns = [] - for _ in index_names: - index_columns.append(Column.from_unique_ptr( - move(dereference(it)) - )) - it += 1 - index = Table(dict(zip(index_names, index_columns))) + index_data = ColumnAccessor._create_unsafe( + { + index_names[i]: Column.from_unique_ptr( + move(dereference(it + i)) + ) + for i in range(len(index_names)) + } + ) + index = Table(data=index_data) # Construct the data dict - data_columns = [] - for _ in column_names: - data_columns.append(Column.from_unique_ptr(move(dereference(it)))) - it += 1 + cdef int n_index_columns = len(index_names) if index_names else 0 data = ColumnAccessor._create_unsafe( - dict(zip(column_names, data_columns)) + { + column_names[i]: Column.from_unique_ptr( + move(dereference(it + i + n_index_columns)) + ) + for i in range(len(column_names)) + } ) return Table(data=data, index=index) From eadcc9c73bb3812be6ce8c22d9949356b664c6a2 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 24 Mar 2021 10:32:53 -0400 Subject: [PATCH 31/31] Use enumeration instead --- python/cudf/cudf/_lib/table.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index 83ad9865d08..93d79ba6843 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -99,14 +99,16 @@ cdef class Table: cdef vector[unique_ptr[column]].iterator it = columns.begin() # First construct the index, if any + cdef int i + index = None if index_names is not None: index_data = ColumnAccessor._create_unsafe( { - index_names[i]: Column.from_unique_ptr( + name: Column.from_unique_ptr( move(dereference(it + i)) ) - for i in range(len(index_names)) + for i, name in enumerate(index_names) } ) index = Table(data=index_data) @@ -115,10 +117,10 @@ cdef class Table: cdef int n_index_columns = len(index_names) if index_names else 0 data = ColumnAccessor._create_unsafe( { - column_names[i]: Column.from_unique_ptr( + name: Column.from_unique_ptr( move(dereference(it + i + n_index_columns)) ) - for i in range(len(column_names)) + for i, name in enumerate(column_names) } )