From 935648b01220deac0dc306014ecd82f5191fc6f5 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 19 Mar 2021 13:48:34 -0700
Subject: [PATCH 01/31] Move validation directly into set_by_label and use a
 raw dict to store the columns in the accessor.

---
 python/cudf/cudf/core/column_accessor.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index ad1a0c80ef5..a1de373eb37 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import itertools
-from collections import OrderedDict
 from collections.abc import MutableMapping
 from typing import (
     TYPE_CHECKING,
@@ -18,8 +17,8 @@
 import pandas as pd
 
 import cudf
+from cudf.core import column
 from cudf.utils.utils import (
-    OrderedColumnDict,
     cached_property,
     to_flat_dict,
     to_nested_dict,
@@ -31,7 +30,7 @@
 
 class ColumnAccessor(MutableMapping):
 
-    _data: "OrderedDict[Any, ColumnBase]"
+    _data: "dict[Any, ColumnBase]"
     multiindex: bool
     _level_names: Tuple[Any, ...]
 
@@ -64,7 +63,7 @@ def __init__(
             self.multiindex = multiindex
             self._level_names = level_names
 
-        self._data = OrderedColumnDict(data)
+        self._data = dict(data)
         self.multiindex = multiindex
         self._level_names = level_names
 
@@ -280,6 +279,15 @@ def set_by_label(self, key: Any, value: Any):
         value : column-like
         """
         key = self._pad_key(key)
+
+        # Convert all types to columns and ensure that values are of equal
+        # length.
+        value = column.as_column(value)
+        if len(self._data) > 0:
+            first = next(iter(self._data.values()))
+            if len(value) != len(first):
+                raise ValueError("All columns must be of equal length")
+
         self._data[key] = value
         self._clear_cache()
 

From 806a3ef7740414bf16e21ea8b112982537a6f5ad Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 19 Mar 2021 13:54:27 -0700
Subject: [PATCH 02/31] Remove all references to OrderedColumnDict.

---
 python/cudf/cudf/_lib/table.pyx    |  8 ++++----
 python/cudf/cudf/core/dataframe.py |  3 +--
 python/cudf/cudf/core/frame.py     |  4 ++--
 python/cudf/cudf/utils/utils.py    | 30 ------------------------------
 4 files changed, 7 insertions(+), 38 deletions(-)

diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx
index dba0abb9cf0..f97b45d8abf 100644
--- a/python/cudf/cudf/_lib/table.pyx
+++ b/python/cudf/cudf/_lib/table.pyx
@@ -34,8 +34,8 @@ cdef class Table:
 
         Parameters
         ----------
-        data : OrderedColumnDict
-            An OrderedColumnDict mapping column names to Columns
+        data : dict
+            An dict mapping column names to Columns
         index : Table
             A Table representing the (optional) index columns.
         """
@@ -109,7 +109,7 @@ cdef class Table:
                 it += 1
             index = Table(dict(zip(index_names, index_columns)))
 
-        # Construct the data OrderedColumnDict
+        # Construct the data dict
         data_columns = []
         for _ in column_names:
             data_columns.append(Column.from_unique_ptr(move(dereference(it))))
@@ -154,7 +154,7 @@ cdef class Table:
                 column_idx += 1
             index = Table(dict(zip(index_names, index_columns)))
 
-        # Construct the data OrderedColumnDict
+        # Construct the data dict
         cdef size_type source_column_idx = 0
         data_columns = []
         for _ in column_names:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 18a7f052d62..a04dbb826a8 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -52,7 +52,6 @@
     is_struct_dtype,
     numeric_normalize_types,
 )
-from cudf.utils.utils import OrderedColumnDict
 
 T = TypeVar("T", bound="DataFrame")
 
@@ -4599,7 +4598,7 @@ def hash_columns(self, columns=None):
             table_to_hash = self
         else:
             cols = [self[k]._column for k in columns]
-            table_to_hash = Frame(data=OrderedColumnDict(zip(columns, cols)))
+            table_to_hash = Frame(data=dict(zip(columns, cols)))
 
         return Series(table_to_hash._hash()).values
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 926aad368b0..e33fda3ee09 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -39,8 +39,8 @@ class Frame(libcudf.table.Table):
 
     Parameters
     ----------
-    data : OrderedColumnDict
-        An OrderedColumnDict mapping column names to Columns
+    data : dict
+        An dict mapping column names to Columns
     index : Table
         A Frame representing the (optional) index columns.
     """
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 03a39f6fb4b..ba9fa734248 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -280,36 +280,6 @@ def __get__(self, instance, cls):
             return value
 
 
-class ColumnValuesMappingMixin:
-    """
-    Coerce provided values for the mapping to Columns.
-    """
-
-    def __setitem__(self, key, value):
-
-        value = column.as_column(value)
-        super().__setitem__(key, value)
-
-
-class EqualLengthValuesMappingMixin:
-    """
-    Require all values in the mapping to have the same length.
-    """
-
-    def __setitem__(self, key, value):
-        if len(self) > 0:
-            first = next(iter(self.values()))
-            if len(value) != len(first):
-                raise ValueError("All values must be of equal length")
-        super().__setitem__(key, value)
-
-
-class OrderedColumnDict(
-    ColumnValuesMappingMixin, EqualLengthValuesMappingMixin, OrderedDict
-):
-    pass
-
-
 class NestedMappingMixin:
     """
     Make missing values of a mapping empty instances

From 40a7b173bb3a86bd8d2473121aa519765c442e7f Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 19 Mar 2021 13:58:35 -0700
Subject: [PATCH 03/31] Move validation to separate method and use in both
 set_by_label and constructor.

---
 python/cudf/cudf/core/column_accessor.py | 25 +++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index a1de373eb37..c6b9236f0d0 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -63,7 +63,7 @@ def __init__(
             self.multiindex = multiindex
             self._level_names = level_names
 
-        self._data = dict(data)
+        self._data = {k: self._convert_and_validate(v) for k, v in data.items()}
         self.multiindex = multiindex
         self._level_names = level_names
 
@@ -269,6 +269,18 @@ def select_by_index(self, index: Any) -> ColumnAccessor:
             data, multiindex=self.multiindex, level_names=self.level_names,
         )
 
+    def _convert_and_validate(self, value: Any):
+        # Make sure that the provided value can be stored as a column. This
+        # method will convert the column to an appropriate type and make sure
+        # that it is the same type as other columns in the accessor.
+
+        value = column.as_column(value)
+        if len(self._data) > 0:
+            first = next(iter(self._data.values()))
+            if len(value) != len(first):
+                raise ValueError("All columns must be of equal length")
+        return value
+
     def set_by_label(self, key: Any, value: Any):
         """
         Add (or modify) column by name.
@@ -279,16 +291,7 @@ def set_by_label(self, key: Any, value: Any):
         value : column-like
         """
         key = self._pad_key(key)
-
-        # Convert all types to columns and ensure that values are of equal
-        # length.
-        value = column.as_column(value)
-        if len(self._data) > 0:
-            first = next(iter(self._data.values()))
-            if len(value) != len(first):
-                raise ValueError("All columns must be of equal length")
-
-        self._data[key] = value
+        self._data[key] = self._convert_and_validate(value)
         self._clear_cache()
 
     def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:

From a1c576ebe2cb9e5e344a9aaa00a7c2ef4044c5c6 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 19 Mar 2021 14:02:25 -0700
Subject: [PATCH 04/31] Format with black.

---
 python/cudf/cudf/core/column_accessor.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index c6b9236f0d0..fe8058c31ce 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -63,7 +63,9 @@ def __init__(
             self.multiindex = multiindex
             self._level_names = level_names
 
-        self._data = {k: self._convert_and_validate(v) for k, v in data.items()}
+        self._data = {
+            k: self._convert_and_validate(v) for k, v in data.items()
+        }
         self.multiindex = multiindex
         self._level_names = level_names
 

From 788d9d6a0bd1e8254dba31b3e085ed56abec0160 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 19 Mar 2021 14:08:21 -0700
Subject: [PATCH 05/31] Expose parameter to make validation optional.

---
 python/cudf/cudf/core/column_accessor.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index fe8058c31ce..38832396f1f 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -283,17 +283,24 @@ def _convert_and_validate(self, value: Any):
                 raise ValueError("All columns must be of equal length")
         return value
 
-    def set_by_label(self, key: Any, value: Any):
+    def set_by_label(self, key: Any, value: Any, validate: bool = True):
         """
         Add (or modify) column by name.
 
         Parameters
         ----------
-        key : name of the column
+        key
+            name of the column
         value : column-like
+            The value to insert into the column.
+        validate : bool
+            If True, the provided value will be coerced to a column and
+            validated before setting (Default value = True).
         """
         key = self._pad_key(key)
-        self._data[key] = self._convert_and_validate(value)
+        if validate:
+            value = self._convert_and_validate(value)
+        self._data[key] = value
         self._clear_cache()
 
     def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:

From 6a64285f4e36a7437c550cf57ccd30dcf694f2e0 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 19 Mar 2021 14:13:55 -0700
Subject: [PATCH 06/31] Coerce constructor input to dict before calling items.

---
 python/cudf/cudf/core/column_accessor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 38832396f1f..034c74393b1 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -64,7 +64,7 @@ def __init__(
             self._level_names = level_names
 
         self._data = {
-            k: self._convert_and_validate(v) for k, v in data.items()
+            k: self._convert_and_validate(v) for k, v in dict(data).items()
         }
         self.multiindex = multiindex
         self._level_names = level_names

From e7d09812a3ee18a3ce8cbe3208c9a526d353a38d Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 19 Mar 2021 14:21:47 -0700
Subject: [PATCH 07/31] Make construction safe.

---
 python/cudf/cudf/core/column_accessor.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 034c74393b1..2b5ed21b010 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -63,9 +63,12 @@ def __init__(
             self.multiindex = multiindex
             self._level_names = level_names
 
-        self._data = {
-            k: self._convert_and_validate(v) for k, v in dict(data).items()
-        }
+        # Explicitly initialize an empty data dict so that we can validate each
+        # new column.
+        self._data = {}
+        for k, v in dict(data).items():
+            self._data[k] =  self._convert_and_validate(v)
+
         self.multiindex = multiindex
         self._level_names = level_names
 
@@ -277,7 +280,7 @@ def _convert_and_validate(self, value: Any):
         # that it is the same type as other columns in the accessor.
 
         value = column.as_column(value)
-        if len(self._data) > 0:
+        if hasattr(self, '_data') and len(self._data) > 0:
             first = next(iter(self._data.values()))
             if len(value) != len(first):
                 raise ValueError("All columns must be of equal length")

From c39932c4dc46e35c262440b76d16d0ed3733e8c3 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 19 Mar 2021 15:39:01 -0700
Subject: [PATCH 08/31] Final cleanup and documentation.

---
 python/cudf/cudf/core/column_accessor.py | 52 ++++++++++++++----------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 2b5ed21b010..c175a6d9da7 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -33,6 +33,7 @@ class ColumnAccessor(MutableMapping):
     _data: "dict[Any, ColumnBase]"
     multiindex: bool
     _level_names: Tuple[Any, ...]
+    _column_length: int
 
     def __init__(
         self,
@@ -62,15 +63,30 @@ def __init__(
             self._data = data._data
             self.multiindex = multiindex
             self._level_names = level_names
+            self._column_length = column_length
+        else:
+            # This code path is performance-critical for copies and should be
+            # modified with care.
+            self._data = {}
+            if data:
+                data = dict(data)
+                # Faster than next(iter(data.values()))
+                column_length = len(data[next(iter(data))])
+                for k, v in data.items():
+                    # Much faster to avoid the function call if possible; the
+                    # extra isinstance is negligible if we do have to make a
+                    # column from something else.
+                    if not isinstance(v, column.ColumnBase):
+                        v = column.as_column(v)
+                    if len(v) != column_length:
+                        raise ValueError("All columns must be of equal length")
+                    self._data[k] = v
+                self._column_length = column_length
+            else:
+                self._column_length = None
 
-        # Explicitly initialize an empty data dict so that we can validate each
-        # new column.
-        self._data = {}
-        for k, v in dict(data).items():
-            self._data[k] =  self._convert_and_validate(v)
-
-        self.multiindex = multiindex
-        self._level_names = level_names
+            self.multiindex = multiindex
+            self._level_names = level_names
 
     def __iter__(self):
         return self._data.__iter__()
@@ -274,18 +290,6 @@ def select_by_index(self, index: Any) -> ColumnAccessor:
             data, multiindex=self.multiindex, level_names=self.level_names,
         )
 
-    def _convert_and_validate(self, value: Any):
-        # Make sure that the provided value can be stored as a column. This
-        # method will convert the column to an appropriate type and make sure
-        # that it is the same type as other columns in the accessor.
-
-        value = column.as_column(value)
-        if hasattr(self, '_data') and len(self._data) > 0:
-            first = next(iter(self._data.values()))
-            if len(value) != len(first):
-                raise ValueError("All columns must be of equal length")
-        return value
-
     def set_by_label(self, key: Any, value: Any, validate: bool = True):
         """
         Add (or modify) column by name.
@@ -302,7 +306,13 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True):
         """
         key = self._pad_key(key)
         if validate:
-            value = self._convert_and_validate(value)
+            value = column.as_column(value)
+            if len(self._data) > 0:
+                if len(value) != self._column_length:
+                    raise ValueError("All columns must be of equal length")
+            else:
+                self._column_length = len(value)
+
         self._data[key] = value
         self._clear_cache()
 

From 4ff09fcf66566ac9aaf3b6df75cf2c60e96c060e Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 19 Mar 2021 15:57:35 -0700
Subject: [PATCH 09/31] Address style issues.

---
 python/cudf/cudf/core/column_accessor.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index c175a6d9da7..6988efeafa7 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -8,6 +8,7 @@
     TYPE_CHECKING,
     Any,
     Callable,
+    Dict,
     Mapping,
     Optional,
     Tuple,
@@ -30,7 +31,7 @@
 
 class ColumnAccessor(MutableMapping):
 
-    _data: "dict[Any, ColumnBase]"
+    _data: "Dict[Any, ColumnBase]"
     multiindex: bool
     _level_names: Tuple[Any, ...]
     _column_length: int
@@ -63,7 +64,7 @@ def __init__(
             self._data = data._data
             self.multiindex = multiindex
             self._level_names = level_names
-            self._column_length = column_length
+            self._column_length = data._column_length
         else:
             # This code path is performance-critical for copies and should be
             # modified with care.
@@ -82,8 +83,6 @@ def __init__(
                         raise ValueError("All columns must be of equal length")
                     self._data[k] = v
                 self._column_length = column_length
-            else:
-                self._column_length = None
 
             self.multiindex = multiindex
             self._level_names = level_names

From 0178127205b44383bf4d6a2c3d424512aa80b033 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 22 Mar 2021 17:28:57 -0400
Subject: [PATCH 10/31] CA  fix

---
 python/cudf/cudf/core/column_accessor.py | 29 ++++++++++++++----------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 84a21d78266..bd3e801fbec 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -19,11 +19,7 @@
 
 import cudf
 from cudf.core import column
-from cudf.utils.utils import (
-    cached_property,
-    to_flat_dict,
-    to_nested_dict,
-)
+from cudf.utils.utils import cached_property, to_flat_dict, to_nested_dict
 
 if TYPE_CHECKING:
     from cudf.core.column import ColumnBase
@@ -34,7 +30,6 @@ class ColumnAccessor(MutableMapping):
     _data: "Dict[Any, ColumnBase]"
     multiindex: bool
     _level_names: Tuple[Any, ...]
-    _column_length: int
 
     def __init__(
         self,
@@ -64,15 +59,13 @@ def __init__(
             self._data = data._data
             self.multiindex = multiindex
             self._level_names = level_names
-            self._column_length = data._column_length
         else:
             # This code path is performance-critical for copies and should be
             # modified with care.
             self._data = {}
             if data:
                 data = dict(data)
-                # Faster than next(iter(data.values()))
-                column_length = len(data[next(iter(data))])
+                column_length = _length_of_first_value(data)
                 for k, v in data.items():
                     # Much faster to avoid the function call if possible; the
                     # extra isinstance is negligible if we do have to make a
@@ -82,8 +75,6 @@ def __init__(
                     if len(v) != column_length:
                         raise ValueError("All columns must be of equal length")
                     self._data[k] = v
-                self._column_length = column_length
-
             self.multiindex = multiindex
             self._level_names = level_names
 
@@ -144,6 +135,10 @@ def nrows(self) -> int:
         else:
             return len(next(iter(self.values())))
 
+    @cached_property
+    def _column_length(self) -> int:
+        return _length_of_first_value(self._data)
+
     @cached_property
     def names(self) -> Tuple[Any, ...]:
         return tuple(self.keys())
@@ -164,7 +159,12 @@ def _grouped_data(self) -> MutableMapping:
             return self._data
 
     def _clear_cache(self):
-        cached_properties = "columns", "names", "_grouped_data"
+        cached_properties = (
+            "columns",
+            "names",
+            "_grouped_data",
+            "_column_length",
+        )
         for attr in cached_properties:
             try:
                 self.__delattr__(attr)
@@ -473,3 +473,8 @@ def _compare_keys(target: Any, key: Any) -> bool:
         if k1 != k2:
             return False
     return True
+
+
+def _length_of_first_value(data: Dict[Any, Any]) -> int:
+    # faster than next(iter(data.values())):
+    return 0 if not data else len(data[next(iter(data))])

From efea63dd02a6d143a72bc8e76d012d24a27a8af6 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 22 Mar 2021 17:53:07 -0400
Subject: [PATCH 11/31] Prioritize numeric columns

---
 python/cudf/cudf/core/column/column.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index b2b2874eeb4..dd06d97d105 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1017,7 +1017,9 @@ def distinct_count(
         return cpp_distinct_count(self, ignore_nulls=dropna)
 
     def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
-        if is_categorical_dtype(dtype):
+        if is_numerical_dtype(dtype):
+            return self.as_numerical_column(dtype)
+        elif is_categorical_dtype(dtype):
             return self.as_categorical_column(dtype, **kwargs)
         elif pd.api.types.pandas_dtype(dtype).type in {
             np.str_,
@@ -1548,6 +1550,16 @@ def build_column(
     """
     dtype = pd.api.types.pandas_dtype(dtype)
 
+    if is_numerical_dtype(dtype):
+        assert data is not None
+        return cudf.core.column.NumericalColumn(
+            data=data,
+            dtype=dtype,
+            mask=mask,
+            size=size,
+            offset=offset,
+            null_count=null_count,
+        )
     if is_categorical_dtype(dtype):
         if not len(children) == 1:
             raise ValueError(
@@ -1634,15 +1646,7 @@ def build_column(
             children=children,
         )
     else:
-        assert data is not None
-        return cudf.core.column.NumericalColumn(
-            data=data,
-            dtype=dtype,
-            mask=mask,
-            size=size,
-            offset=offset,
-            null_count=null_count,
-        )
+        raise TypeError(f"Unrecognized dtype: {dtype}")
 
 
 def build_categorical_column(

From c3b6444787e29a4536104adad1dc3508b7e5a9dd Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 22 Mar 2021 14:56:08 -0700
Subject: [PATCH 12/31] Lazily compute and delete column length on demand.

---
 python/cudf/cudf/core/column_accessor.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 84a21d78266..f0677618d76 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -64,7 +64,6 @@ def __init__(
             self._data = data._data
             self.multiindex = multiindex
             self._level_names = level_names
-            self._column_length = data._column_length
         else:
             # This code path is performance-critical for copies and should be
             # modified with care.
@@ -82,7 +81,6 @@ def __init__(
                     if len(v) != column_length:
                         raise ValueError("All columns must be of equal length")
                     self._data[k] = v
-                self._column_length = column_length
 
             self.multiindex = multiindex
             self._level_names = level_names
@@ -163,6 +161,13 @@ def _grouped_data(self) -> MutableMapping:
         else:
             return self._data
 
+    @cached_property
+    def _column_length(self):
+        try:
+            return len(self._data[next(iter(self._data))])
+        except StopIteration:
+            return 0
+
     def _clear_cache(self):
         cached_properties = "columns", "names", "_grouped_data"
         for attr in cached_properties:
@@ -171,6 +176,10 @@ def _clear_cache(self):
             except AttributeError:
                 pass
 
+        # Column length should only be cleared if no data is present.
+        if len(self._data) == 0 and hasattr(self, "_column_length"):
+            del self._column_length
+
     def to_pandas_index(self) -> pd.Index:
         """"
         Convert the keys of the ColumnAccessor to a Pandas Index object.

From 01b2cf572d596da54423fdff36beefe1da382bb3 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 22 Mar 2021 14:59:42 -0700
Subject: [PATCH 13/31] Remove redundant clear cache in setitem.

---
 python/cudf/cudf/core/column_accessor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index f0677618d76..77445dae3c7 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -93,7 +93,6 @@ def __getitem__(self, key: Any) -> ColumnBase:
 
     def __setitem__(self, key: Any, value: Any):
         self.set_by_label(key, value)
-        self._clear_cache()
 
     def __delitem__(self, key: Any):
         self._data.__delitem__(key)

From 88992581ec04b4092e9ef02edbd03350c31af0fa Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 22 Mar 2021 15:06:38 -0700
Subject: [PATCH 14/31] Remove mypy annotation for column length.

---
 python/cudf/cudf/core/column_accessor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 77445dae3c7..44484927985 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -34,7 +34,6 @@ class ColumnAccessor(MutableMapping):
     _data: "Dict[Any, ColumnBase]"
     multiindex: bool
     _level_names: Tuple[Any, ...]
-    _column_length: int
 
     def __init__(
         self,

From 7f8e1cd60525f3a06e064f8fa4bc4d93bb383700 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 22 Mar 2021 19:01:50 -0400
Subject: [PATCH 15/31] Undo

---
 python/cudf/cudf/core/column_accessor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 2582f7be287..a527713099f 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -167,7 +167,6 @@ def _clear_cache(self):
             "columns",
             "names",
             "_grouped_data",
-            "_column_length",
         )
         for attr in cached_properties:
             try:

From f2e4609f63389ba44b65284feaba155d4ba9721a Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 22 Mar 2021 19:04:48 -0400
Subject: [PATCH 16/31] Don't validate when copying type metadata

---
 python/cudf/cudf/core/frame.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index e6898b8c606..ecff3dee573 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -2408,7 +2408,9 @@ def _copy_type_metadata(
         for name, col, other_col in zip(
             self._data.keys(), self._data.values(), other._data.values()
         ):
-            self._data[name] = other_col._copy_type_metadata(col)
+            self._data.set_by_label(
+                name, other_col._copy_type_metadata(col), validate=False
+            )
 
         if include_index:
             if self._index is not None and other._index is not None:

From 72598fbcacb046d7485b74f1e801772f4006a526 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 22 Mar 2021 19:33:43 -0400
Subject: [PATCH 17/31] Prioritize numeric dtypes in is_numerical_dtype

---
 python/cudf/cudf/utils/dtypes.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 1438421bb12..375eccce310 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -144,16 +144,16 @@ def numeric_normalize_types(*args):
 
 
 def is_numerical_dtype(obj):
-    if is_categorical_dtype(obj):
-        return False
-    if is_list_dtype(obj):
+    if np.issubdtype(obj, np.bool_):
+        return True
+    elif np.issubdtype(obj, np.floating):
+        return True
+    elif np.issubdtype(obj, np.signedinteger):
+        return True
+    elif np.issubdtype(obj, np.unsignedinteger):
+        return True
+    else:
         return False
-    return (
-        np.issubdtype(obj, np.bool_)
-        or np.issubdtype(obj, np.floating)
-        or np.issubdtype(obj, np.signedinteger)
-        or np.issubdtype(obj, np.unsignedinteger)
-    )
 
 
 def is_string_dtype(obj):

From fa220b6d86801ed98c415d48305eb74c0afc9d2e Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 22 Mar 2021 19:47:20 -0400
Subject: [PATCH 18/31] Add unsafe CA ctor

---
 python/cudf/cudf/_lib/table.pyx          |  4 +++-
 python/cudf/cudf/core/column_accessor.py | 13 +++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx
index f97b45d8abf..8b83de1e31c 100644
--- a/python/cudf/cudf/_lib/table.pyx
+++ b/python/cudf/cudf/_lib/table.pyx
@@ -114,7 +114,9 @@ cdef class Table:
         for _ in column_names:
             data_columns.append(Column.from_unique_ptr(move(dereference(it))))
             it += 1
-        data = dict(zip(column_names, data_columns))
+        data = ColumnAccessor._init_unsafe(
+            dict(zip(column_names, data_columns))
+        )
 
         return Table(data=data, index=index)
 
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index a527713099f..50c7dbd8812 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -80,6 +80,19 @@ def __init__(
             self.multiindex = multiindex
             self._level_names = level_names
 
+    @classmethod
+    def _init_unsafe(
+        cls,
+        data: Dict[Any, ColumnBase],
+        multiindex: bool = False,
+        level_names=None,
+    ) -> ColumnAccessor:
+        obj = cls()
+        obj._data = data
+        obj.multiindex = multiindex
+        obj._level_names = level_names
+        return obj
+
     def __iter__(self):
         return self._data.__iter__()
 

From 3760077f4f004129ca099b9d0ce8861bf3d87520 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 22 Mar 2021 19:51:47 -0400
Subject: [PATCH 19/31] Revert "Prioritize numeric dtypes in
 is_numerical_dtype"

This reverts commit 72598fbcacb046d7485b74f1e801772f4006a526.
---
 python/cudf/cudf/utils/dtypes.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 375eccce310..1438421bb12 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -144,16 +144,16 @@ def numeric_normalize_types(*args):
 
 
 def is_numerical_dtype(obj):
-    if np.issubdtype(obj, np.bool_):
-        return True
-    elif np.issubdtype(obj, np.floating):
-        return True
-    elif np.issubdtype(obj, np.signedinteger):
-        return True
-    elif np.issubdtype(obj, np.unsignedinteger):
-        return True
-    else:
+    if is_categorical_dtype(obj):
+        return False
+    if is_list_dtype(obj):
         return False
+    return (
+        np.issubdtype(obj, np.bool_)
+        or np.issubdtype(obj, np.floating)
+        or np.issubdtype(obj, np.signedinteger)
+        or np.issubdtype(obj, np.unsignedinteger)
+    )
 
 
 def is_string_dtype(obj):

From de9ca28f86b46a6fe9cd93be58e865cd6a8afd96 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 22 Mar 2021 19:49:47 -0700
Subject: [PATCH 20/31] Change error message back so that tests pass.

---
 python/cudf/cudf/core/column_accessor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 44484927985..d2bab50a8ba 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -78,7 +78,7 @@ def __init__(
                     if not isinstance(v, column.ColumnBase):
                         v = column.as_column(v)
                     if len(v) != column_length:
-                        raise ValueError("All columns must be of equal length")
+                        raise ValueError("All values must be of equal length")
                     self._data[k] = v
 
             self.multiindex = multiindex

From e35d03b339dc008e8f264dec9d86a8417f3a77db Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 23 Mar 2021 10:47:05 -0400
Subject: [PATCH 21/31] Faster is_numerical_dtype

---
 python/cudf/cudf/utils/dtypes.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 1438421bb12..8aa0e05bb07 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -148,11 +148,18 @@ def is_numerical_dtype(obj):
         return False
     if is_list_dtype(obj):
         return False
+    # convert to an np.dtype object first,
+    # otherwise each of the np.issubdtype() calls
+    # below will be slow.
+    try:
+        dtype = np.dtype(obj)
+    except TypeError:
+        return False
     return (
-        np.issubdtype(obj, np.bool_)
-        or np.issubdtype(obj, np.floating)
-        or np.issubdtype(obj, np.signedinteger)
-        or np.issubdtype(obj, np.unsignedinteger)
+        np.issubdtype(dtype, np.bool_)
+        or np.issubdtype(dtype, np.floating)
+        or np.issubdtype(dtype, np.signedinteger)
+        or np.issubdtype(dtype, np.unsignedinteger)
     )
 
 

From e2fd53369a554cfa887a137261baafdd94854bcd Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 23 Mar 2021 10:51:45 -0400
Subject: [PATCH 22/31] Faster is_numerical_dtype

---
 python/cudf/cudf/utils/dtypes.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 8aa0e05bb07..225450d84b3 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -148,19 +148,11 @@ def is_numerical_dtype(obj):
         return False
     if is_list_dtype(obj):
         return False
-    # convert to an np.dtype object first,
-    # otherwise each of the np.issubdtype() calls
-    # below will be slow.
     try:
         dtype = np.dtype(obj)
     except TypeError:
         return False
-    return (
-        np.issubdtype(dtype, np.bool_)
-        or np.issubdtype(dtype, np.floating)
-        or np.issubdtype(dtype, np.signedinteger)
-        or np.issubdtype(dtype, np.unsignedinteger)
-    )
+    return dtype.kind in "biuf"
 
 
 def is_string_dtype(obj):

From 64ca702d44d2c75463c19bfd2e7a762e1b7d7717 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 23 Mar 2021 11:12:39 -0400
Subject: [PATCH 23/31] Even faster is_numerical_dtype

---
 python/cudf/cudf/utils/dtypes.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 225450d84b3..4080d9cff9c 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -144,10 +144,6 @@ def numeric_normalize_types(*args):
 
 
 def is_numerical_dtype(obj):
-    if is_categorical_dtype(obj):
-        return False
-    if is_list_dtype(obj):
-        return False
     try:
         dtype = np.dtype(obj)
     except TypeError:

From 749edf18897ae1667eb6fa34972f76c25e2bec5f Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 23 Mar 2021 11:23:03 -0400
Subject: [PATCH 24/31] Enable fast path for constructing a Buffer from a
 DeviceBuffer

---
 python/cudf/cudf/core/buffer.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py
index 350346a87f9..9fc5570e35a 100644
--- a/python/cudf/cudf/core/buffer.py
+++ b/python/cudf/cudf/core/buffer.py
@@ -42,6 +42,10 @@ def __init__(
             self.ptr = data.ptr
             self.size = data.size
             self._owner = owner or data._owner
+        elif isinstance(data, rmm.DeviceBuffer):
+            self.ptr = data.ptr
+            self.size = data.size
+            self._owner = data
         elif hasattr(data, "__array_interface__") or hasattr(
             data, "__cuda_array_interface__"
         ):

From 739ec57975ae1fa2817633da577989162c01ef93 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 23 Mar 2021 09:49:20 -0700
Subject: [PATCH 25/31] Add validation option to insert and standardize error
 message.

---
 python/cudf/cudf/core/column_accessor.py | 11 +++++++++--
 python/cudf/cudf/tests/test_dataframe.py |  2 +-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index d2bab50a8ba..add0570fc8f 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -78,7 +78,7 @@ def __init__(
                     if not isinstance(v, column.ColumnBase):
                         v = column.as_column(v)
                     if len(v) != column_length:
-                        raise ValueError("All values must be of equal length")
+                        raise ValueError("All columns must be of equal length")
                     self._data[k] = v
 
             self.multiindex = multiindex
@@ -195,7 +195,7 @@ def to_pandas_index(self) -> pd.Index:
             result = pd.Index(self.names, name=self.name, tupleize_cols=False)
         return result
 
-    def insert(self, name: Any, value: Any, loc: int = -1):
+    def insert(self, name: Any, value: Any, loc: int = -1, validate: bool = True):
         """
         Insert column into the ColumnAccessor at the specified location.
 
@@ -225,6 +225,13 @@ def insert(self, name: Any, value: Any, loc: int = -1):
         if name in self._data:
             raise ValueError(f"Cannot insert '{name}', already exists")
         if loc == len(self._data):
+            if validate:
+                value = column.as_column(value)
+                if len(self._data) > 0:
+                    if len(value) != self._column_length:
+                        raise ValueError("All columns must be of equal length")
+                else:
+                    self._column_length = len(value)
             self._data[name] = value
         else:
             new_keys = self.names[:loc] + (name,) + self.names[loc:]
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index b3ba439cb15..76a02d5e74a 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -5222,7 +5222,7 @@ def test_memory_usage_multi():
 def test_setitem_diff_size_list(list_input, key):
     gdf = cudf.datasets.randomdata(5)
     with pytest.raises(
-        ValueError, match=("All values must be of equal length")
+        ValueError, match=("All columns must be of equal length")
     ):
         gdf[key] = list_input
 

From 498b70ed8b337fd412759504770d85acfe57094b Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 23 Mar 2021 10:22:15 -0700
Subject: [PATCH 26/31] Fix style.

---
 python/cudf/cudf/core/column_accessor.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index add0570fc8f..0c580132290 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -195,7 +195,9 @@ def to_pandas_index(self) -> pd.Index:
             result = pd.Index(self.names, name=self.name, tupleize_cols=False)
         return result
 
-    def insert(self, name: Any, value: Any, loc: int = -1, validate: bool = True):
+    def insert(
+        self, name: Any, value: Any, loc: int = -1, validate: bool = True
+    ):
         """
         Insert column into the ColumnAccessor at the specified location.
 

From 01e13fa62bba2ad0b4b34e5574c2152291d65ee2 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 23 Mar 2021 17:08:04 -0400
Subject: [PATCH 27/31] Undo formatting change

---
 python/cudf/cudf/core/column_accessor.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index e2233423db4..68ce4c4c070 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -176,11 +176,7 @@ def _column_length(self):
             return 0
 
     def _clear_cache(self):
-        cached_properties = (
-            "columns",
-            "names",
-            "_grouped_data",
-        )
+        cached_properties = ("columns", "names", "_grouped_data")
         for attr in cached_properties:
             try:
                 self.__delattr__(attr)

From 89a03013ef99452e7b12bb6380d98f3cde0635ba Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 23 Mar 2021 17:10:16 -0400
Subject: [PATCH 28/31] Add TODO

---
 python/cudf/cudf/utils/dtypes.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 4080d9cff9c..8875a36dba8 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -144,6 +144,8 @@ def numeric_normalize_types(*args):
 
 
 def is_numerical_dtype(obj):
+    # TODO: we should handle objects with a `.dtype` attribute,
+    # e.g., arrays, here.
     try:
         dtype = np.dtype(obj)
     except TypeError:

From 5e73de76451740ce5b52694c1c501f92e7429d25 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 23 Mar 2021 20:21:36 -0400
Subject: [PATCH 29/31] init->create + doc

---
 python/cudf/cudf/_lib/table.pyx          | 2 +-
 python/cudf/cudf/core/column_accessor.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx
index 8b83de1e31c..0d6e9c16e8c 100644
--- a/python/cudf/cudf/_lib/table.pyx
+++ b/python/cudf/cudf/_lib/table.pyx
@@ -114,7 +114,7 @@ cdef class Table:
         for _ in column_names:
             data_columns.append(Column.from_unique_ptr(move(dereference(it))))
             it += 1
-        data = ColumnAccessor._init_unsafe(
+        data = ColumnAccessor._create_unsafe(
             dict(zip(column_names, data_columns))
         )
 
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 68ce4c4c070..33bae5c1328 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -81,12 +81,14 @@ def __init__(
             self._level_names = level_names
 
     @classmethod
-    def _init_unsafe(
+    def _create_unsafe(
         cls,
         data: Dict[Any, ColumnBase],
         multiindex: bool = False,
         level_names=None,
     ) -> ColumnAccessor:
+        # create a ColumnAccessor without verifying column
+        # type or size
         obj = cls()
         obj._data = data
         obj.multiindex = multiindex

From a4fe7b4db5c1356af9c78414035d1611191543bc Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Wed, 24 Mar 2021 10:28:53 -0400
Subject: [PATCH 30/31] Use dict comprehension instead  of building list

---
 python/cudf/cudf/_lib/table.pyx | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx
index 0d6e9c16e8c..83ad9865d08 100644
--- a/python/cudf/cudf/_lib/table.pyx
+++ b/python/cudf/cudf/_lib/table.pyx
@@ -101,21 +101,25 @@ cdef class Table:
         # First construct the index, if any
         index = None
         if index_names is not None:
-            index_columns = []
-            for _ in index_names:
-                index_columns.append(Column.from_unique_ptr(
-                    move(dereference(it))
-                ))
-                it += 1
-            index = Table(dict(zip(index_names, index_columns)))
+            index_data = ColumnAccessor._create_unsafe(
+                {
+                    index_names[i]: Column.from_unique_ptr(
+                        move(dereference(it + i))
+                    )
+                    for i in range(len(index_names))
+                }
+            )
+            index = Table(data=index_data)
 
         # Construct the data dict
-        data_columns = []
-        for _ in column_names:
-            data_columns.append(Column.from_unique_ptr(move(dereference(it))))
-            it += 1
+        cdef int n_index_columns = len(index_names) if index_names else 0
         data = ColumnAccessor._create_unsafe(
-            dict(zip(column_names, data_columns))
+            {
+                column_names[i]: Column.from_unique_ptr(
+                    move(dereference(it + i + n_index_columns))
+                )
+                for i in range(len(column_names))
+            }
         )
 
         return Table(data=data, index=index)

From eadcc9c73bb3812be6ce8c22d9949356b664c6a2 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Wed, 24 Mar 2021 10:32:53 -0400
Subject: [PATCH 31/31] Use enumeration instead

---
 python/cudf/cudf/_lib/table.pyx | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx
index 83ad9865d08..93d79ba6843 100644
--- a/python/cudf/cudf/_lib/table.pyx
+++ b/python/cudf/cudf/_lib/table.pyx
@@ -99,14 +99,16 @@ cdef class Table:
         cdef vector[unique_ptr[column]].iterator it = columns.begin()
 
         # First construct the index, if any
+        cdef int i
+
         index = None
         if index_names is not None:
             index_data = ColumnAccessor._create_unsafe(
                 {
-                    index_names[i]: Column.from_unique_ptr(
+                    name: Column.from_unique_ptr(
                         move(dereference(it + i))
                     )
-                    for i in range(len(index_names))
+                    for i, name in enumerate(index_names)
                 }
             )
             index = Table(data=index_data)
@@ -115,10 +117,10 @@ cdef class Table:
         cdef int n_index_columns = len(index_names) if index_names else 0
         data = ColumnAccessor._create_unsafe(
             {
-                column_names[i]: Column.from_unique_ptr(
+                name: Column.from_unique_ptr(
                     move(dereference(it + i + n_index_columns))
                 )
-                for i in range(len(column_names))
+                for i, name in enumerate(column_names)
             }
         )