Add __repr__ for Column and ColumnAccessor (#7531)

## Summary:

* Add a `__repr__` for Column (thin wrapper around the `__repr__` of the underlying pa.Array)
* Add a `__repr__` for ColumnAccessor (similar to pa.Table, shows the names/types of the columns of the ColumnAccessor)

## Additional info:

Debugging is sometimes made painful by the fact that we don't have a `__repr__` for columns and column accessors. For example, here's what a `ColumnAccessor` and a `Column` currently look like when printed:

```python
In [2]: cudf.DataFrame({'a': [1, 2, 3], "b": [4, 5, 6], "z_1": [2, 3, 4]})._data
Out[2]: ColumnAccessor(OrderedColumnDict([('a', <cudf.core.column.numerical.NumericalColumn object at 0x7f0306336f80>), ('b', <cudf.core.column.numerical.NumericalColumn object at 0x7f03062a05f0>), ('z_1', <cudf.core.column.numerical.NumericalColumn object at 0x7f03062a0e60>)]), multiindex=False, level_names=(None,))

In [3]: cudf.Series([1, 2, None, 3])._column
Out[3]: <cudf.core.column.numerical.NumericalColumn at 0x7f2190746710>
```

After this PR:

```python
In [2]: cudf.DataFrame({'a': [1, 2, 3], "b": [4, 5, 6], "z_1": [2, 3, 4]})._data
Out[2]:
ColumnAccessor(multiindex=False, level_names=(None,))
a: int64
b: int64
z_1: int64

In [3]: cudf.Series([1, 2, None, 3])._column
Out[3]:
<cudf.core.column.numerical.NumericalColumn object at 0x7f3e90c2ac20>
[
  1,
  2,
  null,
  3
]
dtype: int64
```

Authors:
- Ashwin Srinath (@shwina)

Approvers:
- Keith Kraus (@kkraus14)

URL: #7531
rapidsai · Mar 12, 2021 · 5c4fa28 · 5c4fa28
1 parent 365e649
commit 5c4fa28
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 10 deletions.
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -109,6 +109,13 @@ def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray":
     def __len__(self) -> int:
         return self.size
 
+    def __repr__(self):
+        return (
+            f"{object.__repr__(self)}\n"
+            f"{self.to_arrow().to_string()}\n"
+            f"dtype: {self.dtype}"
+        )
+
     def to_pandas(
         self, index: ColumnLike = None, nullable: bool = False, **kwargs
     ) -> "pd.Series":

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
@@ -86,15 +86,15 @@ def __len__(self) -> int:
         return len(self._data)
 
     def __repr__(self) -> str:
-        data_repr = self._data.__repr__()
-        multiindex_repr = self.multiindex.__repr__()
-        level_names_repr = self.level_names.__repr__()
-        return "{}({}, multiindex={}, level_names={})".format(
-            self.__class__.__name__,
-            data_repr,
-            multiindex_repr,
-            level_names_repr,
+        type_info = (
+            f"{self.__class__.__name__}("
+            f"multiindex={self.multiindex}, "
+            f"level_names={self.level_names})"
         )
+        column_info = "\n".join(
+            [f"{name}: {col.dtype}" for name, col in self.items()]
+        )
+        return f"{type_info}\n{column_info}"
 
     @property
     def level_names(self) -> Tuple[Any, ...]:

diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
+from pickle import dumps
+
 import cachetools
 import cupy
 import numpy as np
 from numba import cuda
-from pickle import dumps
 
 import cudf
 from cudf.utils.utils import check_equals_float, check_equals_int
@@ -239,7 +240,7 @@ def grouped_window_sizes_from_offset(arr, group_starts, offset):
 # it can hit for distinct functions that are similar. The lru_cache wrapping
 # compile_udf misses for these similar functions, but doesn't need to serialize
 # closure variables to check for a hit.
-_udf_code_cache = cachetools.LRUCache(maxsize=32)
+_udf_code_cache: cachetools.LRUCache = cachetools.LRUCache(maxsize=32)
 
 
 def compile_udf(udf, type_signature):