Skip to content

Commit

Permalink
Add __repr__ for Column and ColumnAccessor (#7531)
Browse files Browse the repository at this point in the history
## Summary:

* Add a `__repr__` for Column (thin wrapper around the `__repr__` of the underlying pa.Array)
* Add a `__repr__` for ColumnAccessor (similar to pa.Table, shows the names/types of the columns of the ColumnAccessor)

## Additional info:

Debugging is sometimes painful because columns and column accessors have no useful `__repr__`. For example, here is what a `ColumnAccessor` and a `Column` currently look like when printed:

```python
In [2]: cudf.DataFrame({'a': [1, 2, 3], "b": [4, 5, 6], "z_1": [2, 3, 4]})._data
Out[2]: ColumnAccessor(OrderedColumnDict([('a', <cudf.core.column.numerical.NumericalColumn object at 0x7f0306336f80>), ('b', <cudf.core.column.numerical.NumericalColumn object at 0x7f03062a05f0>), ('z_1', <cudf.core.column.numerical.NumericalColumn object at 0x7f03062a0e60>)]), multiindex=False, level_names=(None,))

In [3]: cudf.Series([1, 2, None, 3])._column
Out[3]: <cudf.core.column.numerical.NumericalColumn at 0x7f2190746710>
```

After this PR:

```python
In [2]: cudf.DataFrame({'a': [1, 2, 3], "b": [4, 5, 6], "z_1": [2, 3, 4]})._data
Out[2]:
ColumnAccessor(multiindex=False, level_names=(None,))
a: int64
b: int64
z_1: int64

In [3]: cudf.Series([1, 2, None, 3])._column
Out[3]:
<cudf.core.column.numerical.NumericalColumn object at 0x7f3e90c2ac20>
[
  1,
  2,
  null,
  3
]
dtype: int64
```

Authors:
  - Ashwin Srinath (@shwina)

Approvers:
  - Keith Kraus (@kkraus14)

URL: #7531
  • Loading branch information
shwina authored Mar 12, 2021
1 parent 365e649 commit 5c4fa28
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 10 deletions.
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,13 @@ def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray":
def __len__(self) -> int:
    """Return the length of this object, as reported by ``self.size``."""
    length = self.size
    return length

def __repr__(self):
    """Return a multi-line debugging representation of this column.

    The result is three newline-separated parts:

    1. the default ``object.__repr__`` of the column,
    2. the string form of its Arrow conversion
       (``self.to_arrow().to_string()``),
    3. a trailing ``dtype: <dtype>`` line.
    """
    header = object.__repr__(self)
    values = self.to_arrow().to_string()
    dtype_line = f"dtype: {self.dtype}"
    return f"{header}\n{values}\n{dtype_line}"

def to_pandas(
self, index: ColumnLike = None, nullable: bool = False, **kwargs
) -> "pd.Series":
Expand Down
16 changes: 8 additions & 8 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,15 @@ def __len__(self) -> int:
return len(self._data)

def __repr__(self) -> str:
# NOTE(review): this is a rendered diff whose +/- markers and indentation
# were stripped, so the old and new bodies of this method appear back to
# back. The file header above reports 8 additions & 8 deletions.
#
# Presumably the REMOVED body: it embeds the full repr of self._data
# (and therefore of every column object) into a single line.
data_repr = self._data.__repr__()
multiindex_repr = self.multiindex.__repr__()
level_names_repr = self.level_names.__repr__()
return "{}({}, multiindex={}, level_names={})".format(
self.__class__.__name__,
data_repr,
multiindex_repr,
level_names_repr,
# Presumably the ADDED body starts here: a header line made of the class
# name, the multiindex flag and the level names.
type_info = (
f"{self.__class__.__name__}("
f"multiindex={self.multiindex}, "
f"level_names={self.level_names})"
# This closing parenthesis appears only once; it looks like an unchanged
# context line shared by the old `.format(` call and the new
# `type_info = (` expression -- confirm against the original diff.
)
# One "name: dtype" line per (name, column) pair yielded by self.items().
column_info = "\n".join(
[f"{name}: {col.dtype}" for name, col in self.items()]
)
# Header line first, then the per-column lines, separated by a newline.
return f"{type_info}\n{column_info}"

@property
def level_names(self) -> Tuple[Any, ...]:
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/utils/cudautils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
from pickle import dumps

import cachetools
import cupy
import numpy as np
from numba import cuda
from pickle import dumps

import cudf
from cudf.utils.utils import check_equals_float, check_equals_int
Expand Down Expand Up @@ -239,7 +240,7 @@ def grouped_window_sizes_from_offset(arr, group_starts, offset):
# it can hit for distinct functions that are similar. The lru_cache wrapping
# compile_udf misses for these similar functions, but doesn't need to serialize
# closure variables to check for a hit.
_udf_code_cache = cachetools.LRUCache(maxsize=32)
_udf_code_cache: cachetools.LRUCache = cachetools.LRUCache(maxsize=32)


def compile_udf(udf, type_signature):
Expand Down

0 comments on commit 5c4fa28

Please sign in to comment.