You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
When a dataframe column contains an ArrowExtensionArray whose type is an Arrow struct, certain dataframe operations referencing the column error out, such as grouping by it. The above minimal example causes the following traceback for me:
---------------------------------------------------------------------------
ArrowNotImplementedError Traceback (most recent call last)
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/groupby/groupby.py:1942, in GroupBy._agg_py_fallback(self, how, values, ndim, alt)
1941 try:
-> 1942 res_values = self._grouper.agg_series(ser, alt, preserve_dtype=True)
1943 except Exception as err:
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/groupby/ops.py:864, in BaseGrouper.agg_series(self, obj, func, preserve_dtype)
862 preserve_dtype = True
--> 864 result = self._aggregate_series_pure_python(obj, func)
866 npvalues = lib.maybe_convert_objects(result, try_float=False)
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/groupby/ops.py:877, in BaseGrouper._aggregate_series_pure_python(self, obj, func)
873 @final
874 def _aggregate_series_pure_python(
875 self, obj: Series, func: Callable
876 ) -> npt.NDArray[np.object_]:
--> 877 _, _, ngroups = self.group_info
879 result = np.empty(ngroups, dtype="O")
File properties.pyx:36, in pandas._libs.properties.CachedProperty.__get__()
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/groupby/ops.py:745, in BaseGrouper.group_info(self)
743 @cache_readonly
744 def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
--> 745 comp_ids, obs_group_ids = self._get_compressed_codes()
747 ngroups = len(obs_group_ids)
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/groupby/ops.py:769, in BaseGrouper._get_compressed_codes(self)
768 ping = self.groupings[0]
--> 769 return ping.codes, np.arange(len(ping._group_index), dtype=np.intp)
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/groupby/grouper.py:691, in Grouping.codes(self)
689 @property
690 def codes(self) -> npt.NDArray[np.signedinteger]:
--> 691 return self._codes_and_uniques[0]
File properties.pyx:36, in pandas._libs.properties.CachedProperty.__get__()
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/groupby/grouper.py:835, in Grouping._codes_and_uniques(self)
831 else:
832 # GH35667, replace dropna=False with use_na_sentinel=False
833 # error: Incompatible types in assignment (expression has type "Union[
834 # ndarray[Any, Any], Index]", variable has type "Categorical")
--> 835 codes, uniques = algorithms.factorize( # type: ignore[assignment]
836 self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
837 )
838 return codes, uniques
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/algorithms.py:779, in factorize(values, sort, use_na_sentinel, size_hint)
777 elif not isinstance(values, np.ndarray):
778 # i.e. ExtensionArray
--> 779 codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)
781 else:
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/array.py:1151, in ArrowExtensionArray.factorize(self, use_na_sentinel)
1150 else:
-> 1151 encoded = data.dictionary_encode(null_encoding=null_encoding)
1152 if encoded.length() == 0:
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pyarrow/table.pxi:657, in pyarrow.lib.ChunkedArray.dictionary_encode()
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pyarrow/_compute.pyx:598, in pyarrow._compute.call_function()
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pyarrow/_compute.pyx:393, in pyarrow._compute.Function.call()
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pyarrow/error.pxi:155, in pyarrow.lib.pyarrow_internal_check_status()
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()
ArrowNotImplementedError: Function 'dictionary_encode' has no kernel matching input types (struct<x: string, y: int64>)
The above exception was the direct cause of the following exception:
ArrowNotImplementedError Traceback (most recent call last)
Cell In[135], line 4
2 array = pa.StructArray.from_arrays([['a','b'], [1,2]], ('x','y'))
3 df = pd.DataFrame({'struct': pd.Series(array, dtype=pd.ArrowDtype(array.type)), 'val': range(2)})
----> 4 df.groupby('struct').mean()
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/groupby/groupby.py:2452, in GroupBy.mean(self, numeric_only, engine, engine_kwargs)
2445 return self._numba_agg_general(
2446 grouped_mean,
2447 executor.float_dtype_mapping,
2448 engine_kwargs,
2449 min_periods=0,
2450 )
2451 else:
-> 2452 result = self._cython_agg_general(
2453 "mean",
2454 alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
2455 numeric_only=numeric_only,
2456 )
2457 return result.__finalize__(self.obj, method="groupby")
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/groupby/groupby.py:1998, in GroupBy._cython_agg_general(self, how, alt, numeric_only, min_count, **kwargs)
1995 result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
1996 return result
-> 1998 new_mgr = data.grouped_reduce(array_func)
1999 res = self._wrap_agged_manager(new_mgr)
2000 if how in ["idxmin", "idxmax"]:
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/internals/managers.py:1472, in BlockManager.grouped_reduce(self, func)
1470 result_blocks = extend_blocks(applied, result_blocks)
1471 else:
-> 1472 applied = blk.apply(func)
1473 result_blocks = extend_blocks(applied, result_blocks)
1475 if len(result_blocks) == 0:
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/internals/blocks.py:393, in Block.apply(self, func, **kwargs)
387 @final
388 def apply(self, func, **kwargs) -> list[Block]:
389 """
390 apply the function to my values; return a block if we are not
391 one
392 """
--> 393 result = func(self.values, **kwargs)
395 result = maybe_coerce_values(result)
396 return self._split_op_result(result)
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/groupby/groupby.py:1995, in GroupBy._cython_agg_general.<locals>.array_func(values)
1992 return result
1994 assert alt is not None
-> 1995 result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
1996 return result
File ~/peopyl_density/.venv/lib/python3.12/site-packages/pandas/core/groupby/groupby.py:1946, in GroupBy._agg_py_fallback(self, how, values, ndim, alt)
1944 msg = f"agg function failed [how->{how},dtype->{ser.dtype}]"
1945 # preserve the kind of exception that raised
-> 1946 raise type(err)(msg) from err
1948 if ser.dtype == object:
1949 res_values = res_values.astype(object, copy=False)
ArrowNotImplementedError: agg function failed [how->mean,dtype->int64]
Feel free to let me know if this is more due to an issue with Arrow functionality that pandas relies on. If this can't/shouldn't be solved on the pandas side I can discuss on the Arrow issue tracker.
Expected Behavior
I would generally expect grouping a dataframe on a column to succeed/work the same regardless of whether the dtype is e.g. numpy-based or a pyarrow primitive type rather than a struct.
Installed Versions
INSTALLED VERSIONS
commit : 0691c5c
python : 3.12.2
python-bits : 64
OS : Linux
OS-release : 5.15.167.4-microsoft-standard-WSL2
Version : #1 SMP Tue Nov 5 00:21:55 UTC 2024
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : C.UTF-8
LOCALE : C.UTF-8
Thanks for the report. A minimum requirement for groupby (and unique / drop_duplicates) is that the values are hashable. pandas uses hashes to be able to efficiently perform these operations. Based on the stack above, pandas is relying on PyArrow's dictionary_encode, which is saying this dtype is not supported. As such, I think pandas is not able to support this dtype in groupby. If there is an efficient alternative provided by PyArrow, we could possibly use that instead.
But we should make this error clearer, and raise it higher up in the call stack. PRs to improve this are welcome!
Pandas version checks
I have checked that this issue has not already been reported.
I have confirmed this bug exists on the latest version of pandas.
I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
Issue Description
When a dataframe column contains an ArrowExtensionArray whose type is an Arrow struct, certain dataframe operations referencing the column error out, such as grouping by it. The above minimal example causes the following traceback for me:
Feel free to let me know if this is more due to an issue with Arrow functionality that pandas relies on. If this can't/shouldn't be solved on the pandas side I can discuss on the Arrow issue tracker.
Expected Behavior
I would generally expect grouping a dataframe on a column to succeed/work the same regardless of whether the dtype is e.g. numpy-based or a pyarrow primitive type rather than a struct.
Installed Versions
INSTALLED VERSIONS
commit : 0691c5c
python : 3.12.2
python-bits : 64
OS : Linux
OS-release : 5.15.167.4-microsoft-standard-WSL2
Version : #1 SMP Tue Nov 5 00:21:55 UTC 2024
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : C.UTF-8
LOCALE : C.UTF-8
pandas : 2.2.3
numpy : 2.1.3
pytz : 2024.2
dateutil : 2.9.0.post0
pip : None
Cython : None
sphinx : None
IPython : 8.30.0
adbc-driver-postgresql: None
adbc-driver-sqlite : None
bs4 : 4.12.3
blosc : None
bottleneck : None
dataframe-api-compat : None
fastparquet : None
fsspec : None
html5lib : None
hypothesis : None
gcsfs : None
jinja2 : 3.1.4
lxml.etree : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
psycopg2 : None
pymysql : None
pyarrow : 18.1.0
pyreadstat : None
pytest : None
python-calamine : None
pyxlsb : None
s3fs : None
scipy : 1.14.1
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlsxwriter : None
zstandard : None
tzdata : 2024.2
qtpy : None
pyqt5 : None
The text was updated successfully, but these errors were encountered: