Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): support DataFrame export to numpy structured/record arrays #8628

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 58 additions & 11 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1964,12 +1964,18 @@ def to_dicts(self) -> list[dict[str, Any]]:
"""
return list(self.iter_rows(named=True))

def to_numpy(self, structured: bool = False) -> np.ndarray[Any, Any]:
    """
    Convert DataFrame to a 2D NumPy array, or to a 1D structured array.

    This operation clones data.

    Parameters
    ----------
    structured
        Optionally return a structured array, with field names and
        dtypes that correspond to the DataFrame schema.

    Returns
    -------
    numpy.ndarray
        A 2D array with one row per DataFrame row when ``structured`` is
        False; otherwise a 1D structured array of length ``len(self)`` with
        one named field per column.

    Notes
    -----
    If you're attempting to convert Utf8 to an array you'll need to install
    ``pyarrow``.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": [1, 2, 3],
    ...         "bar": [6.5, 7.0, 8.5],
    ...         "ham": ["a", "b", "c"],
    ...     },
    ...     schema_overrides={"foo": pl.UInt8, "bar": pl.Float32},
    ... )

    Export to a standard 2D numpy array.

    >>> df.to_numpy()
    array([[1, 6.5, 'a'],
           [2, 7.0, 'b'],
           [3, 8.5, 'c']], dtype=object)

    Export to a structured array, which can better-preserve individual
    column data, such as name and dtype...

    >>> df.to_numpy(structured=True)
    array([(1, 6.5, 'a'), (2, 7. , 'b'), (3, 8.5, 'c')],
          dtype=[('foo', 'u1'), ('bar', '<f4'), ('ham', '<U1')])

    ...optionally zero-copying as a record array view:

    >>> import numpy as np
    >>> df.to_numpy(structured=True).view(np.recarray)
    rec.array([(1, 6.5, 'a'), (2, 7. , 'b'), (3, 8.5, 'c')],
              dtype=[('foo', 'u1'), ('bar', '<f4'), ('ham', '<U1')])

    """
    if not structured:
        out = self._df.to_numpy()
        if out is None:
            # the native export produced no array: convert column by column,
            # then transpose so that rows of the result are DataFrame rows
            return np.vstack(
                [self.to_series(i).to_numpy() for i in range(self.width)]
            ).T
        return out

    # see: https://numpy.org/doc/stable/user/basics.rec.html
    arrays = []
    for name, tp in self.schema.items():
        s = self[name]
        a = s.to_numpy()
        # null-free Utf8 columns are cast to a numpy unicode dtype; columns
        # that carry a validity mask keep the array exactly as exported
        if tp == Utf8 and not s.has_validity():
            a = a.astype(str, copy=False)
        arrays.append(a)

    # allocate the record buffer once, then fill it one named field at a time
    out = np.empty(
        len(self),
        dtype=[(name, a.dtype) for name, a in zip(self.columns, arrays)],
    )
    for name, a in zip(self.columns, arrays):
        out[name] = a
    return out

def to_pandas( # noqa: D417
self,
Expand Down
57 changes: 55 additions & 2 deletions py-polars/tests/unit/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
import numpy as np
import pyarrow as pa
import pytest
from numpy.testing import assert_array_equal
from numpy.testing import assert_array_equal, assert_equal

import polars as pl
from polars.datatypes import DTYPE_TEMPORAL_UNITS, INTEGER_DTYPES
from polars.datatypes import DTYPE_TEMPORAL_UNITS, FLOAT_DTYPES, INTEGER_DTYPES
from polars.testing import (
assert_frame_equal,
assert_frame_not_equal,
Expand Down Expand Up @@ -1401,14 +1401,67 @@ def test_assign() -> None:

def test_to_numpy() -> None:
    frame = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})

    # default export: a single float64 matrix in column-major layout
    plain = frame.to_numpy()
    assert_array_equal(
        plain,
        np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]], dtype=np.float64),
    )
    assert plain.flags["F_CONTIGUOUS"] is True

    # structured export: one named field per column, each with its own dtype
    structured = frame.to_numpy(structured=True)
    expected_structured = np.array(
        [(1, 1.0), (2, 2.0), (3, 3.0)], dtype=[("a", "<i8"), ("b", "<f8")]
    )
    assert_array_equal(structured, expected_structured)
    assert structured.flags["F_CONTIGUOUS"] is True


def test_to_numpy_structured() -> None:
    # round-trip structured array: validate init/export
    record_dtype = np.dtype(
        [
            ("product", "U24"),
            ("price_usd", "float64"),
            ("in_stock", "bool"),
        ]
    )
    records = [
        ("Google Pixel 7", 521.90, True),
        ("Apple iPhone 14 Pro", 999.00, True),
        ("OnePlus 11", 699.00, True),
        ("Samsung Galaxy S23 Ultra", 1199.99, False),
    ]
    structured_array = np.array(records, dtype=record_dtype)

    df = pl.from_numpy(structured_array)
    assert df.schema == {
        "product": pl.Utf8,
        "price_usd": pl.Float64,
        "in_stock": pl.Boolean,
    }
    exported = df.to_numpy(structured=True)
    assert exported["product"].dtype == np.dtype("U24")
    assert_array_equal(exported, structured_array)

    # none/nan values
    df = pl.DataFrame({"x": ["a", None, "b"], "y": [5.5, None, -5.5]})
    exported = df.to_numpy(structured=True)

    assert exported.dtype == np.dtype([("x", object), ("y", float)])
    for name in df.columns:
        column = df[name]
        # float columns are compared after replacing nulls with NaN;
        # all other columns are compared as they are
        if df.schema[name] in FLOAT_DTYPES:
            column = column.fill_null(float("nan"))
        assert_equal(list(exported[name]), column.to_list())


def test__array__() -> None:
df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})

out_array = np.asarray(df.to_numpy())
expected_array = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]], dtype=np.float64)
assert_array_equal(out_array, expected_array)
Expand Down
6 changes: 3 additions & 3 deletions py-polars/tests/unit/test_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,9 +719,9 @@ def test_from_pyarrow_chunked_array() -> None:


def test_numpy_preserve_uint64_4112() -> None:
    # the hashed column must keep its uint64 dtype through both the plain
    # and the structured numpy export
    df = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").hash())
    assert df.to_numpy().dtype == np.dtype("uint64")
    assert df.to_numpy(structured=True).dtype == np.dtype([("a", "uint64")])


def test_view_ub() -> None:
Expand Down