feat(python): support transparent DataFrame init from numpy structured/record arrays. #8620

Merged
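In short: a DataFrame can now be initialised directly from a numpy structured array (or a record-array view of one), with each named field becoming a column. A minimal sketch of the new behaviour, with illustrative field names, based on the tests added below:

import numpy as np
import polars as pl

# a structured array: one named, typed field per column
arr = np.array(
    [("x", 1.0, True), ("y", 2.5, False)],
    dtype=[("label", "U8"), ("value", "float64"), ("flag", "bool")],
)

# field names and dtypes carry over transparently
df = pl.DataFrame(arr)
# expected schema: {"label": Utf8, "value": Float64, "flag": Boolean}

# a record-array view of the same data takes the same path
df_rec = pl.DataFrame(arr.view(np.recarray))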
py-polars/polars/utils/_construction.py (53 additions, 30 deletions)
@@ -1120,54 +1120,77 @@ def numpy_to_pydf(
orient: Orientation | None = None,
nan_to_null: bool = False,
) -> PyDataFrame:
"""Construct a PyDataFrame from a numpy ndarray."""
"""Construct a PyDataFrame from a numpy ndarray (including structured ndarrays)."""
shape = data.shape

# Unpack columns
if shape == (0,):
n_columns = 0
if data.dtype.names is not None:
structured_array, orient = True, "col"
record_names = list(data.dtype.names)
n_columns = len(record_names)
for nm in record_names:
shape = data[nm].shape
if len(data[nm].shape) > 2:
raise ValueError(
f"Cannot create DataFrame from structured array with elements > 2D; shape[{nm!r}] = {shape}"
)
if not schema:
schema = record_names
else:
# Unpack columns
structured_array, record_names = False, []
if shape == (0,):
n_columns = 0

elif len(shape) == 1:
n_columns = 1
elif len(shape) == 1:
n_columns = 1

elif len(shape) == 2:
# default convention
# first axis is rows, second axis is columns
if orient is None and schema is None:
n_columns = shape[1]
orient = "row"
elif len(shape) == 2:
if orient is None and schema is None:
# default convention; first axis is rows, second axis is columns
n_columns = shape[1]
orient = "row"

# Infer orientation if columns argument is given
elif orient is None and schema is not None:
if len(schema) == shape[0]:
orient = "col"
elif orient is None and schema is not None:
# infer orientation from 'schema' param
if len(schema) == shape[0]:
orient = "col"
n_columns = shape[0]
else:
orient = "row"
n_columns = shape[1]

elif orient == "row":
n_columns = shape[1]
elif orient == "col":
n_columns = shape[0]
else:
orient = "row"
n_columns = shape[1]

elif orient == "row":
n_columns = shape[1]
elif orient == "col":
n_columns = shape[0]
raise ValueError(
f"orient must be one of {{'col', 'row', None}}; found {orient!r} instead."
)
else:
raise ValueError(
f"orient must be one of {{'col', 'row', None}}, got {orient} instead."
f"Cannot create DataFrame from array with more than two dimensions; shape = {shape}"
)
else:
raise ValueError(
"Cannot create DataFrame from numpy array with more than two dimensions."
)

if schema is not None and len(schema) != n_columns:
raise ValueError("Dimensions of columns arg must match data dimensions.")
raise ValueError("Dimensions of 'schema' arg must match data dimensions.")

column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=n_columns
)

# Convert data to series
if shape == (0,):
if structured_array:
data_series = [
pl.Series(
name=series_name,
values=data[record_name],
dtype=schema_overrides.get(record_name),
nan_to_null=nan_to_null,
)._s
for series_name, record_name in zip(column_names, record_names)
]
elif shape == (0,):
data_series = []

elif len(shape) == 1:
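For reference, the construction logic above boils down to three steps: a structured array is recognised by data.dtype.names being non-None, each named field is extracted with data[name] as one column of values, and the field names stand in for the schema when none is supplied. A rough standalone sketch of that column extraction, outside of Polars internals (the helper name is hypothetical):

import numpy as np

def split_structured(arr):
    """Hypothetical helper: map each named field of a structured array to its column values."""
    if arr.dtype.names is None:
        raise TypeError("expected a structured (or record) array")
    # arr[name] is a 1-D view of that field's values -- one column per field
    return {name: arr[name] for name in arr.dtype.names}

arr = np.array([(1, 2.0), (3, 4.0)], dtype=[("a", "int64"), ("b", "float64")])
split_structured(arr)  # {"a": array([1, 3]), "b": array([2., 4.])}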
py-polars/requirements-dev.txt (1 addition, 1 deletion)
@@ -15,7 +15,7 @@ tzdata; platform_system == 'Windows'
xlsx2csv
XlsxWriter
adbc_driver_sqlite; python_version >= '3.9' and platform_system != 'Windows'
connectorx==0.3.2a2; python_version >= '3.8' # Latest full release is broken - unpin when 0.3.2 released
connectorx==0.3.2a5; python_version >= '3.8' # Latest full release is broken - unpin when 0.3.2 released

# Tooling
hypothesis==6.75.1
py-polars/tests/unit/test_interop.py (50 additions, 0 deletions)
@@ -386,6 +386,56 @@ def test_from_numpy() -> None:
assert df.schema == {"a": pl.UInt32, "b": pl.UInt32}


def test_from_numpy_structured() -> None:
test_data = [
("Google Pixel 7", 521.90, True),
("Apple iPhone 14 Pro", 999.00, True),
("Samsung Galaxy S23 Ultra", 1199.99, False),
("OnePlus 11", 699.00, True),
]
# create a numpy structured array...
arr_structured = np.array(
test_data,
dtype=np.dtype(
[
("product", "U32"),
("price_usd", "float64"),
("in_stock", "bool"),
]
),
)
# ...and also establish as a record array view
arr_records = arr_structured.view(np.recarray)

# confirm that we can cleanly initialise a DataFrame from both,
# respecting the native dtypes and any schema overrides, etc.
for arr in (arr_structured, arr_records):
df = pl.DataFrame(data=arr).sort(by="price_usd", descending=True)

assert df.schema == {
"product": pl.Utf8,
"price_usd": pl.Float64,
"in_stock": pl.Boolean,
}
assert df.rows() == sorted(test_data, key=lambda row: -row[1])

for df in (
pl.DataFrame(
data=arr, schema=["phone", ("price_usd", pl.Float32), "available"]
),
pl.DataFrame(
data=arr,
schema=["phone", "price_usd", "available"],
schema_overrides={"price_usd": pl.Float32},
),
):
assert df.schema == {
"phone": pl.Utf8,
"price_usd": pl.Float32,
"available": pl.Boolean,
}


def test_from_arrow() -> None:
data = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
df = pl.from_arrow(data)
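Worth noting: the test covers record arrays only as a .view(np.recarray) of the structured array, which changes attribute access but not the underlying dtype, so it exercises the same data.dtype.names path. A record array built directly should behave the same way; a small hedged example with illustrative field names:

import numpy as np
import polars as pl

# build a record array directly rather than as a view of a structured array
rec = np.rec.fromrecords([("a", 1), ("b", 2)], names=["key", "count"])

# rec.dtype.names == ("key", "count"), so DataFrame init takes the structured-array path
df = pl.DataFrame(rec)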