Skip to content

Commit

Permalink
feat(python): support transparent DataFrame init from numpy structured arrays and record arrays.
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed May 1, 2023
1 parent 03ef717 commit e586753
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 31 deletions.
83 changes: 53 additions & 30 deletions py-polars/polars/utils/_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,54 +1120,77 @@ def numpy_to_pydf(
orient: Orientation | None = None,
nan_to_null: bool = False,
) -> PyDataFrame:
"""Construct a PyDataFrame from a numpy ndarray."""
"""Construct a PyDataFrame from a numpy ndarray (including structured ndarrays)."""
shape = data.shape

# Unpack columns
if shape == (0,):
n_columns = 0
if data.dtype.names is not None:
structured_array, orient = True, "col"
record_names = list(data.dtype.names)
n_columns = len(record_names)
for nm in record_names:
shape = data[nm].shape
if len(data[nm].shape) > 2:
raise ValueError(
f"Cannot create DataFrame from structured array with elements > 2D; shape[{nm!r}] = {shape}"
)
if not schema:
schema = record_names
else:
# Unpack columns
structured_array, record_names = False, []
if shape == (0,):
n_columns = 0

elif len(shape) == 1:
n_columns = 1
elif len(shape) == 1:
n_columns = 1

elif len(shape) == 2:
# default convention
# first axis is rows, second axis is columns
if orient is None and schema is None:
n_columns = shape[1]
orient = "row"
elif len(shape) == 2:
if orient is None and schema is None:
# default convention; first axis is rows, second axis is columns
n_columns = shape[1]
orient = "row"

# Infer orientation if columns argument is given
elif orient is None and schema is not None:
if len(schema) == shape[0]:
orient = "col"
elif orient is None and schema is not None:
# infer orientation from 'schema' param
if len(schema) == shape[0]:
orient = "col"
n_columns = shape[0]
else:
orient = "row"
n_columns = shape[1]

elif orient == "row":
n_columns = shape[1]
elif orient == "col":
n_columns = shape[0]
else:
orient = "row"
n_columns = shape[1]

elif orient == "row":
n_columns = shape[1]
elif orient == "col":
n_columns = shape[0]
raise ValueError(
f"orient must be one of {{'col', 'row', None}}; found {orient!r} instead."
)
else:
raise ValueError(
f"orient must be one of {{'col', 'row', None}}, got {orient} instead."
f"Cannot create DataFrame from array with more than two dimensions; shape = {shape}"
)
else:
raise ValueError(
"Cannot create DataFrame from numpy array with more than two dimensions."
)

if schema is not None and len(schema) != n_columns:
raise ValueError("Dimensions of columns arg must match data dimensions.")
raise ValueError("Dimensions of 'schema' arg must match data dimensions.")

column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=n_columns
)

# Convert data to series
if shape == (0,):
if structured_array:
data_series = [
pl.Series(
name=series_name,
values=data[record_name],
dtype=schema_overrides.get(record_name),
nan_to_null=nan_to_null,
)._s
for series_name, record_name in zip(column_names, record_names)
]
elif shape == (0,):
data_series = []

elif len(shape) == 1:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ tzdata; platform_system == 'Windows'
xlsx2csv
XlsxWriter
adbc_driver_sqlite; python_version >= '3.9' and platform_system != 'Windows'
connectorx==0.3.2a2; python_version >= '3.8' # Latest full release is broken - unpin when 0.3.2 released
connectorx==0.3.2a5; python_version >= '3.8' # Latest full release is broken - unpin when 0.3.2 released

# Tooling
hypothesis==6.75.1
Expand Down
50 changes: 50 additions & 0 deletions py-polars/tests/unit/test_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,56 @@ def test_from_numpy() -> None:
assert df.schema == {"a": pl.UInt32, "b": pl.UInt32}


def test_from_numpy_structured() -> None:
    """DataFrame init from numpy structured arrays and their recarray views."""
    raw_rows = [
        ("Google Pixel 7", 521.90, True),
        ("Apple iPhone 14 Pro", 999.00, True),
        ("Samsung Galaxy S23 Ultra", 1199.99, False),
        ("OnePlus 11", 699.00, True),
    ]
    # build a structured array with explicit per-field dtypes...
    structured = np.array(
        raw_rows,
        dtype=np.dtype(
            [
                ("product", "U32"),
                ("price_usd", "float64"),
                ("in_stock", "bool"),
            ]
        ),
    )
    # ...and the equivalent record-array view over the same buffer
    records = structured.view(np.recarray)

    # both flavours should initialise a DataFrame transparently, keeping
    # the native numpy dtypes and honouring any schema overrides
    for source in (structured, records):
        frame = pl.DataFrame(data=source).sort(by="price_usd", descending=True)

        assert frame.schema == {
            "product": pl.Utf8,
            "price_usd": pl.Float64,
            "in_stock": pl.Boolean,
        }
        # prices are distinct, so descending sort matches the negated-key sort
        assert frame.rows() == sorted(raw_rows, key=lambda rec: rec[1], reverse=True)

        # overrides supplied inline in the schema vs via schema_overrides
        overridden = (
            pl.DataFrame(
                data=source, schema=["phone", ("price_usd", pl.Float32), "available"]
            ),
            pl.DataFrame(
                data=source,
                schema=["phone", "price_usd", "available"],
                schema_overrides={"price_usd": pl.Float32},
            ),
        )
        for frame in overridden:
            assert frame.schema == {
                "phone": pl.Utf8,
                "price_usd": pl.Float32,
                "available": pl.Boolean,
            }


def test_from_arrow() -> None:
data = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
df = pl.from_arrow(data)
Expand Down

0 comments on commit e586753

Please sign in to comment.