Skip to content

Commit

Permalink
feat(python): support transparent DataFrame init from numpy structured arrays and record arrays.
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed May 1, 2023
1 parent 03ef717 commit e586753
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 31 deletions.
83 changes: 53 additions & 30 deletions py-polars/polars/utils/_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,54 +1120,77 @@ def numpy_to_pydf(
orient: Orientation | None = None,
nan_to_null: bool = False,
) -> PyDataFrame:
"""Construct a PyDataFrame from a numpy ndarray."""
"""Construct a PyDataFrame from a numpy ndarray (including structured ndarrays)."""
shape = data.shape

# Unpack columns
if shape == (0,):
n_columns = 0
if data.dtype.names is not None:
structured_array, orient = True, "col"
record_names = list(data.dtype.names)
n_columns = len(record_names)
for nm in record_names:
shape = data[nm].shape
if len(data[nm].shape) > 2:
raise ValueError(
f"Cannot create DataFrame from structured array with elements > 2D; shape[{nm!r}] = {shape}"
)
if not schema:
schema = record_names
else:
# Unpack columns
structured_array, record_names = False, []
if shape == (0,):
n_columns = 0

elif len(shape) == 1:
n_columns = 1
elif len(shape) == 1:
n_columns = 1

elif len(shape) == 2:
# default convention
# first axis is rows, second axis is columns
if orient is None and schema is None:
n_columns = shape[1]
orient = "row"
elif len(shape) == 2:
if orient is None and schema is None:
# default convention; first axis is rows, second axis is columns
n_columns = shape[1]
orient = "row"

# Infer orientation if columns argument is given
elif orient is None and schema is not None:
if len(schema) == shape[0]:
orient = "col"
elif orient is None and schema is not None:
# infer orientation from 'schema' param
if len(schema) == shape[0]:
orient = "col"
n_columns = shape[0]
else:
orient = "row"
n_columns = shape[1]

elif orient == "row":
n_columns = shape[1]
elif orient == "col":
n_columns = shape[0]
else:
orient = "row"
n_columns = shape[1]

elif orient == "row":
n_columns = shape[1]
elif orient == "col":
n_columns = shape[0]
raise ValueError(
f"orient must be one of {{'col', 'row', None}}; found {orient!r} instead."
)
else:
raise ValueError(
f"orient must be one of {{'col', 'row', None}}, got {orient} instead."
f"Cannot create DataFrame from array with more than two dimensions; shape = {shape}"
)
else:
raise ValueError(
"Cannot create DataFrame from numpy array with more than two dimensions."
)

if schema is not None and len(schema) != n_columns:
raise ValueError("Dimensions of columns arg must match data dimensions.")
raise ValueError("Dimensions of 'schema' arg must match data dimensions.")

column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=n_columns
)

# Convert data to series
if shape == (0,):
if structured_array:
data_series = [
pl.Series(
name=series_name,
values=data[record_name],
dtype=schema_overrides.get(record_name),
nan_to_null=nan_to_null,
)._s
for series_name, record_name in zip(column_names, record_names)
]
elif shape == (0,):
data_series = []

elif len(shape) == 1:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ tzdata; platform_system == 'Windows'
xlsx2csv
XlsxWriter
adbc_driver_sqlite; python_version >= '3.9' and platform_system != 'Windows'
connectorx==0.3.2a2; python_version >= '3.8' # Latest full release is broken - unpin when 0.3.2 released
connectorx==0.3.2a5; python_version >= '3.8' # Latest full release is broken - unpin when 0.3.2 released

# Tooling
hypothesis==6.75.1
Expand Down
50 changes: 50 additions & 0 deletions py-polars/tests/unit/test_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,56 @@ def test_from_numpy() -> None:
assert df.schema == {"a": pl.UInt32, "b": pl.UInt32}


def test_from_numpy_structured() -> None:
    """DataFrame init from numpy structured arrays and their recarray views."""
    raw_rows = [
        ("Google Pixel 7", 521.90, True),
        ("Apple iPhone 14 Pro", 999.00, True),
        ("Samsung Galaxy S23 Ultra", 1199.99, False),
        ("OnePlus 11", 699.00, True),
    ]
    # build a structured array with explicit per-field dtypes...
    structured = np.array(
        raw_rows,
        dtype=np.dtype(
            [
                ("product", "U32"),
                ("price_usd", "float64"),
                ("in_stock", "bool"),
            ]
        ),
    )
    # ...and the equivalent record-array view over the same buffer
    records = structured.view(np.recarray)

    # both flavours should initialise a DataFrame transparently, keeping
    # the native numpy dtypes and honouring any schema overrides
    for source in (structured, records):
        frame = pl.DataFrame(data=source).sort(by="price_usd", descending=True)

        assert frame.schema == {
            "product": pl.Utf8,
            "price_usd": pl.Float64,
            "in_stock": pl.Boolean,
        }
        # prices are distinct, so descending sort matches the negated-key sort
        assert frame.rows() == sorted(raw_rows, key=lambda rec: rec[1], reverse=True)

        # overrides supplied inline in the schema vs via schema_overrides
        overridden = (
            pl.DataFrame(
                data=source, schema=["phone", ("price_usd", pl.Float32), "available"]
            ),
            pl.DataFrame(
                data=source,
                schema=["phone", "price_usd", "available"],
                schema_overrides={"price_usd": pl.Float32},
            ),
        )
        for frame in overridden:
            assert frame.schema == {
                "phone": pl.Utf8,
                "price_usd": pl.Float32,
                "available": pl.Boolean,
            }


def test_from_arrow() -> None:
data = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
df = pl.from_arrow(data)
Expand Down

0 comments on commit e586753

Please sign in to comment.