Interface with polars (#108)

* Convert `BiocFrame` objects to polars `DataFrame` and vice versa.
* Add method to flatten a nested `BiocFrame` object.
* Update tests and documentation.
BiocPy · Jun 10, 2024 · b5646a1 · b5646a1
1 parent 641fcd6
commit b5646a1
Show file tree

Hide file tree

Showing 6 changed files with 203 additions and 44 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -314,6 +314,7 @@
     "setuptools": ("https://setuptools.pypa.io/en/stable/", None),
     "pyscaffold": ("https://pyscaffold.org/en/stable", None),
     "biocutils": ("https://biocpy.github.io/BiocUtils", None),
+    "polars": ("https://docs.pola.rs/api/python/stable/", None),
 }
 
 print(f"loading configurations for {project} {version} ...", file=sys.stderr)
diff --git a/setup.cfg b/setup.cfg
@@ -62,13 +62,15 @@ exclude =
 # `pip install BiocFrame[PDF]` like:
 optional =
     pandas
+    polars
 
 # Add here test requirements (semicolon/line-separated)
 testing =
     setuptools
     pytest
     pytest-cov
     pandas
+    polars
 
 [options.entry_points]
 # Add here console scripts like:

diff --git a/src/biocframe/BiocFrame.py b/src/biocframe/BiocFrame.py
@@ -1166,20 +1166,14 @@ def to_pandas(self):
         """Convert the ``BiocFrame`` into a :py:class:`~pandas.DataFrame` object.
 
         Returns:
-            A :py:class:`~pandas.DataFrame` object.
+            A :py:class:`~pandas.DataFrame` object. Column names of the resulting
+            dataframe may be different if the `BiocFrame` is nested.
         """
         from pandas import DataFrame
 
         if len(self.column_names) > 0:
-            _data_copy = OrderedDict()
-            for col in self.column_names:
-                _data_copy[col] = self.column(col)
-                if isinstance(self.column(col), ut.Factor):
-                    _data_copy[col] = _data_copy[col].to_pandas()
-
-            return DataFrame(
-                data=_data_copy, index=self._row_names, columns=self._column_names
-            )
+            _data_copy = self.flatten(as_type="dict")
+            return DataFrame(data=_data_copy, index=self._row_names)
         else:
             return DataFrame(data={}, index=range(self._number_of_rows))
 
@@ -1208,10 +1202,92 @@ def from_pandas(cls, input: "pandas.DataFrame") -> "BiocFrame":
 
         return cls(data=rdata, row_names=rindex, column_names=input.columns.to_list())
 
+    ################################
+    ######>> polars interop <<######
+    ################################
+
+    @classmethod
+    def from_polars(cls, input: "polars.DataFrame") -> "BiocFrame":
+        """Create a ``BiocFrame`` from a :py:class:`~polars.DataFrame` object.
+
+        Args:
+            input:
+                Input data.
+
+        Returns:
+            A ``BiocFrame`` object.
+        """
+
+        from polars import DataFrame
+
+        if not isinstance(input, DataFrame):
+            raise TypeError("`data` is not a polars `DataFrame` object.")
+
+        rdata = input.to_dict(as_series=False)
+
+        return cls(data=rdata)
+
+    def to_polars(self):
+        """Convert the ``BiocFrame`` into a :py:class:`~polars.DataFrame` object.
+
+        Returns:
+            A :py:class:`~polars.DataFrame` object. Column names of the resulting
+            dataframe may be different if the `BiocFrame` is nested.
+        """
+        from polars import DataFrame
+
+        if len(self.column_names) > 0:
+            _data_copy = self.flatten(as_type="dict")
+            return DataFrame(data=_data_copy)
+        else:
+            return DataFrame(data={})
+
     ###############################
     ######>> Miscellaneous <<######
     ###############################
 
+    def flatten(
+        self, as_type: Literal["dict", "biocframe"] = "dict", delim: str = "."
+    ) -> "BiocFrame":
+        """Flatten a nested BiocFrame object.
+
+        Args:
+            as_type:
+                Return type of the result. Either a :py:class:`~dict` or a
+                :py:class:`~biocframe.BiocFrame.BiocFrame` object.
+
+            delim:
+                Delimiter to join nested column names. Defaults to `"."`.
+
+        Returns:
+            An object with the type specified by ``as_type`` argument.
+            If ``as_type`` is `dict`, an additional column "rownames" is added if the object
+            contains rownames.
+        """
+
+        if as_type not in ["dict", "biocframe"]:
+            raise ValueError("'as_type' must be either 'dict' or 'biocframe'.")
+
+        _data_copy = OrderedDict()
+        for col in list(self.get_column_names()):
+            _cold = self.column(col)
+            if isinstance(_cold, BiocFrame):
+                _res = _cold.flatten(as_type=as_type)
+                for k in _res.keys():
+                    _data_copy[f"{col}{delim}{k}"] = _res[k]
+            elif isinstance(_cold, ut.Factor):
+                _data_copy[col] = _cold.to_pandas()
+            else:
+                _data_copy[col] = _cold
+
+        if as_type == "biocframe":
+            return BiocFrame(_data_copy, row_names=self._row_names)
+
+        if self._row_names is not None:
+            _data_copy["rownames"] = self._row_names
+
+        return _data_copy
+
     def combine(self, *other):
         """Wrapper around :py:func:`~relaxed_combine_rows`, provided for back-compatibility only."""
         return relaxed_combine_rows([self] + other)
@@ -1227,6 +1303,8 @@ def __array_ufunc__(self, func, method, *inputs, **kwargs) -> "BiocFrame":
             An object with the same type as the caller.
         """
 
+        warn("Not all NumPy array methods are fully tested.", UserWarning)
+
         from pandas import Series
         from pandas.api.types import is_numeric_dtype
 

diff --git a/tests/test_methods.py b/tests/test_methods.py
@@ -536,40 +536,6 @@ def test_nested_biocFrame_preserve_types():
     assert isinstance(sliced.column("column2"), np.ndarray)
 
 
-def test_export_pandas():
-    obj = BiocFrame(
-        {
-            "column1": [1, 2, 3],
-            "nested": BiocFrame(
-                {
-                    "ncol1": [4, 5, 6],
-                    "ncol2": ["a", "b", "c"],
-                    "deep": ["j", "k", "l"],
-                }
-            ),
-            "column2": np.array([1, 2, 3]),
-        }
-    )
-
-    pdf = obj.to_pandas()
-    assert pdf is not None
-    assert isinstance(pdf, pd.DataFrame)
-    assert len(pdf) == len(obj)
-    assert len(set(pdf.columns).difference(obj.colnames)) == 0
-
-    obj["factor"] = Factor([0, 2, 1], levels=["A", "B", "C"])
-    pdf = obj.to_pandas()
-    assert pdf is not None
-    assert isinstance(pdf, pd.DataFrame)
-    assert len(pdf) == len(obj)
-    assert len(set(pdf.columns).difference(obj.colnames)) == 0
-    assert pdf["factor"] is not None
-
-    emptyobj = BiocFrame(number_of_rows=100)
-    pdf = emptyobj.to_pandas()
-    assert len(pdf) == len(emptyobj)
-
-
 def test_names_generics():
     obj = BiocFrame(
         {

diff --git a/tests/test_pandas.py b/tests/test_pandas.py
@@ -0,0 +1,43 @@
+import numpy as np
+import pytest
+import pandas as pd
+from biocframe.BiocFrame import BiocFrame
+from biocutils import Factor
+
+__author__ = "jkanche"
+__copyright__ = "jkanche"
+__license__ = "MIT"
+
+
+def test_export_pandas():
+    obj = BiocFrame(
+        {
+            "column1": [1, 2, 3],
+            "nested": BiocFrame(
+                {
+                    "ncol1": [4, 5, 6],
+                    "ncol2": ["a", "b", "c"],
+                    "deep": ["j", "k", "l"],
+                }
+            ),
+            "column2": np.array([1, 2, 3]),
+        }
+    )
+
+    pdf = obj.to_pandas()
+    assert pdf is not None
+    assert isinstance(pdf, pd.DataFrame)
+    assert len(pdf) == len(obj)
+    assert len(set(pdf.columns).difference(obj.colnames)) == 3
+
+    obj["factor"] = Factor([0, 2, 1], levels=["A", "B", "C"])
+    pdf = obj.to_pandas()
+    assert pdf is not None
+    assert isinstance(pdf, pd.DataFrame)
+    assert len(pdf) == len(obj)
+    assert len(set(pdf.columns).difference(obj.colnames)) == 3
+    assert pdf["factor"] is not None
+
+    emptyobj = BiocFrame(number_of_rows=100)
+    pdf = emptyobj.to_pandas()
+    assert len(pdf) == len(emptyobj)
diff --git a/tests/test_polars.py b/tests/test_polars.py
@@ -0,0 +1,69 @@
+import pytest
+
+import polars as pl
+from biocframe import BiocFrame
+from biocutils import Names
+
+__author__ = "jkanche"
+__copyright__ = "jkanche"
+__license__ = "MIT"
+
+
+def test_from_polars():
+    obj = pl.DataFrame(
+        {
+            "a": [None, 2, 3, 4],
+            "b": [0.5, None, 2.5, 13],
+            "c": [True, True, False, None],
+        }
+    )
+
+    bframe = BiocFrame.from_polars(obj)
+    assert bframe is not None
+    assert isinstance(bframe.get_column_names(), Names)
+    assert list(bframe.get_column_names()) == ["a", "b", "c"]
+
+
+def test_to_polars():
+    obj = BiocFrame(
+        {
+            "a": [None, 2, 3, 4],
+            "b": [0.5, None, 2.5, 13],
+            "c": [True, True, False, None],
+        }
+    )
+
+    plframe = obj.to_polars()
+    assert plframe is not None
+    assert isinstance(plframe, pl.DataFrame)
+    assert plframe.columns == ["a", "b", "c"]
+
+
+def test_to_polars_nested():
+    obj = BiocFrame(
+        {
+            "a": [None, 2, 3],
+            "b": [0.5, None, 2.5],
+            "c": [True, True, False],
+            "nested": BiocFrame(
+                {
+                    "ncol1": [4, 5, 6],
+                    "ncol2": ["a", "b", "c"],
+                    "deep": ["j", "k", "l"],
+                }
+            ),
+        }
+    )
+
+    plframe = obj.to_polars()
+    assert plframe is not None
+    assert isinstance(plframe, pl.DataFrame)
+    assert len(plframe.columns) == 6
+    assert plframe.columns == [
+        "a",
+        "b",
+        "c",
+        "nested.ncol1",
+        "nested.ncol2",
+        "nested.deep",
+    ]