Skip to content

Commit

Permalink
Interface with polars (#108)
Browse files Browse the repository at this point in the history
* Convert `BiocFrame` objects to polars `DataFrame` and vice versa.
* Add method to flatten a nested `BiocFrame` object.
* Update tests and documentation.
  • Loading branch information
jkanche authored Jun 10, 2024
1 parent 641fcd6 commit b5646a1
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 44 deletions.
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@
"setuptools": ("https://setuptools.pypa.io/en/stable/", None),
"pyscaffold": ("https://pyscaffold.org/en/stable", None),
"biocutils": ("https://biocpy.github.io/BiocUtils", None),
"polars": ("https://docs.pola.rs/api/python/stable/", None),
}

print(f"loading configurations for {project} {version} ...", file=sys.stderr)
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,15 @@ exclude =
# `pip install BiocFrame[PDF]` like:
optional =
pandas
polars

# Add here test requirements (semicolon/line-separated)
testing =
setuptools
pytest
pytest-cov
pandas
polars

[options.entry_points]
# Add here console scripts like:
Expand Down
98 changes: 88 additions & 10 deletions src/biocframe/BiocFrame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1166,20 +1166,14 @@ def to_pandas(self):
"""Convert the ``BiocFrame`` into a :py:class:`~pandas.DataFrame` object.
Returns:
A :py:class:`~pandas.DataFrame` object.
A :py:class:`~pandas.DataFrame` object. Column names of the resulting
dataframe may be different if the `BiocFrame` is nested.
"""
from pandas import DataFrame

if len(self.column_names) > 0:
_data_copy = OrderedDict()
for col in self.column_names:
_data_copy[col] = self.column(col)
if isinstance(self.column(col), ut.Factor):
_data_copy[col] = _data_copy[col].to_pandas()

return DataFrame(
data=_data_copy, index=self._row_names, columns=self._column_names
)
_data_copy = self.flatten(as_type="dict")
return DataFrame(data=_data_copy, index=self._row_names)
else:
return DataFrame(data={}, index=range(self._number_of_rows))

Expand Down Expand Up @@ -1208,10 +1202,92 @@ def from_pandas(cls, input: "pandas.DataFrame") -> "BiocFrame":

return cls(data=rdata, row_names=rindex, column_names=input.columns.to_list())

################################
######>> polars interop <<######
################################

@classmethod
def from_polars(cls, input: "polars.DataFrame") -> "BiocFrame":
    """Create a ``BiocFrame`` from a :py:class:`~polars.DataFrame` object.

    Args:
        input:
            Input data.

    Returns:
        A ``BiocFrame`` object built from the columns of ``input``.

    Raises:
        TypeError:
            If ``input`` is not a polars ``DataFrame``.
    """
    # Imported inside the method so that importing this module does not
    # require polars to be installed.
    from polars import DataFrame

    if not isinstance(input, DataFrame):
        # The message names the actual parameter (``input``), not ``data``.
        raise TypeError("`input` is not a polars `DataFrame` object.")

    # ``as_series=False`` asks polars for plain Python values per column
    # instead of polars ``Series`` objects.
    rdata = input.to_dict(as_series=False)

    return cls(data=rdata)

def to_polars(self):
    """Convert the ``BiocFrame`` into a :py:class:`~polars.DataFrame` object.

    Returns:
        A :py:class:`~polars.DataFrame` object. The columns are taken from
        :py:meth:`~flatten` (``as_type="dict"``), so column names of the
        resulting dataframe may be different if the ``BiocFrame`` is nested.
    """
    from polars import DataFrame

    # No columns: hand back an empty polars frame.
    has_columns = len(self.column_names) > 0
    if not has_columns:
        return DataFrame(data={})

    flattened = self.flatten(as_type="dict")
    return DataFrame(data=flattened)

###############################
######>> Miscellaneous <<######
###############################

def flatten(
    self, as_type: Literal["dict", "biocframe"] = "dict", delim: str = "."
) -> "Union[dict, BiocFrame]":
    """Flatten a nested BiocFrame object.

    Columns that are themselves ``BiocFrame`` objects are expanded
    recursively; each expanded column is named
    ``<parent><delim><child>``.

    Args:
        as_type:
            Return type of the result. Either a :py:class:`~dict` or a
            :py:class:`~biocframe.BiocFrame.BiocFrame` object.

        delim:
            Delimiter to join nested column names. Defaults to `"."`.
            The same delimiter is used at every nesting level.

    Returns:
        An object with the type specified by ``as_type`` argument.

        If ``as_type`` is `dict`, an additional column "rownames" is added if the object
        contains rownames.

    Raises:
        ValueError:
            If ``as_type`` is neither ``"dict"`` nor ``"biocframe"``.
    """

    if as_type not in ["dict", "biocframe"]:
        raise ValueError("'as_type' must be either 'dict' or 'biocframe'.")

    _data_copy = OrderedDict()
    for col in list(self.get_column_names()):
        _cold = self.column(col)
        if isinstance(_cold, BiocFrame):
            # Pass ``delim`` down so deeper levels are not silently joined
            # with the default "." when the caller asked for something else.
            _res = _cold.flatten(as_type=as_type, delim=delim)

            if as_type == "biocframe":
                # ``_res`` is a BiocFrame here; read it through the same
                # column accessors used on ``self`` instead of relying on
                # dict-style ``keys()``.
                _nested = OrderedDict(
                    (k, _res.column(k)) for k in list(_res.get_column_names())
                )
            else:
                _nested = _res

            for k, v in _nested.items():
                _data_copy[f"{col}{delim}{k}"] = v
        elif isinstance(_cold, ut.Factor):
            _data_copy[col] = _cold.to_pandas()
        else:
            _data_copy[col] = _cold

    if as_type == "biocframe":
        return BiocFrame(_data_copy, row_names=self._row_names)

    # NOTE(review): this overwrites a real column that happens to be called
    # "rownames" - confirm whether that collision needs handling.
    if self._row_names is not None:
        _data_copy["rownames"] = self._row_names

    return _data_copy

def combine(self, *other):
    """Wrapper around :py:func:`~relaxed_combine_rows`, provided for back-compatibility only.

    Args:
        *other:
            Further objects to combine with this one, in the given order.

    Returns:
        The result of :py:func:`~relaxed_combine_rows` applied to this
        object followed by ``other``.
    """
    # ``other`` is a tuple, so ``[self] + other`` raised ``TypeError``
    # (list + tuple); unpack it into the list instead.
    # NOTE(review): this keeps the original call shape (one list argument).
    # If ``relaxed_combine_rows`` takes varargs, this should instead be
    # ``relaxed_combine_rows(self, *other)`` - confirm against its signature.
    return relaxed_combine_rows([self, *other])
Expand All @@ -1227,6 +1303,8 @@ def __array_ufunc__(self, func, method, *inputs, **kwargs) -> "BiocFrame":
An object with the same type as the caller.
"""

warn("Not all NumPy array methods are fully tested.", UserWarning)

from pandas import Series
from pandas.api.types import is_numeric_dtype

Expand Down
34 changes: 0 additions & 34 deletions tests/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,40 +536,6 @@ def test_nested_biocFrame_preserve_types():
assert isinstance(sliced.column("column2"), np.ndarray)


def test_export_pandas():
    """Export a nested ``BiocFrame`` (with and without a factor column) to pandas."""
    inner = BiocFrame(
        {
            "ncol1": [4, 5, 6],
            "ncol2": ["a", "b", "c"],
            "deep": ["j", "k", "l"],
        }
    )
    frame = BiocFrame(
        {
            "column1": [1, 2, 3],
            "nested": inner,
            "column2": np.array([1, 2, 3]),
        }
    )

    def _check_export(exported):
        # Shared expectations for both exports below.
        assert exported is not None
        assert isinstance(exported, pd.DataFrame)
        assert len(exported) == len(frame)
        assert len(set(exported.columns).difference(frame.colnames)) == 0

    _check_export(frame.to_pandas())

    frame["factor"] = Factor([0, 2, 1], levels=["A", "B", "C"])
    with_factor = frame.to_pandas()
    _check_export(with_factor)
    assert with_factor["factor"] is not None

    # A frame with rows but no columns keeps its length.
    no_columns = BiocFrame(number_of_rows=100)
    exported_empty = no_columns.to_pandas()
    assert len(exported_empty) == len(no_columns)


def test_names_generics():
obj = BiocFrame(
{
Expand Down
43 changes: 43 additions & 0 deletions tests/test_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import numpy as np
import pytest
import pandas as pd
from biocframe.BiocFrame import BiocFrame
from biocutils import Factor

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


def test_export_pandas():
    """Export a nested ``BiocFrame`` to pandas; nested columns show up under new names."""
    inner = BiocFrame(
        {
            "ncol1": [4, 5, 6],
            "ncol2": ["a", "b", "c"],
            "deep": ["j", "k", "l"],
        }
    )
    frame = BiocFrame(
        {
            "column1": [1, 2, 3],
            "nested": inner,
            "column2": np.array([1, 2, 3]),
        }
    )

    def _check_export(exported):
        # Shared expectations for both exports below. The three columns of
        # the nested frame are the ones not present in ``frame.colnames``.
        assert exported is not None
        assert isinstance(exported, pd.DataFrame)
        assert len(exported) == len(frame)
        assert len(set(exported.columns).difference(frame.colnames)) == 3

    _check_export(frame.to_pandas())

    frame["factor"] = Factor([0, 2, 1], levels=["A", "B", "C"])
    with_factor = frame.to_pandas()
    _check_export(with_factor)
    assert with_factor["factor"] is not None

    # A frame with rows but no columns keeps its length.
    no_columns = BiocFrame(number_of_rows=100)
    exported_empty = no_columns.to_pandas()
    assert len(exported_empty) == len(no_columns)
69 changes: 69 additions & 0 deletions tests/test_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import pytest

import polars as pl
from biocframe import BiocFrame
from biocutils import Names

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


def test_from_polars():
    """Build a ``BiocFrame`` from a polars frame containing nulls and check its column names."""
    columns = {
        "a": [None, 2, 3, 4],
        "b": [0.5, None, 2.5, 13],
        "c": [True, True, False, None],
    }
    source = pl.DataFrame(columns)

    converted = BiocFrame.from_polars(source)

    assert converted is not None
    assert isinstance(converted.get_column_names(), Names)
    assert list(converted.get_column_names()) == ["a", "b", "c"]


def test_to_polars():
    """Convert a flat ``BiocFrame`` to polars and check type and column names."""
    columns = {
        "a": [None, 2, 3, 4],
        "b": [0.5, None, 2.5, 13],
        "c": [True, True, False, None],
    }
    frame = BiocFrame(columns)

    exported = frame.to_polars()

    assert exported is not None
    assert isinstance(exported, pl.DataFrame)
    assert exported.columns == ["a", "b", "c"]


def test_to_polars_nested():
    """Convert a nested ``BiocFrame`` to polars; nested columns are prefixed with ``nested.``."""
    inner = BiocFrame(
        {
            "ncol1": [4, 5, 6],
            "ncol2": ["a", "b", "c"],
            "deep": ["j", "k", "l"],
        }
    )
    frame = BiocFrame(
        {
            "a": [None, 2, 3],
            "b": [0.5, None, 2.5],
            "c": [True, True, False],
            "nested": inner,
        }
    )

    exported = frame.to_polars()

    assert exported is not None
    assert isinstance(exported, pl.DataFrame)

    expected_columns = [
        "a",
        "b",
        "c",
        "nested.ncol1",
        "nested.ncol2",
        "nested.deep",
    ]
    assert len(exported.columns) == 6
    assert exported.columns == expected_columns

0 comments on commit b5646a1

Please sign in to comment.