Skip to content

Commit

Permalink
Update coercion to pandas for both biocframe and factor. Add method t…
Browse files Browse the repository at this point in the history
…o initialize a factor from a list
  • Loading branch information
jkanche committed Oct 29, 2023
1 parent 41a9de5 commit c3a835f
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 4 deletions.
9 changes: 8 additions & 1 deletion src/biocframe/BiocFrame.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from biocutils import is_list_of_type, normalize_subscript

from ._validators import validate_cols, validate_rows, validate_unique_list
from .Factor import Factor
from .types import SlicerArgTypes, SlicerTypes
from .utils import _slice_or_index

Expand Down Expand Up @@ -784,8 +785,14 @@ def to_pandas(self):
"""
from pandas import DataFrame

_data_copy = OrderedDict()
for col in self.column_names:
_data_copy[col] = self.column(col)
if isinstance(self.column(col), Factor):
_data_copy[col] = _data_copy[col].to_pandas()

return DataFrame(
data=self._data, index=self._row_names, columns=self._column_names
data=_data_copy, index=self._row_names, columns=self._column_names
)

# TODO: very primitive implementation, needs very robust testing
Expand Down
35 changes: 33 additions & 2 deletions src/biocframe/Factor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from typing import List, Sequence, Union
from copy import deepcopy
from biocgenerics.combine import combine
from typing import List, Sequence, Union

import biocutils as ut
from biocgenerics.combine import combine


class Factor:
Expand Down Expand Up @@ -312,6 +313,36 @@ def __deepcopy__(self, memo) -> "Factor":
validate=False,
)

def to_pandas(self):
    """Coerce to a :py:class:`~pandas.Categorical` object.

    The factor's levels are passed explicitly as the categories, so levels
    that no code refers to are retained and the category order follows the
    order of the levels (which is what an ordered factor relies on).

    Returns:
        Categorical: A :py:class:`~pandas.Categorical` object.
    """
    from pandas import Categorical

    levels = self._levels
    return Categorical(
        values=[levels[c] for c in self._codes],
        # Without explicit categories pandas infers them from the values:
        # unused levels would be dropped and the order would be the sorted
        # order of the observed values instead of the level order.
        categories=levels,
        ordered=self._ordered,
    )

@classmethod
def from_list(cls, values: Sequence[str]) -> "Factor":
    """Represent a categorical vector as a Factor.

    Args:
        values (Sequence[str]): Sequence of strings to encode.

    Returns:
        Factor: A Factor whose levels and codes are computed from
        ``values`` by ``biocutils.factor``.
    """
    levels, indices = ut.factor(values)

    # Build through ``cls`` so that a subclass receives an instance of itself
    # rather than a plain Factor.
    return cls(indices, levels=levels)


@combine.register(Factor)
def _combine_factors(*x: Factor):
Expand Down
22 changes: 22 additions & 0 deletions tests/test_Factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from biocgenerics.combine import combine
import pytest
import copy
import pandas as pd


def test_Factor_basics():
Expand Down Expand Up @@ -176,3 +177,24 @@ def test_Factor_combine():
f2 = Factor([1, 3, 2], levels=["D", "E", "F", "G"], ordered=True)
out = combine(f1, f2)
assert not out.get_ordered()


def test_Factor_pandas():
    """Check that Factor.to_pandas keeps the length and the ordered flag."""
    unordered = Factor([0, 2, 4, 2, 0], levels=["A", "B", "C", "D", "E"])
    unordered_cat = unordered.to_pandas()
    assert unordered_cat is not None
    assert len(unordered_cat) == len(unordered)

    ordered = Factor([1, 3, 2], levels=["D", "E", "F", "G"], ordered=True)
    ordered_cat = ordered.to_pandas()
    assert ordered_cat is not None
    assert len(ordered_cat) == len(ordered)
    assert ordered_cat.ordered == ordered.get_ordered()


def test_Factor_init_from_list():
    """Check that Factor.from_list yields one level per distinct input value."""
    values = ["A", "B", "A", "B", "E"]
    built = Factor.from_list(values)

    assert isinstance(built, Factor)
    assert len(built) == 5
    assert len(built.get_levels()) == 3
33 changes: 32 additions & 1 deletion tests/test_methods.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import numpy as np
import pytest

import pandas as pd
from biocframe.BiocFrame import BiocFrame
from biocframe.Factor import Factor

__author__ = "jkanche"
__copyright__ = "jkanche"
Expand Down Expand Up @@ -430,3 +431,33 @@ def test_nested_biocFrame_preserve_types():
assert isinstance(sliced.column("nested"), BiocFrame)
assert isinstance(sliced.row(0), dict)
assert isinstance(sliced.column("column2"), np.ndarray)


def test_export_pandas():
    """Check BiocFrame.to_pandas with and without a Factor column.

    The exported object must be a DataFrame with the same number of rows as
    the BiocFrame and no column that the BiocFrame does not have.
    """
    inner = BiocFrame(
        {
            "ncol1": [4, 5, 6],
            "ncol2": ["a", "b", "c"],
            "deep": ["j", "k", "l"],
        }
    )
    frame = BiocFrame(
        {
            "column1": [1, 2, 3],
            "nested": inner,
            "column2": np.array([1, 2, 3]),
        }
    )

    exported = frame.to_pandas()
    assert exported is not None
    assert isinstance(exported, pd.DataFrame)
    assert len(exported) == len(frame)
    assert len(set(exported.columns).difference(frame.colnames)) == 0

    # Repeat the export once a Factor column has been added.
    frame["factor"] = Factor([0, 2, 1], levels=["A", "B", "C"])
    exported = frame.to_pandas()
    assert exported is not None
    assert isinstance(exported, pd.DataFrame)
    assert len(exported) == len(frame)
    assert len(set(exported.columns).difference(frame.colnames)) == 0
    assert exported["factor"] is not None

0 comments on commit c3a835f

Please sign in to comment.