Skip to content

Commit

Permalink
Split BiocFrame by a column (#91).
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche authored Jan 2, 2024
1 parent 66dbd15 commit 20c4304
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 0 deletions.
46 changes: 46 additions & 0 deletions src/biocframe/BiocFrame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1088,6 +1088,52 @@ def copy(self):
"""Alias for :py:meth:`~__copy__`."""
return self.__copy__()

##########################
######>> split by <<######
##########################

def split(
    self, name: str, only_indices: bool = False
) -> Dict[str, Union["BiocFrame", List[int]]]:
    """Split the object by a column.

    Rows are grouped by the value they hold in column ``name``. Groups
    appear in the result in the order in which each value is first seen.

    Args:
        name:
            Name of the column to split by.

        only_indices:
            Whether to only return indices.
            Defaults to False.

    Returns:
        A dictionary of biocframe objects, with names representing the
        group and the value the sliced frames.

        If ``only_indices`` is truthy, the values contain the row indices
        that map to the same group.

    Raises:
        ValueError:
            If ``name`` is not a valid column name.
    """
    if name not in self._column_names:
        raise ValueError(f"'{name}' is not a valid column name.")

    column = self.get_column(name)

    # Values are read by position because the concrete column type is not
    # known here; only ``column[i]`` is relied upon.
    groups = {}
    for i in range(len(self)):
        groups.setdefault(column[i], []).append(i)

    # Truthiness (rather than ``is True``) so that flags such as ``1`` or a
    # NumPy boolean are honoured as well.
    if only_indices:
        return groups

    return {key: self[rows,] for key, rows in groups.items()}

################################
######>> pandas interop <<######
################################
Expand Down
30 changes: 30 additions & 0 deletions tests/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,3 +623,33 @@ def test_set_names():
with pytest.raises(ValueError) as ex:
obj.set_column_names(["A", "A"])
assert str(ex.value).find("duplicate column name") >= 0


def test_bframe_split():
    """Check ``BiocFrame.split`` for frames, indices and a bad column name."""
    obj = {
        "column1": [1, 2, 3],
        "nested": [
            {
                "ncol1": [4, 5, 6],
                "ncol2": ["a", "b", "c"],
                "deep": {"dcol1": ["j", "k", "l"], "dcol2": ["a", "s", "l"]},
            },
            {
                "ncol2": ["a"],
                "deep": {"dcol1": ["j"], "dcol2": ["a"]},
            },
            {
                "ncol1": [5, 6],
                "ncol2": ["b", "c"],
            },
        ],
        "column2": ["b", "n", "b"],
    }

    bframe = BiocFrame(obj)
    split_frame = bframe.split("column2")

    assert split_frame is not None
    assert isinstance(split_frame, dict)
    assert len(split_frame) == 2
    assert set(split_frame) == {"b", "n"}
    assert len(split_frame["b"]) == 2
    assert len(split_frame["n"]) == 1

    # Index-only mode returns the row positions of each group.
    split_indices = bframe.split("column2", only_indices=True)
    assert split_indices == {"b": [0, 2], "n": [1]}

    # An unknown column name is rejected.
    with pytest.raises(ValueError) as ex:
        bframe.split("not_a_column")
    assert str(ex.value).find("is not a valid column name") >= 0

0 comments on commit 20c4304

Please sign in to comment.