Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(python): Fix next on group by objects throw TypeError #19615

Open
wants to merge 44 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
a62e7a2
on groupby in non-iterator context throws TypeError
tylerriccio33 Nov 3, 2024
64a2cad
perf: Dispatch Parquet Primitive PLAIN decoding to faster kernels whe…
coastalwhite Nov 4, 2024
1703e0b
fix: Fix incorrect `scan_parquet().with_row_index()` with non-zero sl…
nameexhaustion Nov 4, 2024
e729f90
fix: Sortedness was incorrectly being preserved in dt.offset_by when …
MarcoGorelli Nov 4, 2024
7bbfed0
ci: Configure grouped Dependabot updates (#19604)
stinodego Nov 4, 2024
cc22693
fix: Allow `.struct.with_fields` inside `list.eval` (#19617)
cmdlineluser Nov 4, 2024
c462875
fix: Correct wildcard and input expansion for some more functions (#1…
siddharth-vi Nov 4, 2024
650fd16
fix(rust): Run join type coercion with correct schemas active (#19625)
wence- Nov 4, 2024
6653ef7
perf: Rechunk in DataFrame.rows if needed (#19628)
orlp Nov 4, 2024
0d0cc17
chore(rust): bump fs4 to 0.11 as 0.10 has been yanked (#19631)
jqnatividad Nov 5, 2024
95b0975
fix: Copy height in .vstack() for empty dataframes (#19641) (#19642)
letkemann Nov 5, 2024
ae06d35
test(python): Move credential provider tests to separate file (#19639)
nameexhaustion Nov 5, 2024
5f3e1e7
perf: Reorder conditions in is_leap_year (#19602)
janpipek Nov 5, 2024
6255f5f
docs: Improve `replace` and `replace_all` docstring explanation of th…
alexander-beedie Nov 5, 2024
b6ae85e
fix: Ensure `mean_horizontal` raises on non-numeric input (#19648)
alexander-beedie Nov 6, 2024
a6d6047
fix(python): Fix typing for SchemaDefinition (#19647)
rodrigogiraoserrao Nov 6, 2024
5b8c689
fix: Fix filter incorrectly pushed past struct unnest when unnested c…
nameexhaustion Nov 6, 2024
5d1a5df
feat: Add SQL support for `RIGHT JOIN`, fix an issue with wildcard al…
alexander-beedie Nov 6, 2024
8798412
feat: Parallel IPC sink for the new streaming engine (#19622)
coastalwhite Nov 6, 2024
6fc4985
refactor: Factor out logic for re-use by new streaming CSV source (#1…
nameexhaustion Nov 6, 2024
3ffad45
fix(python): Fix fill null types (#19656)
ritchie46 Nov 6, 2024
8d8ae34
fix: Fix incorrect lazy schema for `explode()` in `agg()` (#19629)
nameexhaustion Nov 6, 2024
e405e6b
refactor: Streamline internal SQL join condition processing (#19658)
alexander-beedie Nov 6, 2024
eb4019c
perf: Allow for arbitrary skips in Parquet Dictionary Decoding (#19649)
coastalwhite Nov 6, 2024
22f8c0b
fix: Update line-splitting logic in batched CSV reader (#19508)
nameexhaustion Nov 6, 2024
54135bc
refactor(rust): Delegate feature flags for polars-stream (#19659)
orlp Nov 6, 2024
ef16c92
refactor(rust): Remove unused file (#19661)
orlp Nov 6, 2024
0c2734b
docs: Assorted fixes to Rust API docs (#19664)
rodrigogiraoserrao Nov 6, 2024
df2100a
refactor: Get `Column` into `polars-expr` (#19660)
coastalwhite Nov 6, 2024
37d49aa
feat: Identify inefficient use of Python string `replace` in `map_ele…
alexander-beedie Nov 6, 2024
347b7b0
docs: Alter examples for round_sig_figs to make behaviour clearer (#1…
3tilley Nov 6, 2024
45250a7
chore: Mark test_parquet.py test_dict_slices as slow (#19675)
coastalwhite Nov 7, 2024
e2e3088
refactor: Move Series bitops to `std::ops::Bit...` (#19673)
coastalwhite Nov 7, 2024
1d2acb7
refactor: Remove more `@scalar-opt` (#19666)
coastalwhite Nov 7, 2024
2148c19
fix(python): Address inconsistency with use of Python types in frame-…
alexander-beedie Nov 7, 2024
9f678a6
ci: Bump crate-ci/typos from 1.26.8 to 1.27.0 in the ci group (#19620)
dependabot[bot] Nov 7, 2024
7be5c34
fix: Make Array arithmetic ops fully elementwise (#19682)
coastalwhite Nov 7, 2024
7285d6e
chore(python): bump the python group across 1 directory with 2 update…
dependabot[bot] Nov 7, 2024
fd801b1
feat(python): Automatically use boto3 / google-auth if installed when…
nameexhaustion Nov 7, 2024
40589e5
feat(python): Identify inefficient use of Python string `removeprefix…
alexander-beedie Nov 7, 2024
ccfa278
refactor: Use `Column` for the `{try,}_apply_columns{_par,}` function…
coastalwhite Nov 7, 2024
88fbd06
Group by iter returns class
tylerriccio33 Nov 7, 2024
d501834
Remove group by test for dev work
tylerriccio33 Nov 8, 2024
755a680
Merge branch 'pola-rs:main' into groupby-error-on-next
tylerriccio33 Nov 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 43 additions & 40 deletions py-polars/polars/dataframe/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@

if TYPE_CHECKING:
import sys
from collections.abc import Iterable
from collections.abc import Iterable, Iterator
from datetime import timedelta
from typing import Any

from polars import DataFrame
from polars import DataFrame, Series
from polars._typing import (
ClosedInterval,
IntoExpr,
Expand All @@ -22,9 +23,41 @@
)

if sys.version_info >= (3, 11):
from typing import Self
pass
else:
from typing_extensions import Self
pass


class _GroupByIterator:
    """
    Single-pass iterator over the groups of a group by operation.

    Each step yields a ``(group_name, group_data)`` pair, where ``group_data``
    is obtained by indexing `df` with the row indices stored for that group.

    Parameters
    ----------
    df
        Frame that the per-group data is sliced from.
    group_names
        Iterator producing one name tuple per group; it is consumed in
        lockstep with `group_indices`.
    group_indices
        Sized, indexable container holding the row indices of each group.
    """

    def __init__(
        self,
        df: DataFrame,
        group_names: Iterator[tuple[Any, ...]],
        group_indices: Series,
    ) -> None:
        self._group_names = group_names
        self._group_indices = group_indices
        # Position of the next group to hand out from `_group_indices`.
        self._current_index = 0
        self.df = df

    def __next__(self) -> tuple[tuple[object, ...], DataFrame]:
        """
        Return the next ``(group_name, group_data)`` pair.

        Raises
        ------
        StopIteration
            When every entry of `group_indices` has been consumed, or when
            `group_names` is exhausted first.
        """
        # `_group_indices` is always bound by `__init__`, so there is no need
        # to guard against a missing attribute before measuring it.
        if self._current_index >= len(self._group_indices):
            raise StopIteration

        # Pull the name first: if the names run out early, `StopIteration`
        # propagates before the position is advanced.
        group_name = next(self._group_names)
        group_data = self.df[self._group_indices[self._current_index], :]
        self._current_index += 1

        return group_name, group_data

    def __iter__(self) -> _GroupByIterator:
        """Return the iterator itself, as the iterator protocol requires."""
        return self


class GroupBy:
Expand Down Expand Up @@ -61,7 +94,7 @@ def __init__(
self.named_by = named_by
self.maintain_order = maintain_order

def __iter__(self) -> Self:
def __iter__(self) -> _GroupByIterator:
"""
Allows iteration over the groups of the group by operation.

Expand Down Expand Up @@ -108,17 +141,7 @@ def __iter__(self) -> Self:
self._group_indices = groups_df.select(temp_col).to_series()
self._current_index = 0

return self

def __next__(self) -> tuple[tuple[object, ...], DataFrame]:
if self._current_index >= len(self._group_indices):
raise StopIteration

group_name = next(self._group_names)
group_data = self.df[self._group_indices[self._current_index], :]
self._current_index += 1

return group_name, group_data
return _GroupByIterator(self.df, self._group_names, self._group_indices)

def agg(
self,
Expand Down Expand Up @@ -784,7 +807,7 @@ def __init__(
self.closed = closed
self.group_by = group_by

def __iter__(self) -> Self:
def __iter__(self) -> _GroupByIterator:
temp_col = "__POLARS_GB_GROUP_INDICES"
groups_df = (
self.df.lazy()
Expand All @@ -803,17 +826,7 @@ def __iter__(self) -> Self:
self._group_indices = groups_df.select(temp_col).to_series()
self._current_index = 0

return self

def __next__(self) -> tuple[tuple[object, ...], DataFrame]:
if self._current_index >= len(self._group_indices):
raise StopIteration

group_name = next(self._group_names)
group_data = self.df[self._group_indices[self._current_index], :]
self._current_index += 1

return group_name, group_data
return _GroupByIterator(self.df, self._group_names, self._group_indices)

def agg(
self,
Expand Down Expand Up @@ -926,7 +939,7 @@ def __init__(
self.group_by = group_by
self.start_by = start_by

def __iter__(self) -> Self:
def __iter__(self) -> _GroupByIterator:
temp_col = "__POLARS_GB_GROUP_INDICES"
groups_df = (
self.df.lazy()
Expand All @@ -949,17 +962,7 @@ def __iter__(self) -> Self:
self._group_indices = groups_df.select(temp_col).to_series()
self._current_index = 0

return self

def __next__(self) -> tuple[tuple[object, ...], DataFrame]:
if self._current_index >= len(self._group_indices):
raise StopIteration

group_name = next(self._group_names)
group_data = self.df[self._group_indices[self._current_index], :]
self._current_index += 1

return group_name, group_data
return _GroupByIterator(self.df, self._group_names, self._group_indices)

def agg(
self,
Expand Down
14 changes: 14 additions & 0 deletions py-polars/tests/unit/operations/test_group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,20 @@

if TYPE_CHECKING:
from polars._typing import PolarsDataType
from polars.dataframe.group_by import _GroupByIterator


@pytest.mark.parametrize(
    "context",
    [
        pl.DataFrame().group_by(1),
        pl.DataFrame().group_by_dynamic(1, every="days"),
        pl.DataFrame({"int": []}).rolling("int", period="31"),
    ],
)
def test_group_by_no_iter(context: _GroupByIterator) -> None:
    """Calling `next` on a group by object without `iter` raises `TypeError`."""
    # NOTE(review): the parametrized values are produced by `group_by`,
    # `group_by_dynamic` and `rolling`, not by `iter(...)`, so the
    # `_GroupByIterator` annotation on `context` looks inaccurate - confirm.
    with pytest.raises(TypeError, match="object is not an iterator"):
        next(context)


def test_group_by() -> None:
Expand Down
Loading