Skip to content

Commit

Permalink
SNOW-1445838: Support DataFrame.items. (#2060)
Browse files Browse the repository at this point in the history
Signed-off-by: sfc-gh-mvashishtha <[email protected]>
  • Loading branch information
sfc-gh-mvashishtha authored Aug 12, 2024
1 parent 9250c5a commit 131e820
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 29 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
- Added support for lazy `DatetimeIndex`.
- Added support for `Series.argmax` and `Series.argmin`.
- Added support for `Series.dt.is_leap_year`.
- Added support for `DataFrame.items`.

#### Improvements
- Removed the public preview warning message upon importing Snowpark pandas.
Expand Down
1 change: 1 addition & 0 deletions docs/source/modin/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ DataFrame
DataFrame.__iter__
DataFrame.keys
DataFrame.iterrows
DataFrame.items
DataFrame.itertuples
DataFrame.tail
DataFrame.isin
Expand Down
2 changes: 1 addition & 1 deletion docs/source/modin/supported/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``isnull`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``items`` | N | | |
| ``items`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``iterrows`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
Expand Down
11 changes: 1 addition & 10 deletions src/snowflake/snowpark/modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1348,27 +1348,18 @@ def iterrow_builder(s):
partition_iterator = SnowparkPandasRowPartitionIterator(self, iterrow_builder)
yield from partition_iterator

@dataframe_not_implemented()
def items(self): # noqa: D200
"""
Iterate over (column name, ``Series``) pairs.
"""
# TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions

def items_builder(s):
"""Return the pair ``(s.name, s)``: the name of `s` followed by `s` itself."""
return s.name, s

# NOTE(review): the literal 1 is presumably the axis selecting columns, so that
# items_builder receives one column at a time -- confirm against PartitionIterator.
partition_iterator = PartitionIterator(self, 1, items_builder)
# Re-yield whatever the partition iterator produces, making this method a generator.
yield from partition_iterator

@dataframe_not_implemented()
def iteritems(self): # noqa: RT01, D200
"""
Iterate over (column name, ``Series``) pairs.

The body forwards directly to ``items()`` and returns its result.
"""
# TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions
return self.items()

def itertuples(
self, index: bool = True, name: str | None = "Pandas"
) -> Iterable[tuple[Any, ...]]:
Expand Down
47 changes: 43 additions & 4 deletions src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1921,11 +1921,50 @@ def iterrows():
def items():
"""
Iterate over (column name, ``Series``) pairs.
"""
def iteritems():
"""
Iterate over (column name, ``Series``) pairs.
Iterates over the DataFrame columns, returning a tuple with
the column name and the content as a Series.
Yields
------
label : object
The column names for the DataFrame being iterated over.
content : Series
The column entries belonging to each label, as a Series.
See Also
--------
DataFrame.iterrows : Iterate over DataFrame rows as
(index, Series) pairs.
DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
of the values.
Examples
--------
>>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
... 'population': [1864, 22000, 80000]},
... index=['panda', 'polar', 'koala'])
>>> df
species population
panda bear 1864
polar bear 22000
koala marsupial 80000
>>> for label, content in df.items():
... print(f'label: {label}')
... print(f'content:\\n{content}')
...
label: species
content:
panda bear
polar bear
koala marsupial
Name: species, dtype: object
label: population
content:
panda 1864
polar 22000
koala 80000
Name: population, dtype: int64
"""

def itertuples():
Expand Down
61 changes: 61 additions & 0 deletions tests/integ/modin/frame/test_items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#

import pandas as native_pd
import pytest

from tests.integ.modin.sql_counter import SqlCounter
from tests.integ.modin.utils import (
assert_snowpark_pandas_equals_to_pandas_without_dtypecheck,
create_test_dfs,
eval_snowpark_pandas_result,
)


def assert_items_results_equal(snow_result, pandas_result) -> None:
    """
    Comparator for the iterables returned by ``DataFrame.items()``.

    Both iterables are fully consumed into lists, which must have the same
    length. The pairs are then compared position by position: the labels must
    be equal, and the columns must match according to
    ``assert_snowpark_pandas_equals_to_pandas_without_dtypecheck``.

    Parameters
    ----------
    snow_result : iterable of (label, column) pairs
        Result of ``items()`` on the Snowpark pandas frame.
    pandas_result : iterable of (label, column) pairs
        Result of ``items()`` on the native pandas frame.
    """
    snow_items = list(snow_result)
    pandas_items = list(pandas_result)
    assert len(snow_items) == len(pandas_items), "lengths of items are not equal."

    if not snow_items:
        # A frame without columns yields nothing, so no query is expected.
        with SqlCounter(query_count=0):
            return

    for snow_pair, pandas_pair in zip(snow_items, pandas_items):
        snow_label, snow_column = snow_pair
        pandas_label, pandas_column = pandas_pair
        assert snow_label == pandas_label
        # Comparing a column materializes it, which should cost exactly one query.
        with SqlCounter(query_count=1):
            assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
                snow_column, pandas_column
            )


@pytest.mark.parametrize(
    "dataframe",
    [
        native_pd.DataFrame(
            {
                "species": ["bear", "bear", "marsupial"],
                "population": [1864, 22000, 80000],
            },
            index=["panda", "polar", "koala"],
        ),
        native_pd.DataFrame(
            {
                (0, "species"): ["bear", "bear", "marsupial"],
                (0, "population"): [1864, 22000, 80000],
            },
            index=["panda", "polar", "koala"],
        ),
        native_pd.DataFrame(index=["a"]),
        native_pd.DataFrame(columns=["a"]),
    ],
)
def test_items(dataframe):
    """
    Check ``DataFrame.items()`` against native pandas.

    The parametrized frames cover string column labels, tuple column labels,
    a frame with an index but no columns, and a frame with a column but no rows.
    """

    def call_items(df):
        # Operation under test, applied to each of the test frames.
        return df.items()

    test_dfs = create_test_dfs(dataframe)
    eval_snowpark_pandas_result(
        *test_dfs,
        call_items,
        comparator=assert_items_results_equal,
    )
14 changes: 0 additions & 14 deletions tests/unit/modin/test_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,20 +135,6 @@ def test_unsupported_df(df_method, kwargs):
getattr(mock_df, df_method)(**kwargs)


@pytest.mark.parametrize(
"df_method, kwargs",
[["items", {}], ["iteritems", {}]],
)
def test_unsupported_df_generator(df_method, kwargs):
mock_query_compiler = mock.create_autospec(SnowflakeQueryCompiler)
mock_query_compiler.columnarize.return_value = mock_query_compiler
mock_df = DataFrame(query_compiler=mock_query_compiler)

with pytest.raises(NotImplementedError):
for x in getattr(mock_df, df_method)(**kwargs):
x + 1


@pytest.mark.parametrize(
"series_method, kwargs",
[
Expand Down

0 comments on commit 131e820

Please sign in to comment.