Skip to content

Commit

Permalink
FIX-modin-project#4481: Allow clipping with a Modin Series of bounds. (
Browse files Browse the repository at this point in the history
…modin-project#4486)

Signed-off-by: mvashishtha <[email protected]>
  • Loading branch information
mvashishtha authored May 26, 2022
1 parent ce8f187 commit 958c26a
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 14 deletions.
1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.15.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Key Features and Updates
* FIX-#4461: Fix S3 CSV data path (#4462)
* FIX-#4467: `drop_duplicates` no longer removes items based on index values (#4468)
* FIX-#4449: Drain the call queue before waiting on result in benchmark mode (#4472)
* FIX-#4481: Allow clipping with a Modin Series of bounds (#4486)
* Performance enhancements
* FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346)
* Benchmarking enhancements
Expand Down
4 changes: 4 additions & 0 deletions modin/core/storage_formats/base/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,6 +768,10 @@ def clip(self, lower, upper, **kwargs): # noqa: PR02
BaseQueryCompiler
QueryCompiler with values limited by the specified thresholds.
"""
if isinstance(lower, BaseQueryCompiler):
lower = lower.to_pandas().squeeze(1)
if isinstance(upper, BaseQueryCompiler):
upper = upper.to_pandas().squeeze(1)
return DataFrameDefault.register(pandas.DataFrame.clip)(
self, lower=lower, upper=upper, **kwargs
)
Expand Down
4 changes: 4 additions & 0 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1601,6 +1601,10 @@ def describe_builder(df, internal_indices=[]):
diff = Fold.register(pandas.DataFrame.diff)

def clip(self, lower, upper, **kwargs):
if isinstance(lower, BaseQueryCompiler):
lower = lower.to_pandas().squeeze(1)
if isinstance(upper, BaseQueryCompiler):
upper = upper.to_pandas().squeeze(1)
kwargs["upper"] = upper
kwargs["lower"] = lower
axis = kwargs.get("axis", 0)
Expand Down
10 changes: 5 additions & 5 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1056,18 +1056,18 @@ def clip(
if axis is not None:
axis = self._get_axis_number(axis)
self._validate_dtypes(numeric_only=True)
if is_list_like(lower) or is_list_like(upper):
if axis is None:
raise ValueError("Must specify axis = 0 or 1")
self._validate_other(lower, axis)
self._validate_other(upper, axis)
inplace = validate_bool_kwarg(inplace, "inplace")
axis = numpy_compat.function.validate_clip_with_axis(axis, args, kwargs)
# any np.nan bounds are treated as None
if lower is not None and np.any(np.isnan(lower)):
lower = None
if upper is not None and np.any(np.isnan(upper)):
upper = None
if is_list_like(lower) or is_list_like(upper):
if axis is None:
raise ValueError("Must specify axis = 0 or 1")
lower = self._validate_other(lower, axis)
upper = self._validate_other(upper, axis)
# FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility
# purpose and does not affect the result, we shouldn't pass them to the query compiler.
new_query_compiler = self._query_compiler.clip(
Expand Down
25 changes: 18 additions & 7 deletions modin/pandas/test/dataframe/test_map_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,8 @@ def test_astype_category_large():

@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
def test_clip(request, data, axis):
@pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"])
def test_clip(request, data, axis, bound_type):
modin_df = pd.DataFrame(data)
pandas_df = pandas.DataFrame(data)

Expand All @@ -576,8 +577,6 @@ def test_clip(request, data, axis):
)
# set bounds
lower, upper = np.sort(random_state.random_integers(RAND_LOW, RAND_HIGH, 2))
lower_list = random_state.random_integers(RAND_LOW, RAND_HIGH, ind_len)
upper_list = random_state.random_integers(RAND_LOW, RAND_HIGH, ind_len)

# test only upper scalar bound
modin_result = modin_df.clip(None, upper, axis=axis)
Expand All @@ -589,14 +588,26 @@ def test_clip(request, data, axis):
pandas_result = pandas_df.clip(lower, upper, axis=axis)
df_equals(modin_result, pandas_result)

lower = random_state.random_integers(RAND_LOW, RAND_HIGH, ind_len)
upper = random_state.random_integers(RAND_LOW, RAND_HIGH, ind_len)

if bound_type == "series":
modin_lower = pd.Series(lower)
pandas_lower = pandas.Series(lower)
modin_upper = pd.Series(upper)
pandas_upper = pandas.Series(upper)
else:
modin_lower = pandas_lower = lower
modin_upper = pandas_upper = upper

# test lower and upper list bound on each column
modin_result = modin_df.clip(lower_list, upper_list, axis=axis)
pandas_result = pandas_df.clip(lower_list, upper_list, axis=axis)
modin_result = modin_df.clip(modin_lower, modin_upper, axis=axis)
pandas_result = pandas_df.clip(pandas_lower, pandas_upper, axis=axis)
df_equals(modin_result, pandas_result)

# test only upper list bound on each column
modin_result = modin_df.clip(np.nan, upper_list, axis=axis)
pandas_result = pandas_df.clip(np.nan, upper_list, axis=axis)
modin_result = modin_df.clip(np.nan, modin_upper, axis=axis)
pandas_result = pandas_df.clip(np.nan, pandas_upper, axis=axis)
df_equals(modin_result, pandas_result)

with pytest.raises(ValueError):
Expand Down
38 changes: 36 additions & 2 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1164,8 +1164,11 @@ def test_bool(data):


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_clip(request, data):
modin_series, pandas_series = create_test_series(data)
@pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"])
def test_clip_scalar(request, data, bound_type):
modin_series, pandas_series = create_test_series(
data,
)

if name_contains(request.node.name, numeric_dfs):
# set bounds
Expand All @@ -1182,6 +1185,37 @@ def test_clip(request, data):
df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"])
def test_clip_sequence(request, data, bound_type):
modin_series, pandas_series = create_test_series(
data,
)

if name_contains(request.node.name, numeric_dfs):
lower = random_state.random_integers(RAND_LOW, RAND_HIGH, len(pandas_series))
upper = random_state.random_integers(RAND_LOW, RAND_HIGH, len(pandas_series))

if bound_type == "series":
modin_lower = pd.Series(lower)
pandas_lower = pandas.Series(lower)
modin_upper = pd.Series(upper)
pandas_upper = pandas.Series(upper)
else:
modin_lower = pandas_lower = lower
modin_upper = pandas_upper = upper

# test lower and upper list bound
modin_result = modin_series.clip(modin_lower, modin_upper, axis=0)
pandas_result = pandas_series.clip(pandas_lower, pandas_upper)
df_equals(modin_result, pandas_result)

# test only upper list bound
modin_result = modin_series.clip(np.nan, modin_upper, axis=0)
pandas_result = pandas_series.clip(np.nan, pandas_upper)
df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_combine(data):
modin_series, _ = create_test_series(data) # noqa: F841
Expand Down

0 comments on commit 958c26a

Please sign in to comment.