From fb4f0d68140f4df666d5ded18625cfee7b294625 Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Tue, 29 Oct 2024 23:57:22 +0500 Subject: [PATCH 01/20] Bump pandas from 1.5.3 to 2.0.3 Signed-off-by: Yerzhaisang Taskali --- docs/requirements-docs.txt | 2 +- opensearch_py_ml/common.py | 8 +++++- opensearch_py_ml/dataframe.py | 37 +++++++++++++++++++++---- opensearch_py_ml/operations.py | 15 ++++++---- opensearch_py_ml/series.py | 3 +- requirements-dev.txt | 2 +- requirements.txt | 2 +- setup.py | 2 +- tests/dataframe/test_describe_pytest.py | 2 +- tests/dataframe/test_groupby_pytest.py | 32 +++++++++++++++++++-- tests/dataframe/test_metrics_pytest.py | 20 +++++++++---- tests/series/test_arithmetics_pytest.py | 4 +-- tests/series/test_metrics_pytest.py | 10 +++++-- 13 files changed, 107 insertions(+), 32 deletions(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index d4e8a521f..5ae7950de 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,5 +1,5 @@ opensearch-py>=2 -pandas>=1.5,<3 +pandas==2.0.3 matplotlib>=3.6.0,<4 nbval sphinx diff --git a/opensearch_py_ml/common.py b/opensearch_py_ml/common.py index add998305..1449f03d5 100644 --- a/opensearch_py_ml/common.py +++ b/opensearch_py_ml/common.py @@ -55,7 +55,10 @@ def build_pd_series( - data: Dict[str, Any], dtype: Optional["DTypeLike"] = None, **kwargs: Any + data: Dict[str, Any], + dtype: Optional["DTypeLike"] = None, + index_name: Optional[str] = None, + **kwargs: Any, ) -> pd.Series: """Builds a pd.Series while squelching the warning for unspecified dtype on empty series @@ -63,6 +66,9 @@ def build_pd_series( dtype = dtype or (EMPTY_SERIES_DTYPE if not data else dtype) if dtype is not None: kwargs["dtype"] = dtype + if index_name: + index = pd.Index(data.keys(), name=index_name) + kwargs["index"] = index return pd.Series(data, **kwargs) diff --git a/opensearch_py_ml/dataframe.py b/opensearch_py_ml/dataframe.py index 64772887f..cdc196d8a 100644 --- 
a/opensearch_py_ml/dataframe.py +++ b/opensearch_py_ml/dataframe.py @@ -424,9 +424,36 @@ def drop( axis = pd.DataFrame._get_axis_name(axis) axes = {axis: labels} elif index is not None or columns is not None: - axes, _ = pd.DataFrame()._construct_axes_from_arguments( - (index, columns), {} - ) + # axes, _ = pd.DataFrame()._construct_axes_from_arguments( + # (index, columns), {} + # ) + axes = {} + if index is not None: + if isinstance(index, pd.Index): + index = index.tolist() # Convert Index to list + elif not is_list_like(index): + index = [index] # Convert to list if it's not list-like already + axes["index"] = index + else: + axes["index"] = None + + if columns is not None: + if isinstance(columns, pd.Index): + columns = columns.tolist() # Convert Index to list + elif not is_list_like(columns): + columns = [columns] # Convert to list if it's not list-like already + axes["columns"] = columns + else: + axes["columns"] = None + + if columns is not None: + if not is_list_like(columns): + columns = [columns] + axes["columns"] = ( + pd.Index(columns) if isinstance(columns, list) else columns + ) + else: + axes["columns"] = None else: raise ValueError( "Need to specify at least one of 'labels', 'index' or 'columns'" @@ -440,7 +467,7 @@ def drop( axes["index"] = [axes["index"]] if errors == "raise": # Check if axes['index'] values exists in index - count = self._query_compiler._index_matches_count(axes["index"]) + count = self._query_compiler._index_matches_count(list(axes["index"])) if count != len(axes["index"]): raise ValueError( f"number of labels {count}!={len(axes['index'])} not contained in axis" @@ -1326,7 +1353,6 @@ def to_csv( compression="infer", quoting=None, quotechar='"', - line_terminator=None, chunksize=None, tupleize_cols=None, date_format=None, @@ -1355,7 +1381,6 @@ def to_csv( "compression": compression, "quoting": quoting, "quotechar": quotechar, - "line_terminator": line_terminator, "chunksize": chunksize, "date_format": date_format, "doublequote": 
doublequote, diff --git a/opensearch_py_ml/operations.py b/opensearch_py_ml/operations.py index c3d01e9e6..0a2c52990 100644 --- a/opensearch_py_ml/operations.py +++ b/opensearch_py_ml/operations.py @@ -475,7 +475,7 @@ def _terms_aggs( except IndexError: name = None - return build_pd_series(results, name=name) + return build_pd_series(results, index_name=name, name="count") def _hist_aggs( self, query_compiler: "QueryCompiler", num_bins: int @@ -1205,7 +1205,7 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: df1 = self.aggs( query_compiler=query_compiler, - pd_aggs=["count", "mean", "std", "min", "max"], + pd_aggs=["count", "mean", "min", "max", "std"], numeric_only=True, ) df2 = self.quantile( @@ -1219,9 +1219,14 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: # Convert [.25,.5,.75] to ["25%", "50%", "75%"] df2 = df2.set_index([["25%", "50%", "75%"]]) - return pd.concat([df1, df2]).reindex( - ["count", "mean", "std", "min", "25%", "50%", "75%", "max"] - ) + df = pd.concat([df1, df2]) + + if df.shape[1] == 1: + return df.reindex( + ["count", "mean", "std", "min", "25%", "50%", "75%", "max"] + ) + + return df.reindex(["count", "mean", "min", "25%", "50%", "75%", "max", "std"]) def to_pandas( self, query_compiler: "QueryCompiler", show_progress: bool = False diff --git a/opensearch_py_ml/series.py b/opensearch_py_ml/series.py index 772660b13..538d8a879 100644 --- a/opensearch_py_ml/series.py +++ b/opensearch_py_ml/series.py @@ -312,11 +312,12 @@ def value_counts(self, os_size: int = 10) -> pd.Series: >>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> df['Carrier'].value_counts() + Carrier Logstash Airways 3331 JetBeats 3274 Kibana Airlines 3234 ES-Air 3220 - Name: Carrier, dtype: int64 + Name: count, dtype: int64 """ if not isinstance(os_size, int): raise TypeError("os_size must be a positive integer.") diff --git a/requirements-dev.txt b/requirements-dev.txt index e7b62bcf0..6bbc817b0 100644 --- 
a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # # Basic requirements # -pandas>=1.5.2,<2 +pandas==2.0.3 matplotlib>=3.6.2,<4 numpy>=1.24.0,<2 opensearch-py>=2.2.0 diff --git a/requirements.txt b/requirements.txt index cddfe801c..8af3ac141 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # # Basic requirements # -pandas>=1.5.2,<2 +pandas==2.0.3 matplotlib>=3.6.2,<4 numpy>=1.24.0,<2 opensearch-py>=2.2.0 diff --git a/setup.py b/setup.py index 9146b2503..f98028f3d 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ }, install_requires=[ "opensearch-py>=2", - "pandas>=1.5,<3", + "pandas==2.0.3", "matplotlib>=3.6.0,<4", "numpy>=1.24.0,<2", "deprecated>=1.2.14,<2", diff --git a/tests/dataframe/test_describe_pytest.py b/tests/dataframe/test_describe_pytest.py index 8d0344d26..e8c8ea2a6 100644 --- a/tests/dataframe/test_describe_pytest.py +++ b/tests/dataframe/test_describe_pytest.py @@ -34,7 +34,7 @@ def test_flights_describe(self): pd_flights = self.pd_flights() oml_flights = self.oml_flights() - pd_describe = pd_flights.describe() + pd_describe = pd_flights.describe().drop(["timestamp"], axis=1) # We remove bool columns to match pandas output oml_describe = oml_flights.describe().drop( ["Cancelled", "FlightDelay"], axis="columns" diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index 3ee3fa7bc..510d6fe74 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -106,10 +106,18 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna): pd_flights = self.pd_flights().filter(self.filter_data) oml_flights = self.oml_flights().filter(self.filter_data) - pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)() + if pd_agg == "mad": + pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).agg( + lambda x: (x - x.mean()).abs().mean() + ) + else: + pd_groupby = getattr( + pd_flights.groupby("Cancelled", dropna=dropna), pd_agg 
+ )() oml_groupby = getattr(oml_flights.groupby("Cancelled", dropna=dropna), pd_agg)( numeric_only=True ) + pd_groupby = pd_groupby[oml_groupby.columns] # checking only values because dtypes are checked in aggs tests assert_frame_equal( @@ -224,14 +232,32 @@ def test_groupby_dataframe_mad(self): pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"]) oml_flights = self.oml_flights().filter(self.filter_data + ["DestCountry"]) - pd_mad = pd_flights.groupby("DestCountry").mad() + pd_mad = pd_flights.groupby("DestCountry").apply( + lambda x: x.select_dtypes(include="number").apply( + lambda x: (x - x.mean()).abs().mean() + ) + ) + + # Re-merge non-numeric columns back, with suffixes to avoid column overlap + non_numeric_columns = ( + pd_flights.select_dtypes(exclude="number").groupby("DestCountry").first() + ) + pd_mad = pd_mad.join( + non_numeric_columns, lsuffix="_numeric", rsuffix="_non_numeric" + )[self.filter_data] + if "Cancelled" in pd_mad.columns: + pd_mad["Cancelled"] = pd_mad["Cancelled"].astype(float) oml_mad = oml_flights.groupby("DestCountry").mad() assert_index_equal(pd_mad.columns, oml_mad.columns) assert_index_equal(pd_mad.index, oml_mad.index) assert_series_equal(pd_mad.dtypes, oml_mad.dtypes) - pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"]) + pd_min_mad = pd_flights.groupby("DestCountry").agg( + ["min", lambda x: (x - x.median()).abs().mean()] + ) + + pd_min_mad.columns = pd_min_mad.columns.set_levels(["min", "mad"], level=1) oml_min_mad = oml_flights.groupby("DestCountry").aggregate(["min", "mad"]) assert_index_equal(pd_min_mad.columns, oml_min_mad.columns) diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index f3055ea5a..65adaf38e 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -81,9 +81,10 @@ def test_flights_extended_metrics(self): logger.setLevel(logging.DEBUG) for func in self.extended_funcs: - pd_metric = 
getattr(pd_flights, func)( - **({"numeric_only": True} if func != "mad" else {}) - ) + if func == "mad": + pd_metric = (pd_flights - pd_flights.mean()).abs().mean() + else: + pd_metric = getattr(pd_flights, func)(**({"numeric_only": True})) oml_metric = getattr(oml_flights, func)(numeric_only=True) pd_value = pd_metric["AvgTicketPrice"] @@ -101,7 +102,10 @@ def test_flights_extended_metrics_nan(self): ] for func in self.extended_funcs: - pd_metric = getattr(pd_flights_1, func)() + if func == "mad": + pd_metric = (pd_flights_1 - pd_flights_1.mean()).abs().mean() + else: + pd_metric = getattr(pd_flights_1, func)() oml_metric = getattr(oml_flights_1, func)(numeric_only=False) assert_series_equal(pd_metric, oml_metric, check_exact=False) @@ -111,7 +115,10 @@ def test_flights_extended_metrics_nan(self): oml_flights_0 = oml_flights[oml_flights.FlightNum == "XXX"][["AvgTicketPrice"]] for func in self.extended_funcs: - pd_metric = getattr(pd_flights_0, func)() + if func == "mad": + pd_metric = (pd_flights_0 - pd_flights_0.mean()).abs().mean() + else: + pd_metric = getattr(pd_flights_0, func)() oml_metric = getattr(oml_flights_0, func)(numeric_only=False) assert_series_equal(pd_metric, oml_metric, check_exact=False) @@ -498,7 +505,8 @@ def test_flights_agg_quantile(self, numeric_only): ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"] ) - pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only) + pd_quantile = pd_flights.agg([lambda x: x.quantile(0.5), lambda x: x.min()]) + pd_quantile.index = ["quantile", "min"] oml_quantile = oml_flights.agg(["quantile", "min"], numeric_only=numeric_only) assert_frame_equal( diff --git a/tests/series/test_arithmetics_pytest.py b/tests/series/test_arithmetics_pytest.py index 18226675c..196e5d38d 100644 --- a/tests/series/test_arithmetics_pytest.py +++ b/tests/series/test_arithmetics_pytest.py @@ -80,9 +80,7 @@ def to_pandas(self): # "type cast" to modified class (inherits from ed.Series) that overrides the `to_pandas` 
function oml_series.__class__ = ModifiedOMLSeries - assert_pandas_opensearch_py_ml_series_equal( - pd_series, oml_series, check_less_precise=True - ) + assert_pandas_opensearch_py_ml_series_equal(pd_series, oml_series) def test_ecommerce_series_invalid_div(self): pd_df = self.pd_ecommerce() diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index 71037d4da..9df21dcbf 100644 --- a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -49,7 +49,10 @@ def test_flights_metrics(self): oml_flights = self.oml_flights()["AvgTicketPrice"] for func in self.all_funcs: - pd_metric = getattr(pd_flights, func)() + if func == "mad": + pd_metric = (pd_flights - pd_flights.mean()).abs().mean() + else: + pd_metric = getattr(pd_flights, func)() oml_metric = getattr(oml_flights, func)() self.assert_almost_equal_for_agg(func, pd_metric, oml_metric) @@ -94,7 +97,10 @@ def test_ecommerce_selected_all_numeric_source_fields(self): oml_ecommerce = self.oml_ecommerce()[column] for func in self.all_funcs: - pd_metric = getattr(pd_ecommerce, func)() + if func == "mad": + pd_metric = (pd_ecommerce - pd_ecommerce.mean()).abs().mean() + else: + pd_metric = getattr(pd_ecommerce, func)() oml_metric = getattr(oml_ecommerce, func)( **({"numeric_only": True} if (func != "nunique") else {}) ) From 06076d9b4075527bf9dcfa7226d8e0870cb4a59d Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Wed, 30 Oct 2024 00:45:16 +0500 Subject: [PATCH 02/20] Updated CHANGELOG Signed-off-by: Yerzhaisang Taskali --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c239c2082..cebd68d3c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - updating listing file with three v2 sparse model - by @dhrubo-os ([#412](https://github.com/opensearch-project/opensearch-py-ml/pull/412)) - Update model upload history - 
opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini (v.1.0.0)(TORCH_SCRIPT) by @dhrubo-os ([#417](https://github.com/opensearch-project/opensearch-py-ml/pull/417)) - Update model upload history - opensearch-project/opensearch-neural-sparse-encoding-v2-distill (v.1.0.0)(TORCH_SCRIPT) by @dhrubo-os ([#419](https://github.com/opensearch-project/opensearch-py-ml/pull/419)) +- Bump pandas from 1.5.3 to 2.0.3 bu @yerzhaisang ([#422](https://github.com/opensearch-project/opensearch-py-ml/pull/422)) ### Fixed - Fix the wrong final zip file name in model_uploader workflow, now will name it by the upload_prefix alse.([#413](https://github.com/opensearch-project/opensearch-py-ml/pull/413/files)) From 3ef1e2c3e13c01a8da2cf263ec197abae90e0cc4 Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Sun, 3 Nov 2024 13:20:41 +0500 Subject: [PATCH 03/20] Updated CHANGELOG Signed-off-by: Yerzhaisang Taskali --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cebd68d3c..b9e332b12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,7 +46,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - updating listing file with three v2 sparse model - by @dhrubo-os ([#412](https://github.com/opensearch-project/opensearch-py-ml/pull/412)) - Update model upload history - opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini (v.1.0.0)(TORCH_SCRIPT) by @dhrubo-os ([#417](https://github.com/opensearch-project/opensearch-py-ml/pull/417)) - Update model upload history - opensearch-project/opensearch-neural-sparse-encoding-v2-distill (v.1.0.0)(TORCH_SCRIPT) by @dhrubo-os ([#419](https://github.com/opensearch-project/opensearch-py-ml/pull/419)) -- Bump pandas from 1.5.3 to 2.0.3 bu @yerzhaisang ([#422](https://github.com/opensearch-project/opensearch-py-ml/pull/422)) +- Bump pandas from 1.5.3 to 2.0.3 by @yerzhaisang ([#422](https://github.com/opensearch-project/opensearch-py-ml/pull/422)) 
### Fixed - Fix the wrong final zip file name in model_uploader workflow, now will name it by the upload_prefix alse.([#413](https://github.com/opensearch-project/opensearch-py-ml/pull/413/files)) From da14ee5bf8c111833950c0d02f3e3bd7cf8cad7a Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Mon, 4 Nov 2024 12:41:52 +0500 Subject: [PATCH 04/20] updated built-in method Signed-off-by: Yerzhaisang Taskali --- opensearch_py_ml/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opensearch_py_ml/common.py b/opensearch_py_ml/common.py index 1449f03d5..287619df6 100644 --- a/opensearch_py_ml/common.py +++ b/opensearch_py_ml/common.py @@ -66,7 +66,7 @@ def build_pd_series( dtype = dtype or (EMPTY_SERIES_DTYPE if not data else dtype) if dtype is not None: kwargs["dtype"] = dtype - if index_name: + if index_name is not None: index = pd.Index(data.keys(), name=index_name) kwargs["index"] = index return pd.Series(data, **kwargs) From c68b7f8e543865b510001a0016b9ed9157046308 Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Mon, 4 Nov 2024 13:58:52 +0500 Subject: [PATCH 05/20] removed unused comment Signed-off-by: Yerzhaisang Taskali --- opensearch_py_ml/dataframe.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/opensearch_py_ml/dataframe.py b/opensearch_py_ml/dataframe.py index cdc196d8a..ac28764e1 100644 --- a/opensearch_py_ml/dataframe.py +++ b/opensearch_py_ml/dataframe.py @@ -424,9 +424,6 @@ def drop( axis = pd.DataFrame._get_axis_name(axis) axes = {axis: labels} elif index is not None or columns is not None: - # axes, _ = pd.DataFrame()._construct_axes_from_arguments( - # (index, columns), {} - # ) axes = {} if index is not None: if isinstance(index, pd.Index): From 0084ccebb2cbd5a4b38870ff5185a9d5157a561d Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Mon, 4 Nov 2024 14:38:53 +0500 Subject: [PATCH 06/20] Implement to_list_if_needed method for list conversion Signed-off-by: Yerzhaisang Taskali --- 
opensearch_py_ml/dataframe.py | 37 +++++++++-------------------------- opensearch_py_ml/utils.py | 24 +++++++++++++++++++++++ 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/opensearch_py_ml/dataframe.py b/opensearch_py_ml/dataframe.py index ac28764e1..9b0370b98 100644 --- a/opensearch_py_ml/dataframe.py +++ b/opensearch_py_ml/dataframe.py @@ -47,7 +47,7 @@ from opensearch_py_ml.groupby import DataFrameGroupBy from opensearch_py_ml.ndframe import NDFrame from opensearch_py_ml.series import Series -from opensearch_py_ml.utils import is_valid_attr_name +from opensearch_py_ml.utils import is_valid_attr_name, to_list_if_needed if TYPE_CHECKING: from opensearchpy import OpenSearch @@ -424,33 +424,14 @@ def drop( axis = pd.DataFrame._get_axis_name(axis) axes = {axis: labels} elif index is not None or columns is not None: - axes = {} - if index is not None: - if isinstance(index, pd.Index): - index = index.tolist() # Convert Index to list - elif not is_list_like(index): - index = [index] # Convert to list if it's not list-like already - axes["index"] = index - else: - axes["index"] = None - - if columns is not None: - if isinstance(columns, pd.Index): - columns = columns.tolist() # Convert Index to list - elif not is_list_like(columns): - columns = [columns] # Convert to list if it's not list-like already - axes["columns"] = columns - else: - axes["columns"] = None - - if columns is not None: - if not is_list_like(columns): - columns = [columns] - axes["columns"] = ( - pd.Index(columns) if isinstance(columns, list) else columns - ) - else: - axes["columns"] = None + axes = { + "index": to_list_if_needed(index), + "columns": ( + pd.Index(to_list_if_needed(columns)) + if columns is not None + else None + ), + } else: raise ValueError( "Need to specify at least one of 'labels', 'index' or 'columns'" diff --git a/opensearch_py_ml/utils.py b/opensearch_py_ml/utils.py index 8f1763085..2f8cf49ab 100644 --- a/opensearch_py_ml/utils.py +++ 
b/opensearch_py_ml/utils.py @@ -30,6 +30,7 @@ from typing import Any, Callable, Collection, Iterable, List, TypeVar, Union, cast import pandas as pd # type: ignore +from pandas.core.dtypes.common import is_list_like # type: ignore RT = TypeVar("RT") @@ -61,6 +62,29 @@ def is_valid_attr_name(s: str) -> bool: ) +def to_list_if_needed(value): + """ + Converts the input to a list if necessary. + + If the input is a pandas Index, it converts it to a list. + If the input is not list-like (e.g., a single value), it wraps it in a list. + If the input is None or already list-like, it returns it as is. + + Parameters: + value: The input to potentially convert to a list. + + Returns: + The input converted to a list if needed, or the original input if no conversion is necessary. + """ + if value is None: + return None + if isinstance(value, pd.Index): + return value.tolist() + if not is_list_like(value): + return [value] + return value + + def to_list(x: Union[Collection[Any], pd.Series]) -> List[Any]: if isinstance(x, ABCCollection): return list(x) From 12bacb7ff04a6427194d58f08a8ef9c31c90d402 Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Mon, 4 Nov 2024 16:05:15 +0500 Subject: [PATCH 07/20] Refactor MAD calculation using CustomFunctionDispatcher for improved readability and reusability Signed-off-by: Yerzhaisang Taskali --- opensearch_py_ml/utils.py | 20 ++++++++++++++++++++ tests/dataframe/test_groupby_pytest.py | 16 ++++++++-------- tests/dataframe/test_metrics_pytest.py | 23 ++++++++++++----------- tests/series/test_metrics_pytest.py | 15 +++++++++------ 4 files changed, 49 insertions(+), 25 deletions(-) diff --git a/opensearch_py_ml/utils.py b/opensearch_py_ml/utils.py index 2f8cf49ab..c52ab88a7 100644 --- a/opensearch_py_ml/utils.py +++ b/opensearch_py_ml/utils.py @@ -101,3 +101,23 @@ def try_sort(iterable: Iterable[str]) -> Iterable[str]: return sorted(listed) except TypeError: return listed + + +class CustomFunctionDispatcher: + # Define custom functions in 
a dictionary + customFunctionMap = { + "mad": lambda x: (x - x.median()).abs().mean(), + } + + @classmethod + def apply_custom_function(cls, func, data): + """ + Apply a custom function if available, else return None. + :param func: Function name as a string + :param data: Data on which function is applied + :return: Result of custom function or None if func not found + """ + custom_func = cls.customFunctionMap.get(func) + if custom_func: + return custom_func(data) + return None diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index 510d6fe74..6635784fd 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -28,6 +28,7 @@ import pytest from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal +from opensearch_py_ml.utils import CustomFunctionDispatcher from tests.common import TestData @@ -106,11 +107,10 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna): pd_flights = self.pd_flights().filter(self.filter_data) oml_flights = self.oml_flights().filter(self.filter_data) - if pd_agg == "mad": - pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).agg( - lambda x: (x - x.mean()).abs().mean() - ) - else: + pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).agg( + lambda x: CustomFunctionDispatcher.apply_custom_function(pd_agg, x) + ) + if not pd_groupby: pd_groupby = getattr( pd_flights.groupby("Cancelled", dropna=dropna), pd_agg )() @@ -233,8 +233,8 @@ def test_groupby_dataframe_mad(self): oml_flights = self.oml_flights().filter(self.filter_data + ["DestCountry"]) pd_mad = pd_flights.groupby("DestCountry").apply( - lambda x: x.select_dtypes(include="number").apply( - lambda x: (x - x.mean()).abs().mean() + lambda group: group.select_dtypes(include="number").apply( + lambda x: CustomFunctionDispatcher.apply_custom_function("mad", x) ) ) @@ -254,7 +254,7 @@ def test_groupby_dataframe_mad(self): assert_series_equal(pd_mad.dtypes, 
oml_mad.dtypes) pd_min_mad = pd_flights.groupby("DestCountry").agg( - ["min", lambda x: (x - x.median()).abs().mean()] + ["min", lambda x: CustomFunctionDispatcher.apply_custom_function("mad", x)] ) pd_min_mad.columns = pd_min_mad.columns.set_levels(["min", "mad"], level=1) diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index 65adaf38e..5aad17fd4 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -24,11 +24,11 @@ import numpy as np import pandas as pd - -# File called _pytest for PyCharm compatibility import pytest from pandas.testing import assert_frame_equal, assert_series_equal +# File called _pytest for PyCharm compatibility +from opensearch_py_ml.utils import CustomFunctionDispatcher from tests.common import TestData, assert_almost_equal @@ -81,9 +81,8 @@ def test_flights_extended_metrics(self): logger.setLevel(logging.DEBUG) for func in self.extended_funcs: - if func == "mad": - pd_metric = (pd_flights - pd_flights.mean()).abs().mean() - else: + pd_metric = CustomFunctionDispatcher.apply_custom_function(func, pd_flights) + if not pd_metric: pd_metric = getattr(pd_flights, func)(**({"numeric_only": True})) oml_metric = getattr(oml_flights, func)(numeric_only=True) @@ -102,9 +101,10 @@ def test_flights_extended_metrics_nan(self): ] for func in self.extended_funcs: - if func == "mad": - pd_metric = (pd_flights_1 - pd_flights_1.mean()).abs().mean() - else: + pd_metric = pd_flights_1.apply( + lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) + ) + if not pd_metric: pd_metric = getattr(pd_flights_1, func)() oml_metric = getattr(oml_flights_1, func)(numeric_only=False) @@ -115,9 +115,10 @@ def test_flights_extended_metrics_nan(self): oml_flights_0 = oml_flights[oml_flights.FlightNum == "XXX"][["AvgTicketPrice"]] for func in self.extended_funcs: - if func == "mad": - pd_metric = (pd_flights_0 - pd_flights_0.mean()).abs().mean() - else: + pd_metric = 
pd_flights_0.apply( + lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) + ) + if not pd_metric: pd_metric = getattr(pd_flights_0, func)() oml_metric = getattr(oml_flights_0, func)(numeric_only=False) diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index 9df21dcbf..d316339c2 100644 --- a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -31,6 +31,7 @@ import pytest from pandas.testing import assert_series_equal +from opensearch_py_ml.utils import CustomFunctionDispatcher from tests.common import TestData, assert_almost_equal @@ -49,9 +50,10 @@ def test_flights_metrics(self): oml_flights = self.oml_flights()["AvgTicketPrice"] for func in self.all_funcs: - if func == "mad": - pd_metric = (pd_flights - pd_flights.mean()).abs().mean() - else: + pd_metric = pd_flights.apply( + lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) + ) + if not pd_metric: pd_metric = getattr(pd_flights, func)() oml_metric = getattr(oml_flights, func)() @@ -97,9 +99,10 @@ def test_ecommerce_selected_all_numeric_source_fields(self): oml_ecommerce = self.oml_ecommerce()[column] for func in self.all_funcs: - if func == "mad": - pd_metric = (pd_ecommerce - pd_ecommerce.mean()).abs().mean() - else: + pd_metric = pd_ecommerce.apply( + lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) + ) + if not pd_metric: pd_metric = getattr(pd_ecommerce, func)() oml_metric = getattr(oml_ecommerce, func)( **({"numeric_only": True} if (func != "nunique") else {}) From 77ebf1c6d200a82a436375db3e5a8f756df29511 Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Mon, 4 Nov 2024 16:28:55 +0500 Subject: [PATCH 08/20] Refactor MAD calculation using CustomFunctionDispatcher for improved readability and reusability Signed-off-by: Yerzhaisang Taskali --- tests/dataframe/test_groupby_pytest.py | 2 +- tests/dataframe/test_metrics_pytest.py | 6 +++--- tests/series/test_metrics_pytest.py | 4 ++-- 3 files 
changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index 6635784fd..cfe6bfc8a 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -110,7 +110,7 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna): pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).agg( lambda x: CustomFunctionDispatcher.apply_custom_function(pd_agg, x) ) - if not pd_groupby: + if pd_groupby is None: pd_groupby = getattr( pd_flights.groupby("Cancelled", dropna=dropna), pd_agg )() diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index 5aad17fd4..518d2be7b 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -82,7 +82,7 @@ def test_flights_extended_metrics(self): for func in self.extended_funcs: pd_metric = CustomFunctionDispatcher.apply_custom_function(func, pd_flights) - if not pd_metric: + if pd_metric is None: pd_metric = getattr(pd_flights, func)(**({"numeric_only": True})) oml_metric = getattr(oml_flights, func)(numeric_only=True) @@ -104,7 +104,7 @@ def test_flights_extended_metrics_nan(self): pd_metric = pd_flights_1.apply( lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) ) - if not pd_metric: + if pd_metric is None: pd_metric = getattr(pd_flights_1, func)() oml_metric = getattr(oml_flights_1, func)(numeric_only=False) @@ -118,7 +118,7 @@ def test_flights_extended_metrics_nan(self): pd_metric = pd_flights_0.apply( lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) ) - if not pd_metric: + if pd_metric is None: pd_metric = getattr(pd_flights_0, func)() oml_metric = getattr(oml_flights_0, func)(numeric_only=False) diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index d316339c2..949817509 100644 --- a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -53,7 +53,7 
@@ def test_flights_metrics(self): pd_metric = pd_flights.apply( lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) ) - if not pd_metric: + if pd_metric is None: pd_metric = getattr(pd_flights, func)() oml_metric = getattr(oml_flights, func)() @@ -102,7 +102,7 @@ def test_ecommerce_selected_all_numeric_source_fields(self): pd_metric = pd_ecommerce.apply( lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) ) - if not pd_metric: + if pd_metric is None: pd_metric = getattr(pd_ecommerce, func)() oml_metric = getattr(oml_ecommerce, func)( **({"numeric_only": True} if (func != "nunique") else {}) From 45e793a3dca7afe96a467aa0df14a2ce05688eb5 Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Mon, 4 Nov 2024 17:24:26 +0500 Subject: [PATCH 09/20] Refactor MAD calculation using CustomFunctionDispatcher for improved readability and reusability Signed-off-by: Yerzhaisang Taskali --- tests/dataframe/test_groupby_pytest.py | 9 +++++---- tests/dataframe/test_metrics_pytest.py | 25 +++++++++++++++---------- tests/series/test_metrics_pytest.py | 20 ++++++++++++-------- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index cfe6bfc8a..8bebd685e 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -107,10 +107,11 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna): pd_flights = self.pd_flights().filter(self.filter_data) oml_flights = self.oml_flights().filter(self.filter_data) - pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).agg( - lambda x: CustomFunctionDispatcher.apply_custom_function(pd_agg, x) - ) - if pd_groupby is None: + if pd_agg in CustomFunctionDispatcher.customFunctionMap: + pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).agg( + lambda x: CustomFunctionDispatcher.apply_custom_function(pd_agg, x) + ) + else: pd_groupby = getattr( pd_flights.groupby("Cancelled", 
dropna=dropna), pd_agg )() diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index 518d2be7b..3fda5cafd 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -81,8 +81,11 @@ def test_flights_extended_metrics(self): logger.setLevel(logging.DEBUG) for func in self.extended_funcs: - pd_metric = CustomFunctionDispatcher.apply_custom_function(func, pd_flights) - if pd_metric is None: + if func in CustomFunctionDispatcher.customFunctionMap: + pd_metric = CustomFunctionDispatcher.apply_custom_function( + func, pd_flights + ) + else: pd_metric = getattr(pd_flights, func)(**({"numeric_only": True})) oml_metric = getattr(oml_flights, func)(numeric_only=True) @@ -101,10 +104,11 @@ def test_flights_extended_metrics_nan(self): ] for func in self.extended_funcs: - pd_metric = pd_flights_1.apply( - lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) - ) - if pd_metric is None: + if func in CustomFunctionDispatcher.customFunctionMap: + pd_metric = pd_flights_1.apply( + lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) + ) + else: pd_metric = getattr(pd_flights_1, func)() oml_metric = getattr(oml_flights_1, func)(numeric_only=False) @@ -115,10 +119,11 @@ def test_flights_extended_metrics_nan(self): oml_flights_0 = oml_flights[oml_flights.FlightNum == "XXX"][["AvgTicketPrice"]] for func in self.extended_funcs: - pd_metric = pd_flights_0.apply( - lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) - ) - if pd_metric is None: + if func in CustomFunctionDispatcher.customFunctionMap: + pd_metric = pd_flights_0.apply( + lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) + ) + else: pd_metric = getattr(pd_flights_0, func)() oml_metric = getattr(oml_flights_0, func)(numeric_only=False) diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index 949817509..097177ae6 100644 --- 
a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -50,10 +50,11 @@ def test_flights_metrics(self): oml_flights = self.oml_flights()["AvgTicketPrice"] for func in self.all_funcs: - pd_metric = pd_flights.apply( - lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) - ) - if pd_metric is None: + if func in CustomFunctionDispatcher.customFunctionMap: + pd_metric = pd_flights.apply( + lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) + ) + else: pd_metric = getattr(pd_flights, func)() oml_metric = getattr(oml_flights, func)() @@ -99,10 +100,13 @@ def test_ecommerce_selected_all_numeric_source_fields(self): oml_ecommerce = self.oml_ecommerce()[column] for func in self.all_funcs: - pd_metric = pd_ecommerce.apply( - lambda x: CustomFunctionDispatcher.apply_custom_function(func, x) - ) - if pd_metric is None: + if func in CustomFunctionDispatcher.customFunctionMap: + pd_metric = pd_ecommerce.apply( + lambda x: CustomFunctionDispatcher.apply_custom_function( + func, x + ) + ) + else: pd_metric = getattr(pd_ecommerce, func)() oml_metric = getattr(oml_ecommerce, func)( **({"numeric_only": True} if (func != "nunique") else {}) From c4455e03fbec8914651054cae72468cd2c0f787d Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Mon, 4 Nov 2024 18:02:22 +0500 Subject: [PATCH 10/20] Refactor MAD calculation using CustomFunctionDispatcher for improved readability and reusability Signed-off-by: Yerzhaisang Taskali --- tests/series/test_metrics_pytest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index 097177ae6..55768deb1 100644 --- a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -51,7 +51,7 @@ def test_flights_metrics(self): for func in self.all_funcs: if func in CustomFunctionDispatcher.customFunctionMap: - pd_metric = pd_flights.apply( + pd_metric = pd_flights.agg( lambda x: 
CustomFunctionDispatcher.apply_custom_function(func, x) ) else: @@ -101,7 +101,7 @@ def test_ecommerce_selected_all_numeric_source_fields(self): for func in self.all_funcs: if func in CustomFunctionDispatcher.customFunctionMap: - pd_metric = pd_ecommerce.apply( + pd_metric = pd_ecommerce.agg( lambda x: CustomFunctionDispatcher.apply_custom_function( func, x ) From 713550712ae95b403a9f0817560dbdf68f8e73ca Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Mon, 4 Nov 2024 18:40:28 +0500 Subject: [PATCH 11/20] refactor: move metric identifiers to constants.py for readability Signed-off-by: Yerzhaisang Taskali --- opensearch_py_ml/constants.py | 28 +++++++++++++++++ opensearch_py_ml/groupby.py | 11 +++++-- opensearch_py_ml/operations.py | 30 ++++++++++++++----- opensearch_py_ml/query_compiler.py | 9 +++--- opensearch_py_ml/utils.py | 4 ++- tests/dataframe/test_aggs_pytest.py | 13 ++++---- tests/dataframe/test_groupby_pytest.py | 28 +++++++++++++---- tests/dataframe/test_metrics_pytest.py | 18 +++++++---- .../test_map_pd_aggs_to_es_aggs_pytest.py | 13 +++++--- tests/series/test_describe_pytest.py | 5 ++-- tests/series/test_metrics_pytest.py | 18 +++++++++-- 11 files changed, 135 insertions(+), 42 deletions(-) create mode 100644 opensearch_py_ml/constants.py diff --git a/opensearch_py_ml/constants.py b/opensearch_py_ml/constants.py new file mode 100644 index 000000000..e916d52b3 --- /dev/null +++ b/opensearch_py_ml/constants.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + + +# Licensed to Elasticsearch B.V. under one or more contributor +# license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright +# ownership. Elasticsearch B.V. 
licenses this file to you under +# the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +MEAN_ABSOLUTE_DEVIATION = "mad" +VARIANCE = "var" +STANDARD_DEVIATION = "std" diff --git a/opensearch_py_ml/groupby.py b/opensearch_py_ml/groupby.py index e5c4561c3..248df0c64 100644 --- a/opensearch_py_ml/groupby.py +++ b/opensearch_py_ml/groupby.py @@ -25,6 +25,11 @@ from typing import TYPE_CHECKING, List, Optional, Union +from opensearch_py_ml.constants import ( + MEAN_ABSOLUTE_DEVIATION, + STANDARD_DEVIATION, + VARIANCE, +) from opensearch_py_ml.query_compiler import QueryCompiler if TYPE_CHECKING: @@ -153,7 +158,7 @@ def var(self, numeric_only: bool = True) -> "pd.DataFrame": """ return self._query_compiler.aggs_groupby( by=self._by, - pd_aggs=["var"], + pd_aggs=[VARIANCE], dropna=self._dropna, numeric_only=numeric_only, ) @@ -206,7 +211,7 @@ def std(self, numeric_only: bool = True) -> "pd.DataFrame": """ return self._query_compiler.aggs_groupby( by=self._by, - pd_aggs=["std"], + pd_aggs=[STANDARD_DEVIATION], dropna=self._dropna, numeric_only=numeric_only, ) @@ -259,7 +264,7 @@ def mad(self, numeric_only: bool = True) -> "pd.DataFrame": """ return self._query_compiler.aggs_groupby( by=self._by, - pd_aggs=["mad"], + pd_aggs=[MEAN_ABSOLUTE_DEVIATION], dropna=self._dropna, numeric_only=numeric_only, ) diff --git a/opensearch_py_ml/operations.py b/opensearch_py_ml/operations.py index 0a2c52990..b749562fa 100644 --- a/opensearch_py_ml/operations.py +++ b/opensearch_py_ml/operations.py @@ -52,6 +52,11 
@@ build_pd_series, opensearch_date_to_pandas_date, ) +from opensearch_py_ml.constants import ( + MEAN_ABSOLUTE_DEVIATION, + STANDARD_DEVIATION, + VARIANCE, +) from opensearch_py_ml.index import Index from opensearch_py_ml.query import Query from opensearch_py_ml.tasks import ( @@ -620,7 +625,7 @@ def _unpack_metric_aggs( values.append(field.nan_value) # Explicit condition for mad to add NaN because it doesn't support bool elif is_dataframe_agg and numeric_only: - if pd_agg == "mad": + if pd_agg == MEAN_ABSOLUTE_DEVIATION: values.append(field.nan_value) continue @@ -1097,7 +1102,14 @@ def _map_pd_aggs_to_os_aggs( """ # pd aggs that will be mapped to os aggs # that can use 'extended_stats'. - extended_stats_pd_aggs = {"mean", "min", "max", "sum", "var", "std"} + extended_stats_pd_aggs = { + "mean", + "min", + "max", + "sum", + VARIANCE, + STANDARD_DEVIATION, + } extended_stats_os_aggs = {"avg", "min", "max", "sum"} extended_stats_calls = 0 @@ -1117,15 +1129,15 @@ def _map_pd_aggs_to_os_aggs( os_aggs.append("avg") elif pd_agg == "sum": os_aggs.append("sum") - elif pd_agg == "std": + elif pd_agg == STANDARD_DEVIATION: os_aggs.append(("extended_stats", "std_deviation")) - elif pd_agg == "var": + elif pd_agg == VARIANCE: os_aggs.append(("extended_stats", "variance")) # Aggs that aren't 'extended_stats' compatible elif pd_agg == "nunique": os_aggs.append("cardinality") - elif pd_agg == "mad": + elif pd_agg == MEAN_ABSOLUTE_DEVIATION: os_aggs.append("median_absolute_deviation") elif pd_agg == "median": os_aggs.append(("percentiles", (50.0,))) @@ -1205,7 +1217,7 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: df1 = self.aggs( query_compiler=query_compiler, - pd_aggs=["count", "mean", "min", "max", "std"], + pd_aggs=["count", "mean", "min", "max", STANDARD_DEVIATION], numeric_only=True, ) df2 = self.quantile( @@ -1223,10 +1235,12 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: if df.shape[1] == 1: return df.reindex( - 
["count", "mean", "std", "min", "25%", "50%", "75%", "max"] + ["count", "mean", STANDARD_DEVIATION, "min", "25%", "50%", "75%", "max"] ) - return df.reindex(["count", "mean", "min", "25%", "50%", "75%", "max", "std"]) + return df.reindex( + ["count", "mean", "min", "25%", "50%", "75%", "max", STANDARD_DEVIATION] + ) def to_pandas( self, query_compiler: "QueryCompiler", show_progress: bool = False diff --git a/opensearch_py_ml/query_compiler.py b/opensearch_py_ml/query_compiler.py index c10899671..f9f0d0130 100644 --- a/opensearch_py_ml/query_compiler.py +++ b/opensearch_py_ml/query_compiler.py @@ -41,6 +41,7 @@ import pandas as pd # type: ignore from opensearch_py_ml.common import opensearch_date_to_pandas_date +from opensearch_py_ml.constants import MEAN_ABSOLUTE_DEVIATION, VARIANCE from opensearch_py_ml.field_mappings import FieldMappings from opensearch_py_ml.filter import BooleanFilter, QueryFilter from opensearch_py_ml.index import Index @@ -587,17 +588,15 @@ def mean(self, numeric_only: Optional[bool] = None) -> pd.Series: def var(self, numeric_only: Optional[bool] = None) -> pd.Series: return self._operations._metric_agg_series( - self, ["var"], numeric_only=numeric_only + self, [VARIANCE], numeric_only=numeric_only ) def std(self, numeric_only: Optional[bool] = None) -> pd.Series: - return self._operations._metric_agg_series( - self, ["std"], numeric_only=numeric_only - ) + return self._operations._metric_agg_series(self, [], numeric_only=numeric_only) def mad(self, numeric_only: Optional[bool] = None) -> pd.Series: return self._operations._metric_agg_series( - self, ["mad"], numeric_only=numeric_only + self, [MEAN_ABSOLUTE_DEVIATION], numeric_only=numeric_only ) def median(self, numeric_only: Optional[bool] = None) -> pd.Series: diff --git a/opensearch_py_ml/utils.py b/opensearch_py_ml/utils.py index c52ab88a7..af1204b27 100644 --- a/opensearch_py_ml/utils.py +++ b/opensearch_py_ml/utils.py @@ -32,6 +32,8 @@ import pandas as pd # type: ignore from 
pandas.core.dtypes.common import is_list_like # type: ignore +from opensearch_py_ml.constants import MEAN_ABSOLUTE_DEVIATION + RT = TypeVar("RT") @@ -106,7 +108,7 @@ def try_sort(iterable: Iterable[str]) -> Iterable[str]: class CustomFunctionDispatcher: # Define custom functions in a dictionary customFunctionMap = { - "mad": lambda x: (x - x.median()).abs().mean(), + MEAN_ABSOLUTE_DEVIATION: lambda x: (x - x.median()).abs().mean(), } @classmethod diff --git a/tests/dataframe/test_aggs_pytest.py b/tests/dataframe/test_aggs_pytest.py index 62f6843a6..dec2892ce 100644 --- a/tests/dataframe/test_aggs_pytest.py +++ b/tests/dataframe/test_aggs_pytest.py @@ -28,6 +28,7 @@ import pytest from pandas.testing import assert_frame_equal, assert_series_equal +from opensearch_py_ml.constants import STANDARD_DEVIATION, VARIANCE from tests.common import TestData @@ -47,10 +48,10 @@ def test_basic_aggs(self): assert_frame_equal(pd_sum_min, oml_sum_min, check_exact=False) pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg( - ["sum", "min", "std"] + ["sum", "min", STANDARD_DEVIATION] ) oml_sum_min_std = oml_flights.select_dtypes(include=[np.number]).agg( - ["sum", "min", "std"], numeric_only=True + ["sum", "min", STANDARD_DEVIATION], numeric_only=True ) print(pd_sum_min_std.dtypes) @@ -75,10 +76,10 @@ def test_terms_aggs(self): assert_frame_equal(pd_sum_min, oml_sum_min, check_exact=False) pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg( - ["sum", "min", "std"] + ["sum", "min", STANDARD_DEVIATION] ) oml_sum_min_std = oml_flights.select_dtypes(include=[np.number]).agg( - ["sum", "min", "std"], numeric_only=True + ["sum", "min", STANDARD_DEVIATION], numeric_only=True ) print(pd_sum_min_std.dtypes) @@ -94,10 +95,10 @@ def test_aggs_median_var(self): pd_aggs = pd_ecommerce[ ["taxful_total_price", "taxless_total_price", "total_quantity"] - ].agg(["median", "var"]) + ].agg(["median", VARIANCE]) oml_aggs = oml_ecommerce[ ["taxful_total_price", 
"taxless_total_price", "total_quantity"] - ].agg(["median", "var"], numeric_only=True) + ].agg(["median", VARIANCE], numeric_only=True) print(pd_aggs, pd_aggs.dtypes) print(oml_aggs, oml_aggs.dtypes) diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index 8bebd685e..e88357759 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -28,6 +28,11 @@ import pytest from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal +from opensearch_py_ml.constants import ( + MEAN_ABSOLUTE_DEVIATION, + STANDARD_DEVIATION, + VARIANCE, +) from opensearch_py_ml.utils import CustomFunctionDispatcher from tests.common import TestData @@ -101,7 +106,9 @@ def test_groupby_aggs_numeric_only_true(self, pd_agg, dropna): ) @pytest.mark.parametrize("dropna", [True, False]) - @pytest.mark.parametrize("pd_agg", ["mad", "var", "std"]) + @pytest.mark.parametrize( + "pd_agg", [MEAN_ABSOLUTE_DEVIATION, VARIANCE, STANDARD_DEVIATION] + ) def test_groupby_aggs_mad_var_std(self, pd_agg, dropna): # For these aggs pandas doesn't support numeric_only pd_flights = self.pd_flights().filter(self.filter_data) @@ -235,7 +242,9 @@ def test_groupby_dataframe_mad(self): pd_mad = pd_flights.groupby("DestCountry").apply( lambda group: group.select_dtypes(include="number").apply( - lambda x: CustomFunctionDispatcher.apply_custom_function("mad", x) + lambda x: CustomFunctionDispatcher.apply_custom_function( + MEAN_ABSOLUTE_DEVIATION, x + ) ) ) @@ -255,11 +264,20 @@ def test_groupby_dataframe_mad(self): assert_series_equal(pd_mad.dtypes, oml_mad.dtypes) pd_min_mad = pd_flights.groupby("DestCountry").agg( - ["min", lambda x: CustomFunctionDispatcher.apply_custom_function("mad", x)] + [ + "min", + lambda x: CustomFunctionDispatcher.apply_custom_function( + MEAN_ABSOLUTE_DEVIATION, x + ), + ] ) - pd_min_mad.columns = pd_min_mad.columns.set_levels(["min", "mad"], level=1) - oml_min_mad = 
oml_flights.groupby("DestCountry").aggregate(["min", "mad"]) + pd_min_mad.columns = pd_min_mad.columns.set_levels( + ["min", MEAN_ABSOLUTE_DEVIATION], level=1 + ) + oml_min_mad = oml_flights.groupby("DestCountry").aggregate( + ["min", MEAN_ABSOLUTE_DEVIATION] + ) assert_index_equal(pd_min_mad.columns, oml_min_mad.columns) assert_index_equal(pd_min_mad.index, oml_min_mad.index) diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index 3fda5cafd..25f3cc060 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -27,6 +27,12 @@ import pytest from pandas.testing import assert_frame_equal, assert_series_equal +from opensearch_py_ml.constants import ( + MEAN_ABSOLUTE_DEVIATION, + STANDARD_DEVIATION, + VARIANCE, +) + # File called _pytest for PyCharm compatibility from opensearch_py_ml.utils import CustomFunctionDispatcher from tests.common import TestData, assert_almost_equal @@ -34,7 +40,7 @@ class TestDataFrameMetrics(TestData): funcs = ["max", "min", "mean", "sum"] - extended_funcs = ["median", "mad", "var", "std"] + extended_funcs = ["median", MEAN_ABSOLUTE_DEVIATION, VARIANCE, STANDARD_DEVIATION] filter_data = [ "AvgTicketPrice", "Cancelled", @@ -190,9 +196,9 @@ def test_flights_datetime_metrics_agg(self): "min": pd.Timestamp("2018-01-01 00:00:00"), "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"), "sum": pd.NaT, - "mad": pd.NaT, - "var": pd.NaT, - "std": pd.NaT, + MEAN_ABSOLUTE_DEVIATION: pd.NaT, + VARIANCE: pd.NaT, + STANDARD_DEVIATION: pd.NaT, "nunique": 12236, } @@ -301,7 +307,7 @@ def test_flights_numeric_only(self): agg_data = oml_flights.agg(filtered_aggs, numeric_only=True).transpose() for agg in filtered_aggs: # Explicitly check for mad because it returns nan for bools - if agg == "mad": + if agg == MEAN_ABSOLUTE_DEVIATION: assert np.isnan(agg_data[agg]["Cancelled"]) else: assert_series_equal( @@ -317,7 +323,7 @@ def test_numeric_only_true_single_aggs(self): for agg in 
self.funcs + self.extended_funcs: result = getattr(oml_flights, agg)(numeric_only=True) assert result.dtype == np.dtype("float64") - assert result.shape == ((3,) if agg != "mad" else (2,)) + assert result.shape == ((3,) if agg != MEAN_ABSOLUTE_DEVIATION else (2,)) # check dtypes and shape of min, max and median for numeric_only=False | None @pytest.mark.parametrize("agg", ["min", "max", "median"]) diff --git a/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py b/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py index f5e4e8a96..ed975902d 100644 --- a/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py +++ b/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py @@ -22,6 +22,11 @@ # specific language governing permissions and limitations # under the License. +from opensearch_py_ml.constants import ( + MEAN_ABSOLUTE_DEVIATION, + STANDARD_DEVIATION, + VARIANCE, +) from opensearch_py_ml.operations import Operations @@ -31,9 +36,9 @@ def test_all_aggs(): "min", "max", "mean", - "std", - "var", - "mad", + STANDARD_DEVIATION, + VARIANCE, + MEAN_ABSOLUTE_DEVIATION, "count", "nunique", "median", @@ -69,7 +74,7 @@ def test_extended_stats_optimization(): os_aggs = Operations._map_pd_aggs_to_os_aggs(["count", "nunique"]) assert os_aggs == ["value_count", "cardinality"] - for pd_agg in ["var", "std"]: + for pd_agg in [VARIANCE, STANDARD_DEVIATION]: extended_os_agg = Operations._map_pd_aggs_to_os_aggs([pd_agg])[0] os_aggs = Operations._map_pd_aggs_to_os_aggs([pd_agg, "nunique"]) diff --git a/tests/series/test_describe_pytest.py b/tests/series/test_describe_pytest.py index b0ad65602..ca00f1330 100644 --- a/tests/series/test_describe_pytest.py +++ b/tests/series/test_describe_pytest.py @@ -24,6 +24,7 @@ import pandas as pd +from opensearch_py_ml.constants import STANDARD_DEVIATION from tests.common import TestData, assert_series_equal @@ -42,7 +43,7 @@ def test_series_describe(self): # Percentiles calculations vary for Elasticsearch assert_series_equal( - 
oml_desc[["count", "mean", "std", "min", "max"]], - pd_desc[["count", "mean", "std", "min", "max"]], + oml_desc[["count", "mean", STANDARD_DEVIATION, "min", "max"]], + pd_desc[["count", "mean", STANDARD_DEVIATION, "min", "max"]], rtol=0.2, ) diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index 55768deb1..011035b95 100644 --- a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -31,16 +31,30 @@ import pytest from pandas.testing import assert_series_equal +from opensearch_py_ml.constants import ( + MEAN_ABSOLUTE_DEVIATION, + STANDARD_DEVIATION, + VARIANCE, +) from opensearch_py_ml.utils import CustomFunctionDispatcher from tests.common import TestData, assert_almost_equal class TestSeriesMetrics(TestData): - all_funcs = ["max", "min", "mean", "sum", "nunique", "var", "std", "mad"] + all_funcs = [ + "max", + "min", + "mean", + "sum", + "nunique", + VARIANCE, + STANDARD_DEVIATION, + MEAN_ABSOLUTE_DEVIATION, + ] timestamp_funcs = ["max", "min", "mean", "nunique"] def assert_almost_equal_for_agg(self, func, pd_metric, oml_metric): - if func in ("nunique", "var", "mad"): + if func in ("nunique", VARIANCE, MEAN_ABSOLUTE_DEVIATION): np.testing.assert_almost_equal(pd_metric, oml_metric, decimal=-3) else: np.testing.assert_almost_equal(pd_metric, oml_metric, decimal=2) From d2203bea35ca07ead0c3d13a0c0af3ab458c317b Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Mon, 4 Nov 2024 19:02:22 +0500 Subject: [PATCH 12/20] refactor: move metric identifiers to constants.py for readability Signed-off-by: Yerzhaisang Taskali --- opensearch_py_ml/query_compiler.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/opensearch_py_ml/query_compiler.py b/opensearch_py_ml/query_compiler.py index f9f0d0130..c3aaa9d5d 100644 --- a/opensearch_py_ml/query_compiler.py +++ b/opensearch_py_ml/query_compiler.py @@ -41,7 +41,11 @@ import pandas as pd # type: ignore from opensearch_py_ml.common 
import opensearch_date_to_pandas_date -from opensearch_py_ml.constants import MEAN_ABSOLUTE_DEVIATION, VARIANCE +from opensearch_py_ml.constants import ( + MEAN_ABSOLUTE_DEVIATION, + STANDARD_DEVIATION, + VARIANCE, +) from opensearch_py_ml.field_mappings import FieldMappings from opensearch_py_ml.filter import BooleanFilter, QueryFilter from opensearch_py_ml.index import Index @@ -592,7 +596,9 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series: ) def std(self, numeric_only: Optional[bool] = None) -> pd.Series: - return self._operations._metric_agg_series(self, [], numeric_only=numeric_only) + return self._operations._metric_agg_series( + self, [STANDARD_DEVIATION], numeric_only=numeric_only + ) def mad(self, numeric_only: Optional[bool] = None) -> pd.Series: return self._operations._metric_agg_series( From c55639d022ddbdc4732291e6726b3b2c6a5a9bc1 Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Tue, 5 Nov 2024 14:00:14 +0500 Subject: [PATCH 13/20] Removed unused line Signed-off-by: Yerzhaisang Taskali --- tests/dataframe/test_groupby_pytest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index e88357759..282e3240b 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -125,7 +125,6 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna): oml_groupby = getattr(oml_flights.groupby("Cancelled", dropna=dropna), pd_agg)( numeric_only=True ) - pd_groupby = pd_groupby[oml_groupby.columns] # checking only values because dtypes are checked in aggs tests assert_frame_equal( From b1f2319db2b89af9395219317e0ea6579ec58665 Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Tue, 5 Nov 2024 21:47:34 +0500 Subject: [PATCH 14/20] clarify build_pd_series docstring Signed-off-by: Yerzhaisang Taskali --- opensearch_py_ml/common.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git 
a/opensearch_py_ml/common.py b/opensearch_py_ml/common.py index 287619df6..ece2aad24 100644 --- a/opensearch_py_ml/common.py +++ b/opensearch_py_ml/common.py @@ -60,9 +60,22 @@ def build_pd_series( index_name: Optional[str] = None, **kwargs: Any, ) -> pd.Series: - """Builds a pd.Series while squelching the warning - for unspecified dtype on empty series """ + Builds a pandas Series from a dictionary, optionally setting an index name. + + Parameters: + data : Dict[str, Any] + The data to build the Series from, with keys as the index. + dtype : Optional[DTypeLike] + The desired data type of the Series. If not specified, uses EMPTY_SERIES_DTYPE if data is empty. + index_name : Optional[str] + Name to assign to the Series index, similar to `index_name` in `value_counts`. + + Returns: + pd.Series + A pandas Series constructed from the given data, with the specified dtype and index name. + """ + dtype = dtype or (EMPTY_SERIES_DTYPE if not data else dtype) if dtype is not None: kwargs["dtype"] = dtype From 77c56a73716f68b9b9541958f195465ff2340f50 Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Tue, 5 Nov 2024 21:53:33 +0500 Subject: [PATCH 15/20] save constansts in utils.py Signed-off-by: Yerzhaisang Taskali --- opensearch_py_ml/constants.py | 28 ------------------- opensearch_py_ml/groupby.py | 2 +- opensearch_py_ml/operations.py | 2 +- opensearch_py_ml/query_compiler.py | 2 +- opensearch_py_ml/utils.py | 5 ++-- tests/dataframe/test_aggs_pytest.py | 2 +- tests/dataframe/test_groupby_pytest.py | 2 +- tests/dataframe/test_metrics_pytest.py | 2 +- .../test_map_pd_aggs_to_es_aggs_pytest.py | 2 +- tests/series/test_describe_pytest.py | 2 +- tests/series/test_metrics_pytest.py | 2 +- 11 files changed, 12 insertions(+), 39 deletions(-) delete mode 100644 opensearch_py_ml/constants.py diff --git a/opensearch_py_ml/constants.py b/opensearch_py_ml/constants.py deleted file mode 100644 index e916d52b3..000000000 --- a/opensearch_py_ml/constants.py +++ /dev/null @@ -1,28 
+0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# The OpenSearch Contributors require contributions made to -# this file be licensed under the Apache-2.0 license or a -# compatible open source license. -# Any modifications Copyright OpenSearch Contributors. See -# GitHub history for details. - - -# Licensed to Elasticsearch B.V. under one or more contributor -# license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright -# ownership. Elasticsearch B.V. licenses this file to you under -# the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -MEAN_ABSOLUTE_DEVIATION = "mad" -VARIANCE = "var" -STANDARD_DEVIATION = "std" diff --git a/opensearch_py_ml/groupby.py b/opensearch_py_ml/groupby.py index 248df0c64..2dcef9c7a 100644 --- a/opensearch_py_ml/groupby.py +++ b/opensearch_py_ml/groupby.py @@ -25,7 +25,7 @@ from typing import TYPE_CHECKING, List, Optional, Union -from opensearch_py_ml.constants import ( +from opensearch_py_ml.utils import ( MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE, diff --git a/opensearch_py_ml/operations.py b/opensearch_py_ml/operations.py index b749562fa..efc61b36b 100644 --- a/opensearch_py_ml/operations.py +++ b/opensearch_py_ml/operations.py @@ -52,7 +52,7 @@ build_pd_series, opensearch_date_to_pandas_date, ) -from opensearch_py_ml.constants import ( +from opensearch_py_ml.utils import ( MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE, diff --git a/opensearch_py_ml/query_compiler.py b/opensearch_py_ml/query_compiler.py index c3aaa9d5d..9a0ee1836 100644 --- a/opensearch_py_ml/query_compiler.py +++ b/opensearch_py_ml/query_compiler.py @@ -41,7 +41,7 @@ import pandas as pd # type: ignore from opensearch_py_ml.common import opensearch_date_to_pandas_date -from opensearch_py_ml.constants import ( +from opensearch_py_ml.utils import ( MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE, diff --git a/opensearch_py_ml/utils.py b/opensearch_py_ml/utils.py index af1204b27..00a5dada2 100644 --- a/opensearch_py_ml/utils.py +++ b/opensearch_py_ml/utils.py @@ -32,10 +32,11 @@ import pandas as pd # type: ignore from pandas.core.dtypes.common import is_list_like # type: ignore -from opensearch_py_ml.constants import MEAN_ABSOLUTE_DEVIATION - RT = TypeVar("RT") +MEAN_ABSOLUTE_DEVIATION = "mad" +VARIANCE = "var" +STANDARD_DEVIATION = "std" def deprecated_api( replace_with: str, diff --git a/tests/dataframe/test_aggs_pytest.py b/tests/dataframe/test_aggs_pytest.py index dec2892ce..748a3dfd3 100644 --- a/tests/dataframe/test_aggs_pytest.py +++ 
b/tests/dataframe/test_aggs_pytest.py @@ -28,7 +28,7 @@ import pytest from pandas.testing import assert_frame_equal, assert_series_equal -from opensearch_py_ml.constants import STANDARD_DEVIATION, VARIANCE +from opensearch_py_ml.utils import STANDARD_DEVIATION, VARIANCE from tests.common import TestData diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index 282e3240b..911550515 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -28,7 +28,7 @@ import pytest from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal -from opensearch_py_ml.constants import ( +from opensearch_py_ml.utils import ( MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE, diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index 25f3cc060..00901a40f 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -27,7 +27,7 @@ import pytest from pandas.testing import assert_frame_equal, assert_series_equal -from opensearch_py_ml.constants import ( +from opensearch_py_ml.utils import ( MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE, diff --git a/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py b/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py index ed975902d..7acb2e7e4 100644 --- a/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py +++ b/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py @@ -22,7 +22,7 @@ # specific language governing permissions and limitations # under the License. 
-from opensearch_py_ml.constants import ( +from opensearch_py_ml.utils import ( MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE, diff --git a/tests/series/test_describe_pytest.py b/tests/series/test_describe_pytest.py index ca00f1330..0841253d4 100644 --- a/tests/series/test_describe_pytest.py +++ b/tests/series/test_describe_pytest.py @@ -24,7 +24,7 @@ import pandas as pd -from opensearch_py_ml.constants import STANDARD_DEVIATION +from opensearch_py_ml.utils import STANDARD_DEVIATION from tests.common import TestData, assert_series_equal diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index 011035b95..35bf910f9 100644 --- a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -31,7 +31,7 @@ import pytest from pandas.testing import assert_series_equal -from opensearch_py_ml.constants import ( +from opensearch_py_ml.utils import ( MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE, From 25905b6d9b483efc43eff4f416267bd72d1d5a1b Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Tue, 5 Nov 2024 21:56:40 +0500 Subject: [PATCH 16/20] fixed CI Signed-off-by: Yerzhaisang Taskali --- opensearch_py_ml/groupby.py | 6 +----- opensearch_py_ml/operations.py | 6 +----- opensearch_py_ml/query_compiler.py | 6 +----- opensearch_py_ml/utils.py | 1 + tests/dataframe/test_groupby_pytest.py | 2 +- tests/dataframe/test_metrics_pytest.py | 5 ++--- tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py | 6 +----- tests/series/test_metrics_pytest.py | 2 +- 8 files changed, 9 insertions(+), 25 deletions(-) diff --git a/opensearch_py_ml/groupby.py b/opensearch_py_ml/groupby.py index 2dcef9c7a..ea2083485 100644 --- a/opensearch_py_ml/groupby.py +++ b/opensearch_py_ml/groupby.py @@ -25,12 +25,8 @@ from typing import TYPE_CHECKING, List, Optional, Union -from opensearch_py_ml.utils import ( - MEAN_ABSOLUTE_DEVIATION, - STANDARD_DEVIATION, - VARIANCE, -) from opensearch_py_ml.query_compiler import QueryCompiler
+from opensearch_py_ml.utils import MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE if TYPE_CHECKING: import pandas as pd # type: ignore diff --git a/opensearch_py_ml/operations.py b/opensearch_py_ml/operations.py index efc61b36b..fa45b742e 100644 --- a/opensearch_py_ml/operations.py +++ b/opensearch_py_ml/operations.py @@ -52,11 +52,6 @@ build_pd_series, opensearch_date_to_pandas_date, ) -from opensearch_py_ml.utils import ( - MEAN_ABSOLUTE_DEVIATION, - STANDARD_DEVIATION, - VARIANCE, -) from opensearch_py_ml.index import Index from opensearch_py_ml.query import Query from opensearch_py_ml.tasks import ( @@ -70,6 +65,7 @@ SizeTask, TailTask, ) +from opensearch_py_ml.utils import MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE if TYPE_CHECKING: from numpy.typing import DTypeLike diff --git a/opensearch_py_ml/query_compiler.py b/opensearch_py_ml/query_compiler.py index 9a0ee1836..735b0307c 100644 --- a/opensearch_py_ml/query_compiler.py +++ b/opensearch_py_ml/query_compiler.py @@ -41,15 +41,11 @@ import pandas as pd # type: ignore from opensearch_py_ml.common import opensearch_date_to_pandas_date -from opensearch_py_ml.utils import ( - MEAN_ABSOLUTE_DEVIATION, - STANDARD_DEVIATION, - VARIANCE, -) from opensearch_py_ml.field_mappings import FieldMappings from opensearch_py_ml.filter import BooleanFilter, QueryFilter from opensearch_py_ml.index import Index from opensearch_py_ml.operations import Operations +from opensearch_py_ml.utils import MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE if TYPE_CHECKING: from opensearchpy import OpenSearch diff --git a/opensearch_py_ml/utils.py b/opensearch_py_ml/utils.py index 00a5dada2..e850d2724 100644 --- a/opensearch_py_ml/utils.py +++ b/opensearch_py_ml/utils.py @@ -38,6 +38,7 @@ VARIANCE = "var" STANDARD_DEVIATION = "std" + def deprecated_api( replace_with: str, ) -> Callable[[Callable[..., RT]], Callable[..., RT]]: diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py 
index 911550515..89938b110 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -32,8 +32,8 @@ MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE, + CustomFunctionDispatcher, ) -from opensearch_py_ml.utils import CustomFunctionDispatcher from tests.common import TestData diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index 00901a40f..e334643b3 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -27,14 +27,13 @@ import pytest from pandas.testing import assert_frame_equal, assert_series_equal +# File called _pytest for PyCharm compatibility from opensearch_py_ml.utils import ( MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE, + CustomFunctionDispatcher, ) - -# File called _pytest for PyCharm compatibility -from opensearch_py_ml.utils import CustomFunctionDispatcher from tests.common import TestData, assert_almost_equal diff --git a/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py b/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py index 7acb2e7e4..c030eb890 100644 --- a/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py +++ b/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py @@ -22,12 +22,8 @@ # specific language governing permissions and limitations # under the License. 
-from opensearch_py_ml.utils import ( - MEAN_ABSOLUTE_DEVIATION, - STANDARD_DEVIATION, - VARIANCE, -) from opensearch_py_ml.operations import Operations +from opensearch_py_ml.utils import MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE def test_all_aggs(): diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index 35bf910f9..bc3330eb2 100644 --- a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -35,8 +35,8 @@ MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE, + CustomFunctionDispatcher, ) -from opensearch_py_ml.utils import CustomFunctionDispatcher from tests.common import TestData, assert_almost_equal From feeb0b56a476113f6645530d11d9d58884c9254c Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Tue, 5 Nov 2024 22:53:00 +0500 Subject: [PATCH 17/20] added keyword argument to to_csv method Signed-off-by: Yerzhaisang Taskali --- opensearch_py_ml/dataframe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opensearch_py_ml/dataframe.py b/opensearch_py_ml/dataframe.py index 9b0370b98..47f89872e 100644 --- a/opensearch_py_ml/dataframe.py +++ b/opensearch_py_ml/dataframe.py @@ -1331,6 +1331,7 @@ def to_csv( compression="infer", quoting=None, quotechar='"', + lineterminator=None, chunksize=None, tupleize_cols=None, date_format=None, @@ -1359,6 +1360,7 @@ def to_csv( "compression": compression, "quoting": quoting, "quotechar": quotechar, + "lineterminator": lineterminator, "chunksize": chunksize, "date_format": date_format, "doublequote": doublequote, From 116260837d07549922e9e209dc1a85fccdda1cd4 Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Tue, 5 Nov 2024 23:22:28 +0500 Subject: [PATCH 18/20] added comment to describe method Signed-off-by: Yerzhaisang Taskali --- opensearch_py_ml/operations.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/opensearch_py_ml/operations.py b/opensearch_py_ml/operations.py index fa45b742e..0b130c36b 100644 --- 
a/opensearch_py_ml/operations.py +++ b/opensearch_py_ml/operations.py @@ -1229,11 +1229,18 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: df = pd.concat([df1, df2]) + # Note: In recent pandas versions, `describe()` returns a different index order + # for one-column DataFrames compared to multi-column DataFrames. + # We adjust the order manually to ensure consistency. if df.shape[1] == 1: + # For single-column DataFrames, `describe()` typically outputs: + # ["count", "mean", "std", "min", "25%", "50%", "75%", "max"] return df.reindex( ["count", "mean", STANDARD_DEVIATION, "min", "25%", "50%", "75%", "max"] ) + # For multi-column DataFrames, `describe()` typically outputs: + # ["count", "mean", "min", "25%", "50%", "75%", "max", "std"] return df.reindex( ["count", "mean", "min", "25%", "50%", "75%", "max", STANDARD_DEVIATION] ) From dce7c744671b51625b107a34d27157a58fc11fdb Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Tue, 3 Dec 2024 21:01:32 +0500 Subject: [PATCH 19/20] possible pandas versions Signed-off-by: Yerzhaisang Taskali --- docs/requirements-docs.txt | 2 +- requirements-dev.txt | 2 +- requirements.txt | 2 +- setup.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 5ae7950de..690105451 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,5 +1,5 @@ opensearch-py>=2 -pandas==2.0.3 +pandas>=2.0,<2.1 matplotlib>=3.6.0,<4 nbval sphinx diff --git a/requirements-dev.txt b/requirements-dev.txt index 6bbc817b0..5b08178fa 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # # Basic requirements # -pandas==2.0.3 +pandas>=2.0,<2.1 matplotlib>=3.6.2,<4 numpy>=1.24.0,<2 opensearch-py>=2.2.0 diff --git a/requirements.txt b/requirements.txt index 8af3ac141..babbd128d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # # Basic requirements # -pandas==2.0.3 +pandas>=2.0,<2.1 
matplotlib>=3.6.2,<4 numpy>=1.24.0,<2 opensearch-py>=2.2.0 diff --git a/setup.py b/setup.py index f98028f3d..724b4a9db 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ }, install_requires=[ "opensearch-py>=2", - "pandas==2.0.3", + "pandas>=2.0,<2.1", "matplotlib>=3.6.0,<4", "numpy>=1.24.0,<2", "deprecated>=1.2.14,<2", From 8665e5968950712afd0e789dee7d41b82b7d26fa Mon Sep 17 00:00:00 2001 From: Yerzhaisang Taskali Date: Mon, 9 Dec 2024 16:43:08 +0500 Subject: [PATCH 20/20] wide range of pandas versions Signed-off-by: Yerzhaisang Taskali --- docs/requirements-docs.txt | 2 +- requirements-dev.txt | 2 +- requirements.txt | 2 +- setup.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 690105451..1fc7e4c8d 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,5 +1,5 @@ opensearch-py>=2 -pandas>=2.0,<2.1 +pandas>=1.5.2,<2.1 matplotlib>=3.6.0,<4 nbval sphinx diff --git a/requirements-dev.txt b/requirements-dev.txt index 5b08178fa..b3dacfcc3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # # Basic requirements # -pandas>=2.0,<2.1 +pandas>=1.5.2,<2.1 matplotlib>=3.6.2,<4 numpy>=1.24.0,<2 opensearch-py>=2.2.0 diff --git a/requirements.txt b/requirements.txt index babbd128d..cee732a8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # # Basic requirements # -pandas>=2.0,<2.1 +pandas>=1.5.2,<2.1 matplotlib>=3.6.2,<4 numpy>=1.24.0,<2 opensearch-py>=2.2.0 diff --git a/setup.py b/setup.py index 724b4a9db..c0271135d 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ }, install_requires=[ "opensearch-py>=2", - "pandas>=2.0,<2.1", + "pandas>=1.5,<2.1", "matplotlib>=3.6.0,<4", "numpy>=1.24.0,<2", "deprecated>=1.2.14,<2",