-
Notifications
You must be signed in to change notification settings - Fork 64
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bump pandas from 1.5.3 to 2.0.3 #422
base: main
Are you sure you want to change the base?
Changes from 2 commits
fb4f0d6
06076d9
3ef1e2c
da14ee5
c68b7f8
0084cce
12bacb7
77ebf1c
45e793a
c4455e0
7135507
d2203be
c55639d
b1f2319
77c56a7
25905b6
feeb0b5
1162608
dce7c74
8665e59
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
opensearch-py>=2 | ||
pandas>=1.5,<3 | ||
pandas==2.0.3 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we upgrade to a more recent version? Any reason specifically for 2.0.3? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bumping to a later version introduces datatype issues, including an ImportError like this: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure, let's focus on bumping to |
||
matplotlib>=3.6.0,<4 | ||
nbval | ||
sphinx | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,14 +55,20 @@ | |
|
||
|
||
def build_pd_series( | ||
data: Dict[str, Any], dtype: Optional["DTypeLike"] = None, **kwargs: Any | ||
data: Dict[str, Any], | ||
dtype: Optional["DTypeLike"] = None, | ||
index_name: Optional[str] = None, | ||
**kwargs: Any, | ||
) -> pd.Series: | ||
"""Builds a pd.Series while squelching the warning | ||
for unspecified dtype on empty series | ||
""" | ||
dtype = dtype or (EMPTY_SERIES_DTYPE if not data else dtype) | ||
if dtype is not None: | ||
kwargs["dtype"] = dtype | ||
if index_name: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lets keep an explicit check for None - what happens if the index is not found? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Done |
||
index = pd.Index(data.keys(), name=index_name) | ||
kwargs["index"] = index | ||
return pd.Series(data, **kwargs) | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -424,9 +424,36 @@ def drop( | |
axis = pd.DataFrame._get_axis_name(axis) | ||
axes = {axis: labels} | ||
elif index is not None or columns is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Kind of confused here: the parent branch is checking that one of them is not None, but inside it's checking again. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed |
||
axes, _ = pd.DataFrame()._construct_axes_from_arguments( | ||
(index, columns), {} | ||
) | ||
# axes, _ = pd.DataFrame()._construct_axes_from_arguments( | ||
# (index, columns), {} | ||
# ) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can remove these comments if no longer used There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
axes = {} | ||
if index is not None: | ||
if isinstance(index, pd.Index): | ||
index = index.tolist() # Convert Index to list | ||
elif not is_list_like(index): | ||
index = [index] # Convert to list if it's not list-like already | ||
axes["index"] = index | ||
else: | ||
axes["index"] = None | ||
|
||
if columns is not None: | ||
if isinstance(columns, pd.Index): | ||
columns = columns.tolist() # Convert Index to list | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. columns to list There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
elif not is_list_like(columns): | ||
columns = [columns] # Convert to list if it's not list-like already | ||
axes["columns"] = columns | ||
else: | ||
axes["columns"] = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can this be wrapped in a method? and then we can do something like this
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
||
if columns is not None: | ||
if not is_list_like(columns): | ||
columns = [columns] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. repeated logic? this is handled in lines 443 and 444 right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
axes["columns"] = ( | ||
pd.Index(columns) if isinstance(columns, list) else columns | ||
) | ||
else: | ||
axes["columns"] = None | ||
else: | ||
raise ValueError( | ||
"Need to specify at least one of 'labels', 'index' or 'columns'" | ||
|
@@ -440,7 +467,7 @@ def drop( | |
axes["index"] = [axes["index"]] | ||
if errors == "raise": | ||
# Check if axes['index'] values exists in index | ||
count = self._query_compiler._index_matches_count(axes["index"]) | ||
count = self._query_compiler._index_matches_count(list(axes["index"])) | ||
if count != len(axes["index"]): | ||
raise ValueError( | ||
f"number of labels {count}!={len(axes['index'])} not contained in axis" | ||
|
@@ -1326,7 +1353,6 @@ def to_csv( | |
compression="infer", | ||
quoting=None, | ||
quotechar='"', | ||
line_terminator=None, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are we removing this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I restored it as lineterminator to align with recent pandas updates, but it’s still not actively used elsewhere in the code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we keep as it is: |
||
chunksize=None, | ||
tupleize_cols=None, | ||
date_format=None, | ||
|
@@ -1355,7 +1381,6 @@ def to_csv( | |
"compression": compression, | ||
"quoting": quoting, | ||
"quotechar": quotechar, | ||
"line_terminator": line_terminator, | ||
"chunksize": chunksize, | ||
"date_format": date_format, | ||
"doublequote": doublequote, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -475,7 +475,7 @@ def _terms_aggs( | |
except IndexError: | ||
name = None | ||
|
||
return build_pd_series(results, name=name) | ||
return build_pd_series(results, index_name=name, name="count") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How come its using "count" here but before it was using name? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In pandas 2.0.3, a change in the value_counts method resulted in the following behavior: The method now uses "count" as the name for the values column, while the original column name (e.g., "Carrier") is used for the index name. This differs from earlier versions, where the values column would inherit the name of the original column. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey @dhrubo-os what are your thoughts? Thanks for the info @Yerzhaisang There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Yerzhaisang could you please share any documentation about the changing behavior of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In pandas 1.5.3, the series name is used for the result series. In pandas 2.0.3, the series name is used for the index name, while the result series name is set to this one. |
||
|
||
def _hist_aggs( | ||
self, query_compiler: "QueryCompiler", num_bins: int | ||
|
@@ -1205,7 +1205,7 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: | |
|
||
df1 = self.aggs( | ||
query_compiler=query_compiler, | ||
pd_aggs=["count", "mean", "std", "min", "max"], | ||
pd_aggs=["count", "mean", "min", "max", "std"], | ||
numeric_only=True, | ||
) | ||
df2 = self.quantile( | ||
|
@@ -1219,9 +1219,14 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: | |
# Convert [.25,.5,.75] to ["25%", "50%", "75%"] | ||
df2 = df2.set_index([["25%", "50%", "75%"]]) | ||
|
||
return pd.concat([df1, df2]).reindex( | ||
["count", "mean", "std", "min", "25%", "50%", "75%", "max"] | ||
) | ||
df = pd.concat([df1, df2]) | ||
|
||
if df.shape[1] == 1: | ||
return df.reindex( | ||
["count", "mean", "std", "min", "25%", "50%", "75%", "max"] | ||
) | ||
|
||
return df.reindex(["count", "mean", "min", "25%", "50%", "75%", "max", "std"]) | ||
|
||
def to_pandas( | ||
self, query_compiler: "QueryCompiler", show_progress: bool = False | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -312,11 +312,12 @@ def value_counts(self, os_size: int = 10) -> pd.Series: | |
|
||
>>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') | ||
>>> df['Carrier'].value_counts() | ||
Carrier | ||
Logstash Airways 3331 | ||
JetBeats 3274 | ||
Kibana Airlines 3234 | ||
ES-Air 3220 | ||
Name: Carrier, dtype: int64 | ||
Name: count, dtype: int64 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is what we don't want, right? Carrier is the column name which we are changing it to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey @dhrubo, this is actually correct. In pandas 2.0.3, Carrier is set as the index name, and count is the column name. |
||
""" | ||
if not isinstance(os_size, int): | ||
raise TypeError("os_size must be a positive integer.") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,7 +34,7 @@ def test_flights_describe(self): | |
pd_flights = self.pd_flights() | ||
oml_flights = self.oml_flights() | ||
|
||
pd_describe = pd_flights.describe() | ||
pd_describe = pd_flights.describe().drop(["timestamp"], axis=1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are we removing this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
# We remove bool columns to match pandas output | ||
oml_describe = oml_flights.describe().drop( | ||
["Cancelled", "FlightDelay"], axis="columns" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -106,10 +106,18 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna): | |
pd_flights = self.pd_flights().filter(self.filter_data) | ||
oml_flights = self.oml_flights().filter(self.filter_data) | ||
|
||
pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)() | ||
if pd_agg == "mad": | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it possible to extract these variables like "mad", "var", "std" as constants? (It can remove the friction for newcomers.) For example, let MEAN_ABSOLUTE_DEVIATION = "mad". Not sure if that would take a lot of effort, but something to think about. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done |
||
pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).agg( | ||
lambda x: (x - x.mean()).abs().mean() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see this lambda gets used in other places; could we possibly have a util class that dispatches these common function names? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed |
||
) | ||
else: | ||
pd_groupby = getattr( | ||
pd_flights.groupby("Cancelled", dropna=dropna), pd_agg | ||
)() | ||
oml_groupby = getattr(oml_flights.groupby("Cancelled", dropna=dropna), pd_agg)( | ||
numeric_only=True | ||
) | ||
pd_groupby = pd_groupby[oml_groupby.columns] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need this? We shouldn't use any oml resource/info in pd as the goal is to how pd functionalities are same to oml functionality. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, fixed |
||
|
||
# checking only values because dtypes are checked in aggs tests | ||
assert_frame_equal( | ||
|
@@ -224,14 +232,32 @@ def test_groupby_dataframe_mad(self): | |
pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"]) | ||
oml_flights = self.oml_flights().filter(self.filter_data + ["DestCountry"]) | ||
|
||
pd_mad = pd_flights.groupby("DestCountry").mad() | ||
pd_mad = pd_flights.groupby("DestCountry").apply( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we get some idea from this PR: https://github.com/elastic/eland/pull/602/files ? Also Shouldn't we keep BWC in mind? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe we don’t need to worry about backward compatibility, as we haven’t modified our built-in methods. We only customized the deprecated pandas methods within the tests. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What I was wondering currently we are doing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. got it, let me do some research There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dhrubo-os thank you for this good point. Yeah, we shouldn't force customers to install only one version. |
||
lambda x: x.select_dtypes(include="number").apply( | ||
lambda x: (x - x.mean()).abs().mean() | ||
) | ||
) | ||
|
||
# Re-merge non-numeric columns back, with suffixes to avoid column overlap | ||
non_numeric_columns = ( | ||
pd_flights.select_dtypes(exclude="number").groupby("DestCountry").first() | ||
) | ||
pd_mad = pd_mad.join( | ||
non_numeric_columns, lsuffix="_numeric", rsuffix="_non_numeric" | ||
)[self.filter_data] | ||
if "Cancelled" in pd_mad.columns: | ||
pd_mad["Cancelled"] = pd_mad["Cancelled"].astype(float) | ||
oml_mad = oml_flights.groupby("DestCountry").mad() | ||
|
||
assert_index_equal(pd_mad.columns, oml_mad.columns) | ||
assert_index_equal(pd_mad.index, oml_mad.index) | ||
assert_series_equal(pd_mad.dtypes, oml_mad.dtypes) | ||
|
||
pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"]) | ||
pd_min_mad = pd_flights.groupby("DestCountry").agg( | ||
["min", lambda x: (x - x.median()).abs().mean()] | ||
) | ||
|
||
pd_min_mad.columns = pd_min_mad.columns.set_levels(["min", "mad"], level=1) | ||
oml_min_mad = oml_flights.groupby("DestCountry").aggregate(["min", "mad"]) | ||
|
||
assert_index_equal(pd_min_mad.columns, oml_min_mad.columns) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,9 +81,10 @@ def test_flights_extended_metrics(self): | |
logger.setLevel(logging.DEBUG) | ||
|
||
for func in self.extended_funcs: | ||
pd_metric = getattr(pd_flights, func)( | ||
**({"numeric_only": True} if func != "mad" else {}) | ||
) | ||
if func == "mad": | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I noticed you introduced branching. What if some day in the future we need to reimplement another function from scratch? Instead of creating a new branch every time, can we extract this behavior out? Perhaps have a utility class with our own custom implementation like
and then dispatch it instead of branching for every new method we need to reimplement — something like: if func in customFunctionMap. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done |
||
pd_metric = (pd_flights - pd_flights.mean()).abs().mean() | ||
else: | ||
pd_metric = getattr(pd_flights, func)(**({"numeric_only": True})) | ||
oml_metric = getattr(oml_flights, func)(numeric_only=True) | ||
|
||
pd_value = pd_metric["AvgTicketPrice"] | ||
|
@@ -101,7 +102,10 @@ def test_flights_extended_metrics_nan(self): | |
] | ||
|
||
for func in self.extended_funcs: | ||
pd_metric = getattr(pd_flights_1, func)() | ||
if func == "mad": | ||
pd_metric = (pd_flights_1 - pd_flights_1.mean()).abs().mean() | ||
else: | ||
pd_metric = getattr(pd_flights_1, func)() | ||
oml_metric = getattr(oml_flights_1, func)(numeric_only=False) | ||
|
||
assert_series_equal(pd_metric, oml_metric, check_exact=False) | ||
|
@@ -111,7 +115,10 @@ def test_flights_extended_metrics_nan(self): | |
oml_flights_0 = oml_flights[oml_flights.FlightNum == "XXX"][["AvgTicketPrice"]] | ||
|
||
for func in self.extended_funcs: | ||
pd_metric = getattr(pd_flights_0, func)() | ||
if func == "mad": | ||
pd_metric = (pd_flights_0 - pd_flights_0.mean()).abs().mean() | ||
else: | ||
pd_metric = getattr(pd_flights_0, func)() | ||
oml_metric = getattr(oml_flights_0, func)(numeric_only=False) | ||
|
||
assert_series_equal(pd_metric, oml_metric, check_exact=False) | ||
|
@@ -498,7 +505,8 @@ def test_flights_agg_quantile(self, numeric_only): | |
["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"] | ||
) | ||
|
||
pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only) | ||
pd_quantile = pd_flights.agg([lambda x: x.quantile(0.5), lambda x: x.min()]) | ||
pd_quantile.index = ["quantile", "min"] | ||
oml_quantile = oml_flights.agg(["quantile", "min"], numeric_only=numeric_only) | ||
|
||
assert_frame_equal( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
by*
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed