-
Notifications
You must be signed in to change notification settings - Fork 64
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bump pandas from 1.5.3 to 2.0.3 #422
base: main
Are you sure you want to change the base?
Changes from 2 commits
fb4f0d6
06076d9
3ef1e2c
da14ee5
c68b7f8
0084cce
12bacb7
77ebf1c
45e793a
c4455e0
7135507
d2203be
c55639d
b1f2319
77c56a7
25905b6
feeb0b5
1162608
dce7c74
8665e59
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
opensearch-py>=2 | ||
pandas>=1.5,<3 | ||
pandas==2.0.3 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we upgrade to a more recent version? Any reason specifically for 2.0.3? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bumping to a later version introduces datatype issues, including an ImportError like this: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure, let's focus on bumping to |
||
matplotlib>=3.6.0,<4 | ||
nbval | ||
sphinx | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,14 +55,20 @@ | |
|
||
|
||
def build_pd_series( | ||
data: Dict[str, Any], dtype: Optional["DTypeLike"] = None, **kwargs: Any | ||
data: Dict[str, Any], | ||
dtype: Optional["DTypeLike"] = None, | ||
index_name: Optional[str] = None, | ||
**kwargs: Any, | ||
) -> pd.Series: | ||
"""Builds a pd.Series while squelching the warning | ||
for unspecified dtype on empty series | ||
""" | ||
dtype = dtype or (EMPTY_SERIES_DTYPE if not data else dtype) | ||
if dtype is not None: | ||
kwargs["dtype"] = dtype | ||
if index_name: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lets keep an explicit check for None - what happens if the index is not found? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Done |
||
index = pd.Index(data.keys(), name=index_name) | ||
kwargs["index"] = index | ||
return pd.Series(data, **kwargs) | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -424,9 +424,36 @@ def drop( | |
axis = pd.DataFrame._get_axis_name(axis) | ||
axes = {axis: labels} | ||
elif index is not None or columns is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Kind of confused here: the parent branch is checking that one of them is not None, but inside it's checking again. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed |
||
axes, _ = pd.DataFrame()._construct_axes_from_arguments( | ||
(index, columns), {} | ||
) | ||
# axes, _ = pd.DataFrame()._construct_axes_from_arguments( | ||
# (index, columns), {} | ||
# ) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can remove these comments if no longer used There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
axes = {} | ||
if index is not None: | ||
if isinstance(index, pd.Index): | ||
index = index.tolist() # Convert Index to list | ||
elif not is_list_like(index): | ||
index = [index] # Convert to list if it's not list-like already | ||
axes["index"] = index | ||
else: | ||
axes["index"] = None | ||
|
||
if columns is not None: | ||
if isinstance(columns, pd.Index): | ||
columns = columns.tolist() # Convert Index to list | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. columns to list There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
elif not is_list_like(columns): | ||
columns = [columns] # Convert to list if it's not list-like already | ||
axes["columns"] = columns | ||
else: | ||
axes["columns"] = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can this be wrapped in a method? and then we can do something like this
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
||
if columns is not None: | ||
if not is_list_like(columns): | ||
columns = [columns] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. repeated logic? this is handled in lines 443 and 444 right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
axes["columns"] = ( | ||
pd.Index(columns) if isinstance(columns, list) else columns | ||
) | ||
else: | ||
axes["columns"] = None | ||
else: | ||
raise ValueError( | ||
"Need to specify at least one of 'labels', 'index' or 'columns'" | ||
|
@@ -440,7 +467,7 @@ def drop( | |
axes["index"] = [axes["index"]] | ||
if errors == "raise": | ||
# Check if axes['index'] values exists in index | ||
count = self._query_compiler._index_matches_count(axes["index"]) | ||
count = self._query_compiler._index_matches_count(list(axes["index"])) | ||
if count != len(axes["index"]): | ||
raise ValueError( | ||
f"number of labels {count}!={len(axes['index'])} not contained in axis" | ||
|
@@ -1326,7 +1353,6 @@ def to_csv( | |
compression="infer", | ||
quoting=None, | ||
quotechar='"', | ||
line_terminator=None, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are we removing this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I restored it as lineterminator to align with recent pandas updates, but it’s still not actively used elsewhere in the code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we keep as it is: |
||
chunksize=None, | ||
tupleize_cols=None, | ||
date_format=None, | ||
|
@@ -1355,7 +1381,6 @@ def to_csv( | |
"compression": compression, | ||
"quoting": quoting, | ||
"quotechar": quotechar, | ||
"line_terminator": line_terminator, | ||
"chunksize": chunksize, | ||
"date_format": date_format, | ||
"doublequote": doublequote, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -475,7 +475,7 @@ def _terms_aggs( | |
except IndexError: | ||
name = None | ||
|
||
return build_pd_series(results, name=name) | ||
return build_pd_series(results, index_name=name, name="count") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How come its using "count" here but before it was using name? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In pandas 2.0.3, a change in the value_counts method resulted in the following behavior: The method now uses "count" as the name for the values column, while the original column name (e.g., "Carrier") is used for the index name. This differs from earlier versions, where the values column would inherit the name of the original column. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey @dhrubo-os what are your thoughts? Thanks for the info @Yerzhaisang There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Yerzhaisang could you please share any documentation about the changing behavior of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In pandas 1.5.3, the series name is used for the result series. In pandas 2.0.3, the series name is used for the index name, while the result series name is set to this one. |
||
|
||
def _hist_aggs( | ||
self, query_compiler: "QueryCompiler", num_bins: int | ||
|
@@ -1205,7 +1205,7 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: | |
|
||
df1 = self.aggs( | ||
query_compiler=query_compiler, | ||
pd_aggs=["count", "mean", "std", "min", "max"], | ||
pd_aggs=["count", "mean", "min", "max", "std"], | ||
numeric_only=True, | ||
) | ||
df2 = self.quantile( | ||
|
@@ -1219,9 +1219,14 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: | |
# Convert [.25,.5,.75] to ["25%", "50%", "75%"] | ||
df2 = df2.set_index([["25%", "50%", "75%"]]) | ||
|
||
return pd.concat([df1, df2]).reindex( | ||
["count", "mean", "std", "min", "25%", "50%", "75%", "max"] | ||
) | ||
df = pd.concat([df1, df2]) | ||
|
||
if df.shape[1] == 1: | ||
return df.reindex( | ||
["count", "mean", "std", "min", "25%", "50%", "75%", "max"] | ||
) | ||
|
||
return df.reindex(["count", "mean", "min", "25%", "50%", "75%", "max", "std"]) | ||
|
||
def to_pandas( | ||
self, query_compiler: "QueryCompiler", show_progress: bool = False | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -312,11 +312,12 @@ def value_counts(self, os_size: int = 10) -> pd.Series: | |
|
||
>>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') | ||
>>> df['Carrier'].value_counts() | ||
Carrier | ||
Logstash Airways 3331 | ||
JetBeats 3274 | ||
Kibana Airlines 3234 | ||
ES-Air 3220 | ||
Name: Carrier, dtype: int64 | ||
Name: count, dtype: int64 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is what we don't want, right? Carrier is the column name which we are changing it to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey @dhrubo, this is actually correct. In pandas 2.0.3, Carrier is set as the index name, and count is the column name. |
||
""" | ||
if not isinstance(os_size, int): | ||
raise TypeError("os_size must be a positive integer.") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,7 +34,7 @@ def test_flights_describe(self): | |
pd_flights = self.pd_flights() | ||
oml_flights = self.oml_flights() | ||
|
||
pd_describe = pd_flights.describe() | ||
pd_describe = pd_flights.describe().drop(["timestamp"], axis=1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are we removing this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
# We remove bool columns to match pandas output | ||
oml_describe = oml_flights.describe().drop( | ||
["Cancelled", "FlightDelay"], axis="columns" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -106,10 +106,18 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna): | |
pd_flights = self.pd_flights().filter(self.filter_data) | ||
oml_flights = self.oml_flights().filter(self.filter_data) | ||
|
||
pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)() | ||
if pd_agg == "mad": | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it possible to extract these variables like "mad", "var", "std" as constants? (It can remove the friction for newcomers.) For example, let MEAN_ABSOLUTE_DEVIATION = "mad". Not sure if that would take a lot of effort, but something to think about. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done |
||
pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).agg( | ||
lambda x: (x - x.mean()).abs().mean() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see this lambda gets used in other places; could we possibly have a util class that dispatches these common function names? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed |
||
) | ||
else: | ||
pd_groupby = getattr( | ||
pd_flights.groupby("Cancelled", dropna=dropna), pd_agg | ||
)() | ||
oml_groupby = getattr(oml_flights.groupby("Cancelled", dropna=dropna), pd_agg)( | ||
numeric_only=True | ||
) | ||
pd_groupby = pd_groupby[oml_groupby.columns] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need this? We shouldn't use any oml resource/info in pd as the goal is to how pd functionalities are same to oml functionality. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, fixed |
||
|
||
# checking only values because dtypes are checked in aggs tests | ||
assert_frame_equal( | ||
|
@@ -224,14 +232,32 @@ def test_groupby_dataframe_mad(self): | |
pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"]) | ||
oml_flights = self.oml_flights().filter(self.filter_data + ["DestCountry"]) | ||
|
||
pd_mad = pd_flights.groupby("DestCountry").mad() | ||
pd_mad = pd_flights.groupby("DestCountry").apply( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we get some idea from this PR: https://github.com/elastic/eland/pull/602/files ? Also Shouldn't we keep BWC in mind? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe we don’t need to worry about backward compatibility, as we haven’t modified our built-in methods. We only customized the deprecated pandas methods within the tests. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What I was wondering currently we are doing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. got it, let me do some research There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dhrubo-os thank you for this good point. Yeah, we shouldn't force customers to install only one version. |
||
lambda x: x.select_dtypes(include="number").apply( | ||
lambda x: (x - x.mean()).abs().mean() | ||
) | ||
) | ||
|
||
# Re-merge non-numeric columns back, with suffixes to avoid column overlap | ||
non_numeric_columns = ( | ||
pd_flights.select_dtypes(exclude="number").groupby("DestCountry").first() | ||
) | ||
pd_mad = pd_mad.join( | ||
non_numeric_columns, lsuffix="_numeric", rsuffix="_non_numeric" | ||
)[self.filter_data] | ||
if "Cancelled" in pd_mad.columns: | ||
pd_mad["Cancelled"] = pd_mad["Cancelled"].astype(float) | ||
oml_mad = oml_flights.groupby("DestCountry").mad() | ||
|
||
assert_index_equal(pd_mad.columns, oml_mad.columns) | ||
assert_index_equal(pd_mad.index, oml_mad.index) | ||
assert_series_equal(pd_mad.dtypes, oml_mad.dtypes) | ||
|
||
pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"]) | ||
pd_min_mad = pd_flights.groupby("DestCountry").agg( | ||
["min", lambda x: (x - x.median()).abs().mean()] | ||
) | ||
|
||
pd_min_mad.columns = pd_min_mad.columns.set_levels(["min", "mad"], level=1) | ||
oml_min_mad = oml_flights.groupby("DestCountry").aggregate(["min", "mad"]) | ||
|
||
assert_index_equal(pd_min_mad.columns, oml_min_mad.columns) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,9 +81,10 @@ def test_flights_extended_metrics(self): | |
logger.setLevel(logging.DEBUG) | ||
|
||
for func in self.extended_funcs: | ||
pd_metric = getattr(pd_flights, func)( | ||
**({"numeric_only": True} if func != "mad" else {}) | ||
) | ||
if func == "mad": | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I noticed you introduced branching. What if some day in the future we need to reimplement another function from scratch? Instead of creating a new branch every time, can we extract this behavior out? Perhaps have a utility class with our own custom implementation like
and then dispatch it instead of branching for every new method we need to reimplement — something like: if func in customFunctionMap. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done |
||
pd_metric = (pd_flights - pd_flights.mean()).abs().mean() | ||
else: | ||
pd_metric = getattr(pd_flights, func)(**({"numeric_only": True})) | ||
oml_metric = getattr(oml_flights, func)(numeric_only=True) | ||
|
||
pd_value = pd_metric["AvgTicketPrice"] | ||
|
@@ -101,7 +102,10 @@ def test_flights_extended_metrics_nan(self): | |
] | ||
|
||
for func in self.extended_funcs: | ||
pd_metric = getattr(pd_flights_1, func)() | ||
if func == "mad": | ||
pd_metric = (pd_flights_1 - pd_flights_1.mean()).abs().mean() | ||
else: | ||
pd_metric = getattr(pd_flights_1, func)() | ||
oml_metric = getattr(oml_flights_1, func)(numeric_only=False) | ||
|
||
assert_series_equal(pd_metric, oml_metric, check_exact=False) | ||
|
@@ -111,7 +115,10 @@ def test_flights_extended_metrics_nan(self): | |
oml_flights_0 = oml_flights[oml_flights.FlightNum == "XXX"][["AvgTicketPrice"]] | ||
|
||
for func in self.extended_funcs: | ||
pd_metric = getattr(pd_flights_0, func)() | ||
if func == "mad": | ||
pd_metric = (pd_flights_0 - pd_flights_0.mean()).abs().mean() | ||
else: | ||
pd_metric = getattr(pd_flights_0, func)() | ||
oml_metric = getattr(oml_flights_0, func)(numeric_only=False) | ||
|
||
assert_series_equal(pd_metric, oml_metric, check_exact=False) | ||
|
@@ -498,7 +505,8 @@ def test_flights_agg_quantile(self, numeric_only): | |
["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"] | ||
) | ||
|
||
pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only) | ||
pd_quantile = pd_flights.agg([lambda x: x.quantile(0.5), lambda x: x.min()]) | ||
pd_quantile.index = ["quantile", "min"] | ||
oml_quantile = oml_flights.agg(["quantile", "min"], numeric_only=numeric_only) | ||
|
||
assert_frame_equal( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
by*
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed