
Add compatibility for pandas 2 #1184

Merged
20 commits merged from pandas-2-compat into main on Jun 30, 2023 (changes shown from 10 commits)

Commits
2793e33
First pass at unblocking pytest errors
charlesbluca Jun 20, 2023
f8e97cd
Fix some datetime related failures
charlesbluca Jun 20, 2023
0ddbfb4
Copy datetime table fixture to avoid dropping timezones
charlesbluca Jun 20, 2023
c5c94b5
Merge remote-tracking branch 'origin/main' into pandas-2-compat
charlesbluca Jun 20, 2023
a9fbd32
Unpin pandas dependency
charlesbluca Jun 20, 2023
10008ae
Resolve append deprecations in analyze table
charlesbluca Jun 20, 2023
9d15748
Resolve append deprecations in JDBC
charlesbluca Jun 20, 2023
a55b7e8
Fix deprecated drop call in test_join_cross
charlesbluca Jun 21, 2023
fb08000
specify mixed datetime format for test_select_different_types
charlesbluca Jun 21, 2023
47c0dae
Week number now accessible through isocalendar
charlesbluca Jun 21, 2023
1279ce3
Merge remote-tracking branch 'origin/main' into pandas-2-compat
charlesbluca Jun 21, 2023
a7b95f2
Merge remote-tracking branch 'origin/main' into pandas-2-compat
charlesbluca Jun 21, 2023
23a43ee
xfail test_filter_cast_timestamp
charlesbluca Jun 22, 2023
53fe77c
Temp fix for cuDF timezone delocalization
charlesbluca Jun 22, 2023
7adc32e
Merge remote-tracking branch 'origin/main' into pandas-2-compat
charlesbluca Jun 22, 2023
23a9f10
Add some compat code for pandas<2
charlesbluca Jun 23, 2023
6a6240c
Merge remote-tracking branch 'origin/main' into pandas-2-compat
charlesbluca Jun 27, 2023
92ce2f4
Use timezone-aware datetimes on GPU
charlesbluca Jun 27, 2023
54a2a17
Don't hardcode timezone fixture
charlesbluca Jun 30, 2023
f4b25a0
Link to timezone scalar issue
charlesbluca Jun 30, 2023
2 changes: 1 addition & 1 deletion continuous_integration/environment-3.10-dev.yaml
@@ -15,7 +15,7 @@ dependencies:
   - mlflow
   - mock
   - numpy>=1.21.6
-  - pandas>=1.4.0,<2.0.0
+  - pandas>=1.4.0
   - pre-commit
   - prompt_toolkit>=3.0.8
   - psycopg2
2 changes: 1 addition & 1 deletion continuous_integration/environment-3.9-dev.yaml
@@ -15,7 +15,7 @@ dependencies:
   - mlflow
   - mock
   - numpy>=1.21.6
-  - pandas>=1.4.0,<2.0.0
+  - pandas>=1.4.0
   - pre-commit
   - prompt_toolkit>=3.0.8
   - psycopg2
2 changes: 1 addition & 1 deletion continuous_integration/gpuci/environment-3.10.yaml
@@ -18,7 +18,7 @@ dependencies:
   - mlflow
   - mock
   - numpy>=1.21.6
-  - pandas>=1.4.0,<2.0.0
+  - pandas>=1.4.0
   - pre-commit
   - prompt_toolkit>=3.0.8
   - psycopg2
2 changes: 1 addition & 1 deletion continuous_integration/gpuci/environment-3.9.yaml
@@ -18,7 +18,7 @@ dependencies:
   - mlflow
   - mock
   - numpy>=1.21.6
-  - pandas>=1.4.0,<2.0.0
+  - pandas>=1.4.0
   - pre-commit
   - prompt_toolkit>=3.0.8
   - psycopg2
2 changes: 1 addition & 1 deletion continuous_integration/recipe/meta.yaml
@@ -35,7 +35,7 @@ requirements:
   run:
     - python
     - dask >=2022.3.0,<=2023.5.1
-    - pandas >=1.4.0,<2.0.0
+    - pandas >=1.4.0
     # FIXME: handling is needed for httpx-based fastapi>=0.87.0
     - fastapi >=0.69.0,<0.87.0
     - uvicorn >=0.13.4
14 changes: 12 additions & 2 deletions dask_sql/mappings.py
@@ -331,24 +331,34 @@ def cast_column_type(
 
 def cast_column_to_type(col: dd.Series, expected_type: str):
     """Cast the given column to the expected type"""
+    pdt = pd.api.types
+
+    is_dt_ns = pdt.is_datetime64_ns_dtype
+    is_dt_tz = lambda t: is_dt_ns(t) and pdt.is_datetime64tz_dtype(t)
+    is_dt_ntz = lambda t: is_dt_ns(t) and not pdt.is_datetime64tz_dtype(t)
+
     current_type = col.dtype
 
     if similar_type(current_type, expected_type):
         logger.debug("...not converting.")
         return None
 
-    if pd.api.types.is_integer_dtype(expected_type):
+    if pdt.is_integer_dtype(expected_type):
         if pd.api.types.is_float_dtype(current_type):
             logger.debug("...truncating...")
             # Currently "trunc" can not be applied to NA (the pandas missing value type),
             # because NA is a different type. It works with np.NaN though.
             # For our use case, that does not matter, as the conversion to integer later
             # will convert both NA and np.NaN to NA.
             col = da.trunc(col.fillna(value=np.NaN))
-        elif pd.api.types.is_timedelta64_dtype(current_type):
+        elif pdt.is_timedelta64_dtype(current_type):
             logger.debug(f"Explicitly casting from {current_type} to np.int64")
             return col.astype(np.int64)
 
+    if is_dt_tz(current_type) and is_dt_ntz(expected_type):
+        # casting from timezone-aware to timezone-naive datatypes with astype is deprecated in pandas 2
+        return col.dt.tz_localize(None)
+
     logger.debug(f"Need to cast from {current_type} to {expected_type}")
     return col.astype(expected_type)
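
For context on the new timezone branch: pandas 2 no longer allows astype to cast a timezone-aware datetime column to a timezone-naive dtype (this was a deprecation in 1.x and now raises), so the conversion has to go through dt.tz_localize(None). A minimal pandas-only sketch of the difference:

    import pandas as pd

    s = pd.Series(pd.date_range("2021-01-01", periods=3, tz="UTC"))

    # In pandas 2 this raises TypeError (pandas 1.x only warned):
    # s.astype("datetime64[ns]")

    # The supported way to drop the timezone and get a naive column:
    naive = s.dt.tz_localize(None)
    print(naive.dtype)  # datetime64[ns]
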
36 changes: 16 additions & 20 deletions dask_sql/physical/rel/custom/analyze_table.py
@@ -47,26 +47,22 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer:
         df = dc.df
 
         # Calculate statistics
-        statistics = dd.from_pandas(
-            pd.DataFrame({col: [] for col in columns}), npartitions=1
-        )
-        statistics = statistics.append(df[[mapping(col) for col in columns]].describe())
-
-        # Add additional information
-        statistics = statistics.append(
-            pd.Series(
-                {
-                    col: str(python_to_sql_type(df[mapping(col)].dtype)).lower()
-                    for col in columns
-                },
-                name="data_type",
-            )
-        )
-        statistics = statistics.append(
-            pd.Series(
-                {col: col for col in columns},
-                name="col_name",
-            )
+        statistics = dd.concat(
+            [
+                df[[mapping(col) for col in columns]].describe(),
+                pd.DataFrame(
+                    {
+                        mapping(col): str(
+                            python_to_sql_type(df[mapping(col)].dtype)
+                        ).lower()
+                        for col in columns
+                    },
+                    index=["data_type"],
+                ),
+                pd.DataFrame(
+                    {mapping(col): col for col in columns}, index=["col_name"]
+                ),
+            ]
         )
 
         cc = ColumnContainer(statistics.columns)
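
The rewrite is forced by the removal of DataFrame.append in pandas 2: the extra summary rows are now stacked onto the describe() output with concat instead. A rough standalone sketch of the same pattern, using a toy single-column table rather than the real column-mapping logic:

    import pandas as pd
    import dask.dataframe as dd

    ddf = dd.from_pandas(pd.DataFrame({"a": [1.0, 2.0, 3.0]}), npartitions=1)

    # .append() is gone in pandas 2, so extra rows are concatenated instead;
    # dd.concat accepts a mix of dask and pandas objects, as the diff above relies on.
    statistics = dd.concat(
        [
            ddf.describe(),
            pd.DataFrame({"a": ["double"]}, index=["data_type"]),
            pd.DataFrame({"a": ["a"]}, index=["col_name"]),
        ]
    )
    print(statistics.compute())
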
2 changes: 1 addition & 1 deletion dask_sql/physical/rex/core/call.py
@@ -926,7 +926,7 @@ def date_part(self, what, df: SeriesOrScalar):
         elif what in {"SECOND", "SECONDS"}:
             return df.second
         elif what in {"WEEK", "WEEKS"}:
-            return df.week
+            return df.isocalendar().week
         elif what in {"YEAR", "YEARS"}:
             return df.year
         else:
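
The week attribute was deprecated in pandas 1.x and removed in 2.0; the ISO week number now comes from isocalendar(), which returns a frame with year, week, and day columns. A quick pandas-only illustration (the dask-sql code applies the same call through its datetime accessor):

    import pandas as pd

    dates = pd.Series(pd.to_datetime(["2023-01-02", "2023-06-30"]))

    # .dt.week is gone in pandas 2; isocalendar() returns a DataFrame,
    # so the week number is selected from its "week" column.
    weeks = dates.dt.isocalendar().week
    print(weeks.tolist())  # [1, 26]
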
6 changes: 3 additions & 3 deletions dask_sql/server/presto_jdbc.py
@@ -37,15 +37,15 @@ def create_meta_data(c: Context):
     # catalogs = pd.DataFrame().append(create_catalog_row(catalog), ignore_index=True)
     # c.create_table("catalogs", catalogs, schema_name=system_schema)
 
-    schemas = pd.DataFrame().append(create_schema_row(), ignore_index=True)
+    schemas = pd.DataFrame(create_schema_row(), index=[0])
     c.create_table("schemas", schemas, schema_name=system_schema)
     schema_rows = []
 
-    tables = pd.DataFrame().append(create_table_row(), ignore_index=True)
+    tables = pd.DataFrame(create_table_row(), index=[0])
     c.create_table("tables", tables, schema_name=system_schema)
     table_rows = []
 
-    columns = pd.DataFrame().append(create_column_row(), ignore_index=True)
+    columns = pd.DataFrame(create_column_row(), index=[0])
     c.create_table("columns", columns, schema_name=system_schema)
    column_rows = []
 
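
Same root cause as the analyze-table change: pd.DataFrame().append(row, ignore_index=True) no longer exists in pandas 2, and a one-row frame can be built directly from the dict with an explicit index. A sketch with made-up column names (not the real create_*_row() contents):

    import pandas as pd

    row = {"TABLE_SCHEM": "system_jdbc", "TABLE_NAME": "tables"}  # hypothetical keys

    # Old (removed in pandas 2): pd.DataFrame().append(row, ignore_index=True)
    # New: construct the single-row frame directly.
    tables = pd.DataFrame(row, index=[0])
    print(tables)
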
2 changes: 1 addition & 1 deletion docker/conda.txt
@@ -1,6 +1,6 @@
 python>=3.8
 dask>=2022.3.0,<=2023.5.1
-pandas>=1.4.0,<2.0.0
+pandas>=1.4.0
 jpype1>=1.0.2
 openjdk>=8
 maven>=3.6.0
2 changes: 1 addition & 1 deletion docker/main.dockerfile
@@ -17,7 +17,7 @@ RUN mamba install -y \
     "setuptools-rust>=1.5.2" \
     # core dependencies
     "dask>=2022.3.0,<=2023.5.1" \
-    "pandas>=1.4.0,<2.0.0" \
+    "pandas>=1.4.0" \
     # FIXME: handling is needed for httpx-based fastapi>=0.87.0
     "fastapi>=0.69.0,<0.87.0" \
     "uvicorn>=0.13.4" \
2 changes: 1 addition & 1 deletion docs/environment.yml
@@ -8,7 +8,7 @@ dependencies:
   - sphinx-tabs
   - dask-sphinx-theme>=2.0.3
   - dask>=2022.3.0,<=2023.5.1
-  - pandas>=1.4.0,<2.0.0
+  - pandas>=1.4.0
   - fugue>=0.7.3
   # FIXME: handling is needed for httpx-based fastapi>=0.87.0
   - fastapi>=0.69.0,<0.87.0
2 changes: 1 addition & 1 deletion docs/requirements-docs.txt
@@ -2,7 +2,7 @@ sphinx>=4.0.0
 sphinx-tabs
 dask-sphinx-theme>=3.0.0
 dask>=2022.3.0,<=2023.5.1
-pandas>=1.4.0,<2.0.0
+pandas>=1.4.0
 fugue>=0.7.3
 # FIXME: handling is needed for httpx-based fastapi>=0.87.0
 fastapi>=0.69.0,<0.87.0
2 changes: 1 addition & 1 deletion setup.py
@@ -44,7 +44,7 @@
     install_requires=[
         "dask[dataframe]>=2022.3.0,<=2023.5.1",
         "distributed>=2022.3.0,<=2023.5.1",
-        "pandas>=1.4.0,<2.0.0",
+        "pandas>=1.4.0",
         # FIXME: handling is needed for httpx-based fastapi>=0.87.0
         "fastapi>=0.69.0,<0.87.0",
         "uvicorn>=0.13.4",
13 changes: 7 additions & 6 deletions tests/integration/fixtures.py
@@ -169,12 +169,13 @@ def gpu_string_table(string_table):
 
 @pytest.fixture()
 def gpu_datetime_table(datetime_table):
-    # cudf doesn't have support for timezoned datetime data
-    datetime_table["timezone"] = datetime_table["timezone"].astype("datetime64[ns]")
-    datetime_table["utc_timezone"] = datetime_table["utc_timezone"].astype(
-        "datetime64[ns]"
-    )
-    return cudf.from_pandas(datetime_table) if cudf else None
+    if cudf:
+        # cudf doesn't have support for timezoned datetime data
+        df = datetime_table.copy()
+        df["timezone"] = df["timezone"].dt.tz_localize(None)
+        df["utc_timezone"] = df["utc_timezone"].dt.tz_localize(None)
+        return cudf.from_pandas(df)
+    return None
charlesbluca (Collaborator, Author) commented:

Before this change, we were actually overwriting the datetime_table CPU fixture such that it didn't contain timezone-aware data.

Now that we're no longer doing that, a failure has been exposed in test_filter_cast_timestamp, because filtering a timezone-aware column by a timestamp would require non-UTC timezone handling for literals on the Python end:

    if timezone and timezone != "UTC":
        raise ValueError("Non UTC timezones not supported")

There are a few ways we could handle this, but all of them generally involve front-facing changes to timestamp literal handling, so I figure it might make sense to break that off into a separate PR.

Okay with xfailing that test for now to get this in? cc @ayushdg @jdye64



 @pytest.fixture()
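
The underlying bug, also described in the comment above: the old fixture mutated the shared datetime_table object in place, so the CPU fixture no longer contained timezone-aware data. A simplified sketch of the leak and the copy-first fix, with cuDF left out and a single assumed timezone column:

    import pandas as pd
    import pytest

    @pytest.fixture()
    def datetime_table():
        return pd.DataFrame(
            {"timezone": pd.date_range("2021-01-01", periods=3, tz="America/New_York")}
        )

    @pytest.fixture()
    def gpu_datetime_table(datetime_table):
        # Mutating datetime_table directly would also change what the CPU
        # fixture returns within the same test; copying keeps them independent.
        df = datetime_table.copy()
        df["timezone"] = df["timezone"].dt.tz_localize(None)
        return df
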
24 changes: 11 additions & 13 deletions tests/integration/test_analyze.py
@@ -1,3 +1,4 @@
+import dask.dataframe as dd
 import pandas as pd
 
 from dask_sql.mappings import python_to_sql_type
@@ -8,24 +9,21 @@ def test_analyze(c, df):
     result_df = c.sql("ANALYZE TABLE df COMPUTE STATISTICS FOR ALL COLUMNS")
 
     # extract table and compute stats with Dask manually
-    expected_df = (
-        c.sql("SELECT * FROM df")
-        .describe()
-        .append(
-            pd.Series(
+    expected_df = dd.concat(
+        [
+            c.sql("SELECT * FROM df").describe(),
+            pd.DataFrame(
                 {
                     col: str(python_to_sql_type(df[col].dtype)).lower()
                     for col in df.columns
                 },
-                name="data_type",
-            )
-        )
-        .append(
-            pd.Series(
+                index=["data_type"],
+            ),
+            pd.DataFrame(
                 {col: col for col in df.columns},
-                name="col_name",
-            )
-        )
+                index=["col_name"],
+            ),
+        ]
     )
 
     assert_eq(result_df, expected_df)
2 changes: 1 addition & 1 deletion tests/integration/test_filter.py
@@ -94,7 +94,7 @@ def test_filter_cast_date(c, input_table, request):
     )
 
     expected_df = datetime_table[
-        datetime_table["timezone"].astype("<M8[ns]").dt.floor("D").astype("<M8[ns]")
+        datetime_table["timezone"].dt.tz_localize(None).dt.floor("D").astype("<M8[ns]")
         > pd.Timestamp("2014-08-01")
     ]
     assert_eq(return_df, expected_df)
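
Why tz_localize(None) rather than the old astype("<M8[ns]"): pandas 2 rejects that astype for timezone-aware data, and a naive result is still needed so the floored values can be compared against a naive Timestamp. A rough standalone version of the expected-value computation:

    import pandas as pd

    ts = pd.Series(
        pd.to_datetime(["2014-08-01 23:00", "2014-09-02 09:05"]).tz_localize("UTC")
    )

    # Drop the timezone (astype("<M8[ns]") raises in pandas 2), then floor
    # to day precision before comparing with a naive Timestamp.
    days = ts.dt.tz_localize(None).dt.floor("D")
    print((days > pd.Timestamp("2014-08-01")).tolist())  # [False, True]
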
3 changes: 1 addition & 2 deletions tests/integration/test_jdbc.py
@@ -19,8 +19,7 @@
 def c():
     c = Context()
     c.create_schema(schema)
-    row = create_table_row()
-    tables = pd.DataFrame().append(row, ignore_index=True)
+    tables = pd.DataFrame(create_table_row(), index=[0])
     tables = tables.astype({"AN_INT": "int64"})
     c.create_table(table, tables, schema_name=schema)
 
2 changes: 1 addition & 1 deletion tests/integration/test_join.py
@@ -119,7 +119,7 @@ def test_join_cross(c, user_table_1, department_table):
     user_table_1["key"] = 1
     department_table["key"] = 1
 
-    expected_df = dd.merge(user_table_1, department_table, on="key").drop("key", 1)
+    expected_df = dd.merge(user_table_1, department_table, on="key").drop(columns="key")
 
     assert_eq(return_df, expected_df, check_index=False)
 
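
The positional axis argument to DataFrame.drop, as in df.drop("key", 1), was deprecated during pandas 1.x and removed in 2.0; the keyword spelling works on both major versions:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "key": [1, 1]})

    # Removed in pandas 2: df.drop("key", 1)
    # Portable spelling across pandas 1.x and 2.x:
    print(df.drop(columns="key"))
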
13 changes: 9 additions & 4 deletions tests/integration/test_select.py
@@ -33,7 +33,9 @@ def test_select_column(c, df):
 def test_select_different_types(c):
     expected_df = pd.DataFrame(
         {
-            "date": pd.to_datetime(["2022-01-21 17:34", "2022-01-21", "17:34", pd.NaT]),
+            "date": pd.to_datetime(
+                ["2022-01-21 17:34", "2022-01-21", "17:34", pd.NaT], format="mixed"
+            ),
             "string": ["this is a test", "another test", "äölüć", ""],
             "integer": [1, 2, -4, 5],
             "float": [-1.1, np.NaN, pd.NA, np.sqrt(2)],
@@ -163,13 +165,13 @@ def test_date_casting(c, input_table, request):
 
     expected_df = datetime_table
     expected_df["timezone"] = (
-        expected_df["timezone"].astype("<M8[ns]").dt.floor("D").astype("<M8[ns]")
+        expected_df["timezone"].dt.tz_localize(None).dt.floor("D").astype("<M8[ns]")
     )
     expected_df["no_timezone"] = (
         expected_df["no_timezone"].astype("<M8[ns]").dt.floor("D").astype("<M8[ns]")
     )
     expected_df["utc_timezone"] = (
-        expected_df["utc_timezone"].astype("<M8[ns]").dt.floor("D").astype("<M8[ns]")
+        expected_df["utc_timezone"].dt.tz_localize(None).dt.floor("D").astype("<M8[ns]")
     )
 
     assert_eq(result_df, expected_df)
@@ -194,7 +196,10 @@ def test_timestamp_casting(c, input_table, request):
     """
     )
 
-    expected_df = datetime_table.astype("<M8[ns]")
+    expected_df = datetime_table
+    expected_df["timezone"] = expected_df["timezone"].dt.tz_localize(None)
+    expected_df["utc_timezone"] = expected_df["utc_timezone"].dt.tz_localize(None)
+
     assert_eq(result_df, expected_df)
 
 
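
The format="mixed" argument (new in pandas 2.0) is needed because pandas 2 infers a single format from the first element and would otherwise fail on this list, which mixes a datetime, a bare date, and a bare time. A pandas-only sketch:

    import pandas as pd

    values = ["2022-01-21 17:34", "2022-01-21", "17:34", pd.NaT]

    # pandas 2 infers one format from the first element and raises on the rest;
    # format="mixed" restores element-wise parsing.
    parsed = pd.to_datetime(values, format="mixed")
    print(parsed)
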
2 changes: 1 addition & 1 deletion tests/unit/test_mapping.py
@@ -47,4 +47,4 @@ def test_similar_type():
     assert similar_type(pd.Int64Dtype(), np.int32)
     assert not similar_type(np.uint32, np.int32)
     assert similar_type(np.float32, np.float64)
-    assert similar_type(np.object_, str)
+    assert similar_type(object, str)