Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: handle temporal columns in presto partitions #24054

Merged
merged 1 commit into from
May 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion superset/db_engine_specs/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1276,7 +1276,7 @@ def where_latest_partition( # pylint: disable=too-many-arguments,unused-argumen
schema: Optional[str],
database: Database,
query: Select,
columns: Optional[List[Dict[str, str]]] = None,
columns: Optional[List[Dict[str, Any]]] = None,
) -> Optional[Select]:
"""
Add a where clause to a query to reference only the most recent partition
Expand Down
2 changes: 1 addition & 1 deletion superset/db_engine_specs/hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ def where_latest_partition( # pylint: disable=too-many-arguments
schema: Optional[str],
database: "Database",
query: Select,
columns: Optional[List[Dict[str, str]]] = None,
columns: Optional[List[Dict[str, Any]]] = None,
) -> Optional[Select]:
try:
col_names, values = cls.latest_partition(
Expand Down
18 changes: 10 additions & 8 deletions superset/db_engine_specs/presto.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,7 @@ def where_latest_partition( # pylint: disable=too-many-arguments
schema: Optional[str],
database: Database,
query: Select,
columns: Optional[List[Dict[str, str]]] = None,
columns: Optional[List[Dict[str, Any]]] = None,
giftig marked this conversation as resolved.
Show resolved Hide resolved
) -> Optional[Select]:
try:
col_names, values = cls.latest_partition(
Expand All @@ -513,13 +513,15 @@ def where_latest_partition( # pylint: disable=too-many-arguments
}

for col_name, value in zip(col_names, values):
if col_name in column_type_by_name:
if column_type_by_name.get(col_name) == "TIMESTAMP":
query = query.where(Column(col_name, TimeStamp()) == value)
elif column_type_by_name.get(col_name) == "DATE":
query = query.where(Column(col_name, Date()) == value)
else:
query = query.where(Column(col_name) == value)
col_type = column_type_by_name.get(col_name)

if isinstance(col_type, types.DATE):
col_type = Date()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems a little atypical that we’re overloading col_type.

Why do only DATE and TIMESTAMP get mutated? Is this because of how the types are cast from a string in SQL, i.e., DATE ‘2023-05-01’?

Copy link
Contributor Author

@giftig giftig May 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we have a column which is a date or timestamp but the value we've received is a string, and the underlying presto lib doesn't understand how to cast that by itself. Instead we have some types in presto_sql_types which add that functionality and I'm swapping out with those custom types to render it correctly.

@villebro and I discussed that a more robust fix might be further upstream, such that we actually already get the right types here, but the fix proposed (changing the types around L220ish in this file) didn't fix the issue and introduced additional problems, so this looks like the most viable fix for now. It probably requires a broader look at the surrounding code to see if something was missed which would eliminate the need for this. However you can also see in convert_dttm that other custom logic exists for these types, and other engines are also inheriting that custom behaviour from this one too (which is why modifying it had a wider impact)

elif isinstance(col_type, types.TIMESTAMP):
col_type = TimeStamp()

query = query.where(Column(col_name, col_type) == value)

return query

@classmethod
Expand Down
39 changes: 38 additions & 1 deletion tests/unit_tests/db_engine_specs/test_presto.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@
# under the License.
from datetime import datetime
from typing import Any, Dict, Optional, Type
from unittest import mock

import pytest
import pytz
from sqlalchemy import types
from pyhive.sqlalchemy_presto import PrestoDialect
from sqlalchemy import sql, text, types
from sqlalchemy.engine.url import make_url

from superset.utils.core import GenericDataType
Expand Down Expand Up @@ -106,3 +108,38 @@ def test_get_schema_from_engine_params() -> None:
)
is None
)


@mock.patch("superset.db_engine_specs.presto.PrestoEngineSpec.latest_partition")
@pytest.mark.parametrize(
["column_type", "column_value", "expected_value"],
[
(types.DATE(), "2023-05-01", "DATE '2023-05-01'"),
(types.TIMESTAMP(), "2023-05-01", "TIMESTAMP '2023-05-01'"),
(types.VARCHAR(), "2023-05-01", "'2023-05-01'"),
(types.INT(), 1234, "1234"),
],
)
def test_where_latest_partition(
mock_latest_partition, column_type, column_value: Any, expected_value: str
) -> None:
"""
Test the ``where_latest_partition`` method
"""
from superset.db_engine_specs.presto import PrestoEngineSpec as spec

mock_latest_partition.return_value = (["partition_key"], [column_value])

query = sql.select(text("* FROM table"))
columns = [{"name": "partition_key", "type": column_type}]

expected = f"""SELECT * FROM table \nWHERE "partition_key" = {expected_value}"""
result = spec.where_latest_partition(
"table", mock.MagicMock(), mock.MagicMock(), query, columns
)
assert result is not None
actual = result.compile(
dialect=PrestoDialect(), compile_kwargs={"literal_binds": True}
)

assert str(actual) == expected