Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

fix: Fix unintended cache misses with async queries #14291

Merged
merged 4 commits into from
Apr 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions superset/tasks/async_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.

import copy
import logging
from typing import Any, cast, Dict, Optional

Expand Down Expand Up @@ -91,6 +92,13 @@ def load_explore_json_into_cache( # pylint: disable=too-many-locals
ensure_user_is_set(job_metadata.get("user_id"))
datasource_id, datasource_type = get_datasource_info(None, None, form_data)

# Perform a deep copy here so that below we can cache the original
# value of the form_data object. This is necessary since the viz
# objects modify the form_data object. If the modified version were
# to be cached here, it would lead to a cache miss when clients
# attempt to retrieve the value of the completed async query.
original_form_data = copy.deepcopy(form_data)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💯


viz_obj = get_viz(
datasource_type=cast(str, datasource_type),
datasource_id=datasource_id,
Expand All @@ -102,8 +110,11 @@ def load_explore_json_into_cache( # pylint: disable=too-many-locals
if viz_obj.has_error(payload):
raise SupersetVizException(errors=payload["errors"])

# cache form_data for async retrieval
cache_value = {"form_data": form_data, "response_type": response_type}
# Cache the original form_data value for async retrieval
cache_value = {
"form_data": original_form_data,
"response_type": response_type,
}
cache_key = generate_cache_key(cache_value, cache_key_prefix)
set_and_log_cache(cache_manager.cache, cache_key, cache_value)
result_url = f"/superset/explore_json/data/{cache_key}"
Expand Down
6 changes: 4 additions & 2 deletions superset/utils/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@
)
from superset.typing import FlaskResponse, FormData, Metric
from superset.utils.dates import datetime_to_epoch, EPOCH
from superset.utils.hashing import md5_sha_from_str
from superset.utils.hashing import md5_sha_from_dict, md5_sha_from_str

try:
from pydruid.utils.having import Having
Expand Down Expand Up @@ -1046,7 +1046,6 @@ def to_adhoc(
result = {
"clause": clause.upper(),
"expressionType": expression_type,
"filterOptionName": str(uuid.uuid4()),
"isExtra": bool(filt.get("isExtra")),
}

Expand All @@ -1061,6 +1060,9 @@ def to_adhoc(
elif expression_type == "SQL":
result.update({"sqlExpression": filt.get(clause)})

deterministic_name = md5_sha_from_dict(result)
result["filterOptionName"] = deterministic_name
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm unclear what the impact of changing the filterOptionName value is here. @rusackas @villebro

Copy link
Contributor Author

@benjreinhart benjreinhart Apr 22, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree we should get some 👀 on this from folks more familiar. AFAICT, this preserves the existing behavior of generating a unique key (which I assume is needed on clients) but also keeps it deterministic so that hashing form_data will work properly (both with async/non-async caching).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With a deterministic filterOptionName we of course now have the risk of having duplicate filterOptionNames which probably is precisely what the uuid4 key is aiming to avoid (could cause trouble in React components if these get returned to the frontend). So maybe we should deduplicate filters, too, which in edge cases would avoid unnecessary cache misses (=if the exact same filter has been defined twice vs another query where the same filter is only defined once)?

Copy link
Contributor Author

@benjreinhart benjreinhart Apr 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm... can you point to a specific example of where that might be an issue? Wouldn't each filter have different attributes? If not, is it a bug on the client that the same filters for the same chart would be added more than once? The idea is this should be deterministic for the same filter dict, but the set of filter dicts should have different properties, causing a different sha.


return result


Expand Down
12 changes: 10 additions & 2 deletions superset/viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -2257,7 +2257,11 @@ def query_obj(self) -> QueryObjectDict:
if fd.get("point_radius") != "Auto":
d["columns"].append(fd.get("point_radius"))

d["columns"] = list(set(d["columns"]))
# Ensure this value is sorted so that it does not
# cause the cache key generation (which hashes the
# query object) to generate different keys for values
# that should be considered the same.
d["columns"] = sorted(set(d["columns"]))
else:
# Ensuring columns chosen are all in group by
if (
Expand Down Expand Up @@ -2501,7 +2505,11 @@ def query_obj(self) -> QueryObjectDict:
if fd.get("js_columns"):
gb += fd.get("js_columns") or []
metrics = self.get_metrics()
gb = list(set(gb))
# Ensure this value is sorted so that it does not
# cause the cache key generation (which hashes the
# query object) to generate different keys for values
# that should be considered the same.
gb = sorted(set(gb))
if metrics:
d["groupby"] = gb
d["metrics"] = metrics
Expand Down
54 changes: 54 additions & 0 deletions tests/utils/core_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=no-self-use
import pytest

from superset.utils.core import to_adhoc


def test_to_adhoc_generates_deterministic_values():
    """``to_adhoc`` must be a pure function of its input.

    Identical filter dicts must produce byte-identical results —
    including the ``filterOptionName`` hash — while differing filter
    dicts must produce different hashes.
    """
    base_filter = {
        "op": "IS NOT NULL",
        "col": "LATITUDE",
        "val": "",
    }
    # Same filter, different subject column.
    variant_filter = dict(base_filter, col="LONGITUDE")

    expected_base = {
        "clause": "WHERE",
        "expressionType": "SIMPLE",
        "isExtra": False,
        "comparator": "",
        "operator": "IS NOT NULL",
        "subject": "LATITUDE",
        "filterOptionName": "d0908f77d950131db7a69fdc820cb739",
    }
    expected_variant = {
        "clause": "WHERE",
        "expressionType": "SIMPLE",
        "isExtra": False,
        "comparator": "",
        "operator": "IS NOT NULL",
        "subject": "LONGITUDE",
        "filterOptionName": "c5f283f727d4dfc6258b351d4a8663bc",
    }

    # Same input -> identical output, on every call.
    assert to_adhoc(base_filter) == to_adhoc(base_filter)
    assert to_adhoc(base_filter) == expected_base

    # Different input -> different (but still deterministic) output.
    assert to_adhoc(base_filter) != to_adhoc(variant_filter)
    assert to_adhoc(variant_filter) == expected_variant
13 changes: 5 additions & 8 deletions tests/viz_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
# specific language governing permissions and limitations
# under the License.
# isort:skip_file
import uuid
from datetime import date, datetime, timezone
import logging
from math import nan
Expand Down Expand Up @@ -1179,9 +1178,7 @@ def test_parse_coordinates_raises(self):
with self.assertRaises(SpatialException):
test_viz_deckgl.parse_coordinates("fldkjsalkj,fdlaskjfjadlksj")

@patch("superset.utils.core.uuid.uuid4")
def test_filter_nulls(self, mock_uuid4):
mock_uuid4.return_value = uuid.UUID("12345678123456781234567812345678")
def test_filter_nulls(self):
test_form_data = {
"latlong_key": {"type": "latlong", "lonCol": "lon", "latCol": "lat"},
"delimited_key": {"type": "delimited", "lonlatCol": "lonlat"},
Expand All @@ -1194,7 +1191,7 @@ def test_filter_nulls(self, mock_uuid4):
{
"clause": "WHERE",
"expressionType": "SIMPLE",
"filterOptionName": "12345678-1234-5678-1234-567812345678",
"filterOptionName": "bfa3a42a6f3de3c781b7d4f8e8d6613d",
"comparator": "",
"operator": "IS NOT NULL",
"subject": "lat",
Expand All @@ -1203,7 +1200,7 @@ def test_filter_nulls(self, mock_uuid4):
{
"clause": "WHERE",
"expressionType": "SIMPLE",
"filterOptionName": "12345678-1234-5678-1234-567812345678",
"filterOptionName": "2d35d87b57c6f1a5ae139f1a6b0cbd0a",
"comparator": "",
"operator": "IS NOT NULL",
"subject": "lon",
Expand All @@ -1214,7 +1211,7 @@ def test_filter_nulls(self, mock_uuid4):
{
"clause": "WHERE",
"expressionType": "SIMPLE",
"filterOptionName": "12345678-1234-5678-1234-567812345678",
"filterOptionName": "89cc0fafe39a4eabc5df2cd52e4d6514",
"comparator": "",
"operator": "IS NOT NULL",
"subject": "lonlat",
Expand All @@ -1225,7 +1222,7 @@ def test_filter_nulls(self, mock_uuid4):
{
"clause": "WHERE",
"expressionType": "SIMPLE",
"filterOptionName": "12345678-1234-5678-1234-567812345678",
"filterOptionName": "fa734d9a7bab254a53b41540d46cdb6c",
"comparator": "",
"operator": "IS NOT NULL",
"subject": "geo",
Expand Down