-
Notifications
You must be signed in to change notification settings - Fork 14.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: add retry to SQL-based alerting celery task #10542
Changes from 6 commits
e14a0e5
e97815d
028f147
4a4a04b
a838c7d
d6ce466
45bee1b
a577f04
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,12 +47,13 @@ | |
from retry.api import retry_call | ||
from selenium.common.exceptions import WebDriverException | ||
from selenium.webdriver import chrome, firefox | ||
from sqlalchemy.orm import Session | ||
from sqlalchemy.exc import NoSuchColumnError, ResourceClosedError | ||
from werkzeug.http import parse_cookie | ||
|
||
from superset import app, db, security_manager, thumbnail_cache | ||
from superset.extensions import celery_app | ||
from superset.models.alerts import Alert, AlertLog | ||
from superset.models.core import Database | ||
from superset.models.dashboard import Dashboard | ||
from superset.models.schedules import ( | ||
EmailDeliveryType, | ||
|
@@ -79,6 +80,7 @@ | |
logger = logging.getLogger("tasks.email_reports") | ||
logger.setLevel(logging.INFO) | ||
|
||
stats_logger = current_app.config["STATS_LOGGER"] | ||
EMAIL_PAGE_RENDER_WAIT = config["EMAIL_PAGE_RENDER_WAIT"] | ||
WEBDRIVER_BASEURL = config["WEBDRIVER_BASEURL"] | ||
WEBDRIVER_BASEURL_USER_FRIENDLY = config["WEBDRIVER_BASEURL_USER_FRIENDLY"] | ||
|
@@ -533,6 +535,9 @@ def schedule_email_report( # pylint: disable=unused-argument | |
name="alerts.run_query", | ||
bind=True, | ||
soft_time_limit=config["EMAIL_ASYNC_TIME_LIMIT_SEC"], | ||
autoretry_for=(NoSuchColumnError, ResourceClosedError,), | ||
retry_kwargs={"max_retries": 5}, | ||
retry_backoff=True, | ||
) | ||
def schedule_alert_query( # pylint: disable=unused-argument | ||
task: Task, | ||
|
@@ -542,24 +547,33 @@ def schedule_alert_query( # pylint: disable=unused-argument | |
is_test_alert: Optional[bool] = False, | ||
) -> None: | ||
model_cls = get_scheduler_model(report_type) | ||
dbsession = db.create_scoped_session() | ||
schedule = dbsession.query(model_cls).get(schedule_id) | ||
|
||
# The user may have disabled the schedule. If so, ignore this | ||
if not schedule or not schedule.active: | ||
logger.info("Ignoring deactivated alert") | ||
return | ||
try: | ||
schedule = db.create_scoped_session().query(model_cls).get(schedule_id) | ||
|
||
if report_type == ScheduleType.alert: | ||
if is_test_alert and recipients: | ||
deliver_alert(schedule.id, recipients) | ||
# The user may have disabled the schedule. If so, ignore this | ||
if not schedule or not schedule.active: | ||
logger.info("Ignoring deactivated alert") | ||
return | ||
|
||
if run_alert_query(schedule.id, dbsession): | ||
# deliver_dashboard OR deliver_slice | ||
return | ||
else: | ||
raise RuntimeError("Unknown report type") | ||
if report_type == ScheduleType.alert: | ||
if is_test_alert and recipients: | ||
deliver_alert(schedule.id, recipients) | ||
return | ||
|
||
if run_alert_query( | ||
schedule.id, schedule.database_id, schedule.sql, schedule.label | ||
): | ||
# deliver_dashboard OR deliver_slice | ||
return | ||
else: | ||
raise RuntimeError("Unknown report type") | ||
except NoSuchColumnError as column_error: | ||
stats_logger.incr("run_alert_task.failure.NoSuchColumnError") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the API follows a fixed pattern on statsd metrics: https://github.com/apache/incubator-superset/blob/master/superset/views/base_api.py#L173 So we use ..error|success|init. My take is that it's desirable to just incr an error counter (at the method level is granular enough) and also log the error with a more detailed message or exception There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in this case we want to differentiate between NoSuchColumnError & ResourceClosedError - still researching the error |
||
raise column_error | ||
except ResourceClosedError as resource_error: | ||
stats_logger.incr("run_alert_task.failure.ResourceClosedError") | ||
raise resource_error | ||
|
||
|
||
class AlertState: | ||
|
@@ -569,7 +583,7 @@ class AlertState: | |
|
||
|
||
def deliver_alert(alert_id: int, recipients: Optional[str] = None) -> None: | ||
alert = db.session.query(Alert).get(alert_id) | ||
alert = db.create_scoped_session().query(Alert).get(alert_id) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is interesting, @john-bodley submitted this: #10427 is this necessary because it's called by a celery task, and we want to make sure the session gets removed on task completion? What's the effect if this is called by a web request with proper session scope? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For now I'm actually going to revert this change and just use regular session. Both types of sessions cause errors with the underlying cause remaining unknown for now. |
||
|
||
logging.info("Triggering alert: %s", alert) | ||
img_data = None | ||
|
@@ -618,51 +632,56 @@ def deliver_alert(alert_id: int, recipients: Optional[str] = None) -> None: | |
_deliver_email(recipients, deliver_as_group, subject, body, data, images) | ||
|
||
|
||
def run_alert_query(alert_id: int, dbsession: Session) -> Optional[bool]: | ||
def run_alert_query( | ||
alert_id: int, database_id: int, sql: str, label: str | ||
) -> Optional[bool]: | ||
""" | ||
Execute alert.sql and return value if any rows are returned | ||
""" | ||
alert = db.session.query(Alert).get(alert_id) | ||
|
||
logger.info("Processing alert ID: %i", alert.id) | ||
database = alert.database | ||
dbsession = db.create_scoped_session() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: can we call it |
||
logger.info("Processing alert ID: %i", alert_id) | ||
database = dbsession.query(Database).get(database_id) | ||
if not database: | ||
logger.error("Alert database not preset") | ||
return None | ||
|
||
if not alert.sql: | ||
if not sql: | ||
logger.error("Alert SQL not preset") | ||
return None | ||
|
||
parsed_query = ParsedQuery(alert.sql) | ||
parsed_query = ParsedQuery(sql) | ||
sql = parsed_query.stripped() | ||
|
||
state = None | ||
dttm_start = datetime.utcnow() | ||
|
||
df = pd.DataFrame() | ||
try: | ||
logger.info("Evaluating SQL for alert %s", alert) | ||
logger.info("Evaluating SQL for alert <%s:%s>", alert_id, label) | ||
df = database.get_df(sql) | ||
except Exception as exc: # pylint: disable=broad-except | ||
state = AlertState.ERROR | ||
logging.exception(exc) | ||
logging.error("Failed at evaluating alert: %s (%s)", alert.label, alert.id) | ||
logging.error("Failed at evaluating alert: %s (%s)", label, alert_id) | ||
|
||
dttm_end = datetime.utcnow() | ||
last_eval_dttm = datetime.utcnow() | ||
|
||
if state != AlertState.ERROR: | ||
alert.last_eval_dttm = datetime.utcnow() | ||
if not df.empty: | ||
# Looking for truthy cells | ||
for row in df.to_records(): | ||
if any(row): | ||
state = AlertState.TRIGGER | ||
deliver_alert(alert.id) | ||
deliver_alert(alert_id) | ||
break | ||
if not state: | ||
state = AlertState.PASS | ||
|
||
dbsession.commit() | ||
alert = dbsession.query(Alert).get(alert_id) | ||
if state != AlertState.ERROR: | ||
alert.last_eval_dttm = last_eval_dttm | ||
alert.last_state = state | ||
alert.logs.append( | ||
AlertLog( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add a todo and the link to the issue - to get rid of retry once the underlying issue is resolved