Skip to content

Commit

Permalink
Encapsulate Prometheus alert types in different method names.
Browse files Browse the repository at this point in the history
  • Loading branch information
khurtado committed Feb 14, 2022
1 parent 96193e2 commit 1cd9eb1
Showing 1 changed file with 45 additions and 40 deletions.
85 changes: 45 additions & 40 deletions src/python/WMCore/MicroService/MSTransferor/MSTransferor.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,17 +199,7 @@ def execute(self, reqStatus):
self.checkPUDataLocation(wflow)
if wflow.getSecondarySummary() and not wflow.getPURSElist():
# then we still have pileup to be transferred, but with incorrect locations
msg = "Workflow: %s cannot proceed due to some PU misconfiguration. Check previous logs..."
self.logger.critical(msg, wflow.getName())
# Send alert to prometheus
alertName = "{}: PU misconfiguration error. Workflow: {}".format(self.alertServiceName,
wflow.getname())
alertSeverity = "high"
alertSummary = "[MSTransferor] Workflow cannot proceed due to some PU misconfiguration."
alertDescription = "Workflow: {} could not proceed due to some PU misconfiguration,".format(wflow.getName())
alertDescription += "so it will be skipped."
self.sendAlert(alertName, alertSeverity, alertSummary, alertDescription,
self.alertServiceName)
self.alertPUMisconfig(wflow.getname())
# FIXME: this needs to be logged somewhere and workflow be set to failed
counterProblematicRequests += 1
continue
Expand All @@ -221,17 +211,9 @@ def execute(self, reqStatus):
success, transfers = self.makeTransferRequest(wflow)
except Exception as ex:
success = False
msg = "Unknown exception while making Transfer Request for %s " % wflow.getName()
msg += "\tError: %s" % str(ex)
self.alertUnknownTransferError(wflow.getName())
msg = "\tError: %s" % str(ex)
self.logger.exception(msg)
# Send alert to prometheus
alertName = "{}: Transfer request error. Workflow: {}".format(self.alertServiceName,
wflow.getname())
alertSeverity = "high"
alertSummary = "[MSTransferor] Unknown exception while making Transfer Request."
alertDescription = "Unknown exception while making Transfer request for workflow: {}".format(wflow.getName())
self.sendAlert(alertName, alertSeverity, alertSummary, alertDescription,
self.alertServiceName)
if success:
self.logger.info("Transfer requests successful for %s. Summary: %s",
wflow.getName(), pformat(transfers)) # then create a document in ReqMgr Aux DB
Expand All @@ -242,27 +224,9 @@ def execute(self, reqStatus):
counterSuccessRequests += 1
else:
counterFailedRequests += 1
# Send alert to prometheus
alertName = "{}: Transfer document error posting to CouchDB. Workflow: {}".format(self.alertServiceName,
wflow.getname())
alertSeverity = "high"
alertSummary = "[MSTransferor] Transfer document could not be created in CouchDB."
alertDescription = "Workflow: {}, failed request due to error posting to CouchDB".format(wflow.getName())
self.sendAlert(alertName, alertSeverity, alertSummary, alertDescription,
self.alertServiceName)
self.alertTransferCouchDBError(wflow.getname())
else:
counterFailedRequests += 1
msg = "Unknown exception while making Transfer Request for %s " % wflow.getName()
msg += "\tError: %s" % str(ex)
self.logger.exception(msg)
# Send alert to prometheus
alertName = "{}: Transfer request failed. Workflow: {}".format(self.alertServiceName,
wflow.getname())
alertSeverity = "high"
alertSummary = "[MSTransferor] Transfer Request failure."
alertDescription = "Transfer request did not succeed for workflow: {}".format(wflow.getName())
self.sendAlert(alertName, alertSeverity, alertSummary, alertDescription,
self.alertServiceName)
# it can go slightly beyond the limit. It's evaluated for every slice
if counterSuccessRequests >= self.msConfig["limitRequestsPerCycle"]:
msg = "Transferor succeeded acting on %d workflows in this cycle. " % counterSuccessRequests
Expand Down Expand Up @@ -785,6 +749,47 @@ def sendAlert(self, alertName, severity, summary, description, service, endSecs
except Exception as ex:
self.logger.exception("Failed to send alert to %s. Error: %s", self.alertManagerUrl, str(ex))

def alertPUMisconfig(self, workflowName):
    """
    Send alert to Prometheus with PU misconfiguration error and
    log the same description with critical level.

    :param workflowName: string with the name of the workflow being skipped
    :return: None
    """
    alertName = "{}: PU misconfiguration error. Workflow: {}".format(self.alertServiceName,
                                                                     workflowName)
    alertSeverity = "high"
    alertSummary = "[MSTransferor] Workflow cannot proceed due to some PU misconfiguration."
    # the description is built in two pieces; the trailing blank after the comma
    # keeps them from running together as "misconfiguration,so it ..."
    alertDescription = "Workflow: {} could not proceed due to some PU misconfiguration, ".format(workflowName)
    alertDescription += "so it will be skipped."
    self.sendAlert(alertName, alertSeverity, alertSummary, alertDescription,
                   self.alertServiceName)
    self.logger.critical(alertDescription)

def alertUnknownTransferError(self, workflowName):
    """
    Send alert to Prometheus with unknown transfer error and
    log the same description through `logger.exception`.

    NOTE: `logger.exception` attaches the traceback of the exception being
    handled, so this is meant to be called from within an `except` block.

    :param workflowName: string with the name of the workflow that failed
    :return: None
    """
    alertName = "{}: Transfer request error. Workflow: {}".format(self.alertServiceName,
                                                                  workflowName)
    alertSeverity = "high"
    alertSummary = "[MSTransferor] Unknown exception while making Transfer Request."
    alertDescription = "Unknown exception while making Transfer request for workflow: {}".format(workflowName)
    self.sendAlert(alertName, alertSeverity, alertSummary, alertDescription,
                   self.alertServiceName)
    # only the workflow name is available here (there is no workflow object in
    # this scope), and the description above already carries it
    self.logger.exception(alertDescription)

def alertTransferCouchDBError(self, workflowName):
    """
    Report to Prometheus that the transfer document for a workflow
    could not be posted to CouchDB, and log it as a warning.

    :param workflowName: string with the name of the affected workflow
    :return: None
    """
    title = "{}: Transfer document error posting to CouchDB. Workflow: {}".format(
        self.alertServiceName, workflowName)
    details = "Workflow: {}, failed request due to error posting to CouchDB".format(workflowName)
    self.sendAlert(title,
                   "high",
                   "[MSTransferor] Transfer document could not be created in CouchDB.",
                   details,
                   self.alertServiceName)
    # the warning carries the very same text that went out with the alert
    self.logger.warning(details)


def notifyLargeData(self, aboveWarningThreshold, transferId, wflowName, dataSize, dataIn):
"""
Expand Down

0 comments on commit 1cd9eb1

Please sign in to comment.