Skip to content

Commit

Permalink
Merge pull request DIRACGrid#7302 from fstagni/80_fixes72
Browse files Browse the repository at this point in the history
[8.0] fix: sets jobStatus=Failed/Payload failed iff the job was running
  • Loading branch information
fstagni authored Nov 30, 2023
2 parents ea17c79 + dab5430 commit 70538bb
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 2 deletions.
10 changes: 8 additions & 2 deletions src/DIRAC/WorkloadManagementSystem/Agent/JobAgent.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from DIRAC.WorkloadManagementSystem.Client.MatcherClient import MatcherClient
from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient
from DIRAC.WorkloadManagementSystem.Client.JobManagerClient import JobManagerClient
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
from DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient import JobStateUpdateClient
from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport
from DIRAC.WorkloadManagementSystem.Client import JobStatus
Expand Down Expand Up @@ -691,7 +692,7 @@ def _checkSubmittedJobs(self):
payloadErrors = []
originalJobID = self.jobReport.jobID
for jobID, taskID in self.submissionDict.items():
if not taskID in self.computingElement.taskResults:
if taskID not in self.computingElement.taskResults:
continue

result = self.computingElement.taskResults[taskID]
Expand All @@ -714,7 +715,12 @@ def _checkSubmittedJobs(self):

# The payload failed (if result["Value"] is not 0)
elif result["Value"]:
self.jobReport.setJobStatus(status=JobStatus.FAILED, minorStatus="Payload failed")
# In order to avoid overriding perfectly valid states, the status is updated iff the job was running
res = JobMonitoringClient().getJobsStatus(jobID)
if not res["OK"]:
return res
if res["Value"][jobID]["Status"] == JobStatus.RUNNING:
self.jobReport.setJobStatus(status=JobStatus.FAILED, minorStatus="Payload failed")

# Do not keep running and do not overwrite the Payload error
message = f"Payload execution failed with error code {result['Value']}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import pytest
import time
from unittest.mock import MagicMock

from DIRAC import gLogger, S_OK, S_ERROR
from DIRAC.Core.Security.X509Chain import X509Chain # pylint: disable=import-error
Expand Down Expand Up @@ -498,6 +499,7 @@ def test_submitAndCheckJob(mocker, localCE, job, expectedResult1, expectedResult

mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobAgent.AgentModule.__init__")
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobAgent.JobAgent.am_stopExecution")
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobAgent.JobMonitoringClient", return_value=MagicMock())
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobAgent.createJobWrapper", return_value=S_OK([jobName]))
mocker.patch("DIRAC.Core.Security.X509Chain.X509Chain.dumpAllToString", return_value=S_OK())

Expand Down

0 comments on commit 70538bb

Please sign in to comment.