Skip to content

Commit

Permalink
fix: change periodic_remove condition and fix parseCondorStatus
Browse files Browse the repository at this point in the history
  • Loading branch information
aldbr committed Jun 28, 2023
1 parent 7415c90 commit 2e963c4
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 13 deletions.
26 changes: 15 additions & 11 deletions src/DIRAC/Resources/Computing/BatchSystems/Condor.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
5: "Failed",
}

HOLD_REASON_SUBCODE = 55
HOLD_REASON_SUBCODE = "55"

subTemplate = """
# Environment
Expand Down Expand Up @@ -71,7 +71,7 @@
# A random subcode to identify who put the job on hold
on_exit_hold_subcode = %(holdReasonSubcode)s
# Jobs are then deleted from the system after N days if they are not running
period_remove = (JobStatus != 2) && (time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600)
period_remove = (JobStatus != 1) && (JobStatus != 2) && ((time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600))
# Specific options
# ----------------
Expand All @@ -94,13 +94,17 @@ def parseCondorStatus(lines, jobID):
:returns: Status as known by DIRAC, and a reason if the job is being held
"""
jobID = str(jobID)

holdReason = ""
for line in lines:
l = line.strip().split()

# Make sure the status is present and is an integer
try:
status = int(l[1])
except (ValueError, IndexError):
continue
holdReason = ""

if l[0] == jobID:
# A job can be held for various reasons; we need to investigate further using the holdReasonCode & holdReasonSubCode
# Details in:
Expand All @@ -110,22 +114,22 @@ def parseCondorStatus(lines, jobID):
# By default, a held (5) job is defined as Aborted, but there might be some exceptions
status = 3
try:
holdReasonCode = int(l[2])
holdReasonSubcode = int(l[3])
holdReason = l[4:]
except (ValueError, IndexError):
holdReasonCode = l[2]
holdReasonSubcode = l[3]
holdReason = " ".join(l[4:])
except IndexError:
# This should not happen in theory
# If it does, just set the status to unknown
status = -1
holdReasonCode = -1
holdReasonSubcode = -1
holdReasonCode = "undefined"
holdReasonSubcode = "undefined"

# If holdReasonCode is 3 (The PERIODIC_HOLD expression evaluated to True. Or, ON_EXIT_HOLD was true)
# And subcode is HOLD_REASON_SUBCODE, then it means the job failed by itself, it needs to be marked as Failed
if holdReasonCode == 3 and holdReasonSubcode == HOLD_REASON_SUBCODE:
if holdReasonCode == "3" and holdReasonSubcode == HOLD_REASON_SUBCODE:
status = 5
# If holdReasonCode is 16 (Input files are being spooled), the job should be marked as Waiting
elif holdReasonCode == 16:
elif holdReasonCode == "16":
status = 1

return (STATES_MAP.get(status, "Unknown"), holdReason)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ def getJobStatus(self, jobIDList):

qList = []
for _condorIDs in breakListIntoChunks(condorIDs.values(), 100):
# This will return a list of 1245.75 3
# This will return a list of 1245.75 3 undefined undefined undefined
cmd = ["condor_q"]
cmd.extend(self.remoteScheddOptions.strip().split(" "))
cmd.extend(_condorIDs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,12 @@ def test_parseCondorStatus():
"foo": "Unknown",
"104096.1": "Aborted",
"104096.2": "Aborted",
"104096.3": "Unknown",
"104096.3": "Aborted",
"104096.4": "Unknown",
}
import pdb

pdb.set_trace()
for jobID, expected in expectedResults.items():
print(jobID, expected)
assert HTCE.parseCondorStatus(statusLines, jobID)[0] == expected
Expand Down

0 comments on commit 2e963c4

Please sign in to comment.