Skip to content

Commit

Permalink
Merge pull request #29135 from adewit/mp-backports-106x
Browse files Browse the repository at this point in the history
Tracker alignment: Backport millepede submission to Condor to 10_6_X
  • Loading branch information
cmsbuild authored Mar 10, 2020
2 parents 420b5d6 + 6e08833 commit 8a7ee75
Show file tree
Hide file tree
Showing 8 changed files with 63 additions and 72 deletions.
18 changes: 15 additions & 3 deletions Alignment/MillePedeAlignmentAlgorithm/scripts/mps_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@
cmdNotFound = 0
insuffPriv = 0
quotaspace = 0
copyerr=0
ispede=0

kill_reason = None
pedeLogErrStr = ""
Expand Down Expand Up @@ -103,6 +105,10 @@
# AP 26.11.2009 Insufficient privileges to rfcp files
if re.search(re.compile('stage_put: Insufficient user privileges',re.M), line):
insuffPriv = 1
if re.search(re.compile('Give up doing',re.M), line):
copyerr = 1
if re.search(re.compile('Directory content before',re.M),line):
ispede = 1
# AP 05.11.2015 Extract cpu-time.
# STDOUT doesn't contain NCU anymore. Now KSI2K and HS06 seconds are displayed.
# The ncuFactor is calculated from few samples by comparing KSI2K seconds with
Expand Down Expand Up @@ -218,7 +224,7 @@
nEvent = int(array[5])

if logZipped == 'true':
os.system('gzip '+eazeLog)
os.system('gzip -f '+eazeLog)

else: # no access to alignment.log
print('mps_check.py cannot find',eazeLog,'to test')
Expand Down Expand Up @@ -326,7 +332,7 @@
pedeLogWrnStr += line

if logZipped == 'true':
os.system('gzip '+eazeLog)
os.system('gzip -f '+eazeLog)
else:
print('mps_check.py cannot find',eazeLog,'to test')

Expand Down Expand Up @@ -354,7 +360,7 @@
pedeLogErr = 1
pedeLogErrStr += line
if logZipped == 'true':
os.system('gzip '+eazeLog)
os.system('gzip -f '+eazeLog)
else:
print('mps_check.py cannot find',eazeLog,'to test')

Expand Down Expand Up @@ -443,6 +449,12 @@
print(lib.JOBDIR[i],lib.JOBID[i],'Job not ended')
remark = 'job not ended'
okStatus = 'FAIL'
if copyerr == 1 and ispede!=1:
# Copy errors in a pede job can occur when a nonexistent file is commented out in alignment_merge.py but not in theScript.sh; in that case the copy error is *not* a failure.
print(lib.JOBDIR[i],lib.JOBID[i],'Copy to eos failed')
remark = 'copy to eos failed'
okStatus = 'FAIL'


# print warning line to stdout
if okStatus != "OK":
Expand Down
8 changes: 8 additions & 0 deletions Alignment/MillePedeAlignmentAlgorithm/scripts/mps_fire.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,14 @@ def write_HTCondor_submit_file_mille(path, script, lib, proxy_path=None):
+JobFlavour = "{flavour:s}"
"""
if "cafalca" in resources:
job_submit_template += """\
+CAFJob = True
+AccountingGroup = "group_u_CMS.CAF.ALCA"
# automatically remove the job if the submitter has no permissions to run a CAF Job
periodic_remove = !regexp("group_u_CMS.CAF.ALCA", AccountingGroup) && CAFJob =?= True
"""

if proxy_path is not None:
job_submit_template += """\
+x509userproxy = "{proxy:s}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ BEGIN

# parse the arguments
while (@ARGV) {
$arg = shift(ARGV);
$arg = shift(@ARGV);
if ($arg =~ /\A-/) { # check for option
if ($arg =~ "h") {
$helpwanted = 1;
Expand Down
10 changes: 7 additions & 3 deletions Alignment/MillePedeAlignmentAlgorithm/scripts/mps_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,13 @@
allowed_mille_classes = ("lxplus", "cmscaf1nh", "cmscaf1nd", "cmscaf1nw",
"cmscafspec1nh", "cmscafspec1nd", "cmscafspec1nw",
"8nm", "1nh", "8nh", "1nd", "2nd", "1nw", "2nw",
"cmsexpress","htcondor_espresso","htcondor_microcentury",
"htcondor_longlunch","htcondor_workday","htcondor_tomorrow",
"htcondor_testmatch","htcondor_nextweek")
"cmsexpress","htcondor_cafalca_espresso","htcondor_espresso",
"htcondor_cafalca_microcentury","htcondor_microcentury",
"htcondor_cafalca_longlunch", "htcondor_longlunch",
"htcondor_cafalca_workday", "htcondor_workday",
"htcondor_cafalca_tomorrow", "htcondor_tomorrow",
"htcondor_cafalca_testmatch", "htcondor_testmatch",
"htcondor_cafalca_nextweek", "htcondor_nextweek")
if lib.get_class("mille") not in allowed_mille_classes:
print("Bad job class for mille in class", args.job_class)
print("Allowed classes:")
Expand Down
71 changes: 19 additions & 52 deletions Alignment/MillePedeAlignmentAlgorithm/scripts/mps_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,39 +59,16 @@ def fill_time_info(mps_index, status, cpu_time):


################################################################################
# deal with submitted jobs by looking into output of shell (bjobs/condor_q)
# deal with submitted jobs by looking into output of shell (condor_q)
if len(submitted_jobs) > 0:
job_status = {}
if "htcondor" in lib.get_class("pede") or "htcondor" in lib.get_class("mille"):
condor_q = subprocess.check_output(["condor_q", "-af:j",
"JobStatus", "RemoteSysCpu"],
stderr = subprocess.STDOUT)
for line in condor_q.splitlines():
job_id, status, cpu_time = line.split()
job_status[job_id] = {"status": htcondor_jobstatus[status],
"cpu": float(cpu_time)}

bjobs = subprocess.check_output(["bjobs", "-l", "-a"],
stderr = subprocess.STDOUT)
bjobs = bjobs.replace("\n","")

job_regex = re.compile(r"Job<(\d+?)>,")
status_regex = re.compile(r"Status<([A-Z]+?)>")
cputime_regex = re.compile(r"TheCPUtimeusedis(\d+(\.\d+)?)seconds")
if bjobs != "No job found":
results = bjobs.replace(" ","").split("-----------------------")
for line in results:
if len(line.strip()) == 0: continue
# extract jobID
job_id = job_regex.search(line).group(1)
# extract job status
status = status_regex.search(line).group(1)
# extract CPU time (only present for finished job)
match = cputime_regex.search(line)
cpu_time = float(match.group(1)) if match else 0
print("out ", job_id, " ", status, " ", cpu_time)
job_status[job_id] = {"status": status,
"cpu": cpu_time}
condor_q = subprocess.check_output(["condor_q", "-af:j",
"JobStatus", "RemoteSysCpu"],
stderr = subprocess.STDOUT)
for line in condor_q.splitlines():
job_id, status, cpu_time = line.split()
job_status[job_id] = {"status": htcondor_jobstatus[status],
"cpu": float(cpu_time)}

for job_id, job_info in six.iteritems(job_status):
mps_index = submitted_jobs.get(job_id, -1)
Expand Down Expand Up @@ -119,30 +96,20 @@ def fill_time_info(mps_index, status, cpu_time):
disabled = "DISABLED" if "DISABLED" in lib.JOBSTATUS[mps_index] else ""
print(" DB job ", job_id, mps_index)

# check if job may be done by looking if a folder exists in the project directory.
# if True -> jobstatus is set to DONE
theBatchDirectory = "LSFJOB_"+job_id
if os.path.isdir(theBatchDirectory):
print("Directory ", theBatchDirectory, "exists")
lib.JOBSTATUS[mps_index] = disabled + "DONE"
# check if it is a HTCondor job already moved to "history"
userlog = os.path.join("jobData", lib.JOBDIR[mps_index], "HTCJOB")
condor_h = subprocess.check_output(["condor_history", job_id, "-limit", "1",
"-userlog", userlog,
"-af:j", "JobStatus", "RemoteSysCpu"],
stderr = subprocess.STDOUT)
if len(condor_h.strip()) > 0:
job_id, status, cpu_time = condor_h.split()
status = htcondor_jobstatus[status]
lib.JOBSTATUS[mps_index] = disabled + status
fill_time_info(mps_index, status, float(cpu_time))
submitted_jobs.pop(job_id)
continue

# check if it is a HTCondor job already moved to "history"
elif "htcondor" in lib.get_class("pede") or "htcondor" in lib.get_class("mille"):
userlog = os.path.join("jobData", lib.JOBDIR[mps_index], "HTCJOB")
condor_h = subprocess.check_output(["condor_history", job_id, "-limit", "1",
"-userlog", userlog,
"-af:j", "JobStatus", "RemoteSysCpu"],
stderr = subprocess.STDOUT)
if len(condor_h.strip()) > 0:
job_id, status, cpu_time = condor_h.split()
status = htcondor_jobstatus[status]
lib.JOBSTATUS[mps_index] = disabled + status
fill_time_info(mps_index, status, float(cpu_time))
submitted_jobs.pop(job_id)
continue

if "RUN" in lib.JOBSTATUS[mps_index]:
print("WARNING: Job ", mps_index, end=' ')
print("in state RUN, neither found by htcondor, nor bjobs, nor find", end=' ')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@


[general]
classInf = cmscaf1nd:htcondor_bigmem_testmatch
classInf = htcondor_cafalca_workday:htcondor_bigmem_testmatch
jobname = MillePedeCampaign
pedeMem = 32000
datasetdir = /afs/cern.ch/cms/CAF/CMSALCA/ALCA_TRACKERALIGN/MP/MPproduction/datasetfiles
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ trap clean_up HUP INT TERM SEGV USR2 XCPU XFSZ IO
# a helper function to repeatedly try failing copy commands
untilSuccess () {
# trying "${1} ${2} ${3} > /dev/null" until success, if ${4} is a
# positive number run ${1} with -f flag,
# positive number run ${1} with -f flag and using --cksum md5,
# break after ${5} tries (with four arguments do up to 5 tries).
if [[ ${#} -lt 4 || ${#} -gt 5 ]]
then
Expand All @@ -43,7 +43,7 @@ untilSuccess () {

if [[ ${4} -gt 0 ]]
then
${1} -f ${2} ${3} > /dev/null
${1} -f --cksum md5 ${2} ${3} > /dev/null
else
${1} ${2} ${3} > /dev/null
fi
Expand All @@ -53,7 +53,7 @@ untilSuccess () {
then # ... but not until infinity!
if [[ ${4} -gt 0 ]]
then
echo ${0}: Give up doing \"${1} -f ${2} ${3} \> /dev/null\".
echo ${0}: Give up doing \"${1} -f --cksum md5 ${2} ${3} \> /dev/null\".
return 1
else
echo ${0}: Give up doing \"${1} ${2} ${3} \> /dev/null\".
Expand All @@ -63,9 +63,9 @@ untilSuccess () {
TRIES=$((${TRIES}+1))
if [[ ${4} -gt 0 ]]
then
echo ${0}: WARNING, problems with \"${1} -f ${2} ${3} \> /dev/null\", try again.
echo ${0}: WARNING, problems with \"${1} -f --cksum md5 ${2} ${3} \> /dev/null\", try again.
sleep $((${TRIES}*5)) # before each retry, wait a little longer...
${1} -f ${2} ${3} > /dev/null
${1} -f --cksum md5 ${2} ${3} > /dev/null
else
echo ${0}: WARNING, problems with \"${1} ${2} ${3} \> /dev/null\", try again.
sleep $((${TRIES}*5)) # before each retry, wait a little longer...
Expand All @@ -75,7 +75,7 @@ untilSuccess () {

if [[ ${4} -gt 0 ]]
then
echo successfully executed \"${1} -f ${2} ${3} \> /dev/null\"
echo successfully executed \"${1} -f --cksum md5 ${2} ${3} \> /dev/null\"
else
echo successfully executed \"${1} ${2} ${3} \> /dev/null\"
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ trap clean_up HUP INT TERM SEGV USR2 XCPU XFSZ IO
# a helper function to repeatedly try failing copy commands
untilSuccess () {
# trying "${1} ${2} ${3} > /dev/null" until success, if ${4} is a
# positive number run ${1} with -f flag,
# positive number run ${1} with -f flag and using --cksum md5,
# break after ${5} tries (with four arguments do up to 5 tries).
if [[ ${#} -lt 4 || ${#} -gt 5 ]]
then
Expand All @@ -76,7 +76,7 @@ untilSuccess () {

if [[ ${4} -gt 0 ]]
then
${1} -f ${2} ${3} > /dev/null
${1} -f --cksum md5 ${2} ${3} > /dev/null
else
${1} ${2} ${3} > /dev/null
fi
Expand All @@ -86,7 +86,7 @@ untilSuccess () {
then # ... but not until infinity!
if [[ ${4} -gt 0 ]]
then
echo ${0}: Give up doing \"${1} -f ${2} ${3} \> /dev/null\".
echo ${0}: Give up doing \"${1} -f --cksum md5 ${2} ${3} \> /dev/null\".
return 1
else
echo ${0}: Give up doing \"${1} ${2} ${3} \> /dev/null\".
Expand All @@ -96,9 +96,9 @@ untilSuccess () {
TRIES=$((${TRIES}+1))
if [[ ${4} -gt 0 ]]
then
echo ${0}: WARNING, problems with \"${1} -f ${2} ${3} \> /dev/null\", try again.
echo ${0}: WARNING, problems with \"${1} -f --cksum md5 ${2} ${3} \> /dev/null\", try again.
sleep $((${TRIES}*5)) # before each retry, wait a little longer...
${1} -f ${2} ${3} > /dev/null
${1} -f --cksum md5 ${2} ${3} > /dev/null
else
echo ${0}: WARNING, problems with \"${1} ${2} ${3} \> /dev/null\", try again.
sleep $((${TRIES}*5)) # before each retry, wait a little longer...
Expand All @@ -108,7 +108,7 @@ untilSuccess () {

if [[ ${4} -gt 0 ]]
then
echo successfully executed \"${1} -f ${2} ${3} \> /dev/null\"
echo successfully executed \"${1} -f --cksum md5 ${2} ${3} \> /dev/null\"
else
echo successfully executed \"${1} ${2} ${3} \> /dev/null\"
fi
Expand Down

0 comments on commit 8a7ee75

Please sign in to comment.