Skip to content

Commit

Permalink
Merge branch 'worleyph/cime/update_provenance_collection' (PR #1429)
Browse files Browse the repository at this point in the history
Update provenance data collection:
a) Update mach_syslog for other platforms to match that for Anvil

When adding mach_syslog for Anvil, the logic was refactored,
decreasing the amount of data collected. Introduce similar changes
for mach_syslog for the other systems. The system-specific
commands for querying system and job status were also examined,
and some of the command choices and options modified.

b) Modify collection of provenance data on all supported systems

Disable collection of xtdb2proc and mdiag system configuration
data and qstat -f system load data on Titan. (May want to re-enable
in the future, so just commenting out, not deleting.)

Eliminate collection of some redundant or irrelevant data on SLURM
systems at NERSC, and add collection of some data on system
configuration and node usage by other jobs. Also fix error caused
by changed name of ACME job script output on NERSC systems.

Anvil support in provenance.py was lost inadvertently. Add it
back. Also eliminate some unnecessary detail in the provenance data
collected on Anvil.

Eliminate some unnecessary detail in the provenance data collected
on Mira and Cetus.

[BFB]

Fixes #1417
  • Loading branch information
amametjanov committed Apr 27, 2017
2 parents 9933df5 + 765dad4 commit c37feff
Show file tree
Hide file tree
Showing 8 changed files with 165 additions and 91 deletions.
5 changes: 4 additions & 1 deletion config/acme/machines/syslog.anvil
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
/bin/cp --preserve=timestamps $run/acme.log.$lid $dir/acme.log.$lid.$remaining
qstat -r acme > $dir/qstatr.$lid.$remaining
qstat -1 -n acme > $dir/qstatn.$lid.$remaining

while ($remaining > 0)
echo "Wallclock time remaining: $remaining" >> $dir/atm.log.$lid.step
Expand All @@ -41,7 +43,8 @@ while ($remaining > 0)
echo "Wallclock time remaining: $remaining" >> $dir/cpl.log.$lid.step
tail -n 4 $dir/cpl.log.$lid.step-all >> $dir/cpl.log.$lid.step
/bin/cp --preserve=timestamps -u $timing/* $dir
qstat -f acme > $dir/qstatf.$lid.$remaining
qstat -r acme > $dir/qstatr.$lid.$remaining
qstat -1 -n acme > $dir/qstatn.$lid.$remaining
chmod a+r $dir/*
sleep $sample_interval
set remaining = `qstat -f $jid | grep -F Walltime.Remaining | sed 's/ *Walltime.Remaining = *\([0-9]*\) */\1/' `
Expand Down
28 changes: 18 additions & 10 deletions config/acme/machines/syslog.cetus
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# cetus syslog script:
# mach_syslog <sampling interval (in seconds)> <job identifier> <timestamp> <run directory> <timing directory> <output directory>

set sec = 0
set sample_interval = $1
set jid = $2
set lid = $3
Expand All @@ -12,13 +11,14 @@ set dir = $6

# wait until output file is nonempty before checking remaining time
# (note that calling script 'touch'es the acme log file before spawning this script, so that 'wc' does not fail)
set ncpus = `qstat -lf $jid | grep -F Procs | sed 's/^ *Procs *: *\([0-9]*\).*/\1/' `
set outlth = 0
while ($outlth < 1)
while ($outlth < $ncpus)
sleep 10
set outlth = `wc \-l $run/acme.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

set TimeRemaining = `qstat -lf $jid | grep TimeRemaining | sed 's/^ *TimeRemaining *: *\([0-9]*:[0-9]*:[0-9]*\) */\1/' `
set TimeRemaining = `qstat -lf $jid | grep -F TimeRemaining | sed 's/^ *TimeRemaining *: *\([0-9]*:[0-9]*:[0-9]*\) */\1/' `
set rem_hours = `echo $TimeRemaining | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set rem_mins = `echo $TimeRemaining | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set rem_secs = `echo $TimeRemaining | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
Expand All @@ -32,16 +32,24 @@ EOF1
/bin/cp --preserve=timestamps $run/acme.log.$lid $dir/acme.log.$lid.$remaining

while ($remaining > 0)
grep -a -i -e "nstep" -e "model date" $run/*atm.log.$lid | tail > $dir/atm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/acme.log.$lid | tail > $dir/acme.log.$lid.nstep.$remaining
grep -a -i -e "timestep" -e "model date" $run/*lnd.log.$lid | tail > $dir/lnd.log.$lid.timestep.$remaining
grep -a -i -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail > $dir/ocn.log.$lid.stepnum.$remaining
grep -a -i -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail > $dir/ice.log.$lid.istep.$remaining
grep -a -i "model date" $run/*cpl.log.$lid | tail > $dir/cpl.log.$lid.modeldata.$remaining
echo "Wallclock time remaining: $remaining" >> $dir/atm.log.$lid.step
grep -Fa -e "nstep" -e "model date" $run/*atm.log.$lid | tail -n 4 >> $dir/atm.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/lnd.log.$lid.step
grep -Fa -e "timestep" -e "model date" $run/*lnd.log.$lid | tail -n 4 >> $dir/lnd.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ocn.log.$lid.step
grep -Fa -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail -n 4 >> $dir/ocn.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ice.log.$lid.step
grep -Fa -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail -n 4 >> $dir/ice.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/rof.log.$lid.step
grep -Fa "model date" $run/*rof.log.$lid | tail -n 4 >> $dir/rof.log.$lid.step
grep -Fa "model date" $run/*cpl.log.$lid > $dir/cpl.log.$lid.step-all
echo "Wallclock time remaining: $remaining" >> $dir/cpl.log.$lid.step
tail -n 4 $dir/cpl.log.$lid.step-all >> $dir/cpl.log.$lid.step
/bin/cp --preserve=timestamps -u $timing/* $dir
qstat -f > $dir/qstatf.$lid.$remaining
chmod a+r $dir/*
sleep $sample_interval
set TimeRemaining = `qstat -lf $jid | grep TimeRemaining | sed 's/^ *TimeRemaining *: *\([0-9]*:[0-9]*:[0-9]*\) */\1/' `
set TimeRemaining = `qstat -lf $jid | grep -F TimeRemaining | sed 's/^ *TimeRemaining *: *\([0-9]*:[0-9]*:[0-9]*\) */\1/' `
set rem_hours = `echo $TimeRemaining | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set rem_mins = `echo $TimeRemaining | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set rem_secs = `echo $TimeRemaining | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
Expand Down
42 changes: 26 additions & 16 deletions config/acme/machines/syslog.cori-haswell
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#!/bin/csh -f
# cori-haswell syslog script:
# mach_syslog <sampling interval (in seconds)> <job identifier> <time stamp> <run directory> <timing directory> <output directory>
# mach_syslog <sampling interval (in seconds)> <job identifier> <timestamp> <run directory> <timing directory> <output directory>

set sec = 0
set sample_interval = $1
set jid = $2
set lid = $3
Expand All @@ -12,14 +11,14 @@ set dir = $6

# wait until job mapping information is output before saving output file
# (note that calling script 'touch'es the acme log file before spawning this script, so that 'wc' does not fail)
set nnodes = `sqs -f $jid | grep NumNodes | sed 's/^ *NumNodes= *\([0-9]*\).*/\1/' `
set ncpus = `sqs -f $jid | grep -F NumCPUs | sed 's/^ *NumNodes= *\([0-9]*\) *NumCPUs=*\([0-9]*\).*/\2/' `
set outlth = 0
while ($outlth < $nnodes)
while ($outlth < $ncpus)
sleep 10
set outlth = `wc \-l $run/acme.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

set TimeLimit = `sqs -f $jid | grep TimeLimit | sed 's/^ *RunTime=.*TimeLimit=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set TimeLimit = `sqs -f $jid | grep -F TimeLimit | sed 's/^ *RunTime=.*TimeLimit=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set limit_hours = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set limit_mins = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set limit_secs = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
Expand All @@ -28,7 +27,7 @@ if ("X$limit_mins" == "X") set limit_mins = 0
if ("X$limit_secs" == "X") set limit_secs = 0
@ limit = 3600 * $limit_hours + 60 * $limit_mins + $limit_secs

set RunTime = `sqs -f $jid | grep RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set RunTime = `sqs -f $jid | grep -F RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set runt_hours = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set runt_mins = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set runt_secs = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
Expand All @@ -42,20 +41,32 @@ cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
/bin/cp --preserve=timestamps $run/acme.log.$lid $dir/acme.log.$lid.$remaining
squeue -t R -o "%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l" > $dir/squeuef.$lid.$remaining
squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
# squeue -t R -o "%.10i %R" > $dir/squeueR.$lid.$remaining

while ($remaining > 0)
grep -a -i -e "nstep" -e "model date" $run/*atm.log.$lid | tail > $dir/atm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/acme.log.$lid | tail > $dir/acme.log.$lid.nstep.$remaining
grep -a -i -e "timestep" -e "model date" $run/*lnd.log.$lid | tail > $dir/lnd.log.$lid.timestep.$remaining
grep -a -i -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail > $dir/ocn.log.$lid.stepnum.$remaining
grep -a -i -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail > $dir/ice.log.$lid.istep.$remaining
grep -a -i "model date" $run/*cpl.log.$lid | tail > $dir/cpl.log.$lid.modeldata.$remaining
echo "Wallclock time remaining: $remaining" >> $dir/atm.log.$lid.step
grep -Fa -e "nstep" -e "model date" $run/*atm.log.$lid | tail -n 4 >> $dir/atm.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/lnd.log.$lid.step
grep -Fa -e "timestep" -e "model date" $run/*lnd.log.$lid | tail -n 4 >> $dir/lnd.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ocn.log.$lid.step
grep -Fa -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail -n 4 >> $dir/ocn.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ice.log.$lid.step
grep -Fa -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail -n 4 >> $dir/ice.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/rof.log.$lid.step
grep -Fa "model date" $run/*rof.log.$lid | tail -n 4 >> $dir/rof.log.$lid.step
grep -Fa "model date" $run/*cpl.log.$lid > $dir/cpl.log.$lid.step-all
echo "Wallclock time remaining: $remaining" >> $dir/cpl.log.$lid.step
tail -n 4 $dir/cpl.log.$lid.step-all >> $dir/cpl.log.$lid.step
/bin/cp --preserve=timestamps -u $timing/* $dir
# xtnodestat > $dir/xtnodestat.$lid.$remaining
sqs -w -a > $dir/sqsw.$lid.$remaining
# sqs -w -a | grep "^[0-9]* *R *"> $dir/sqswr.$lid.$remaining
squeue -t R -o "%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l" > $dir/squeuef.$lid.$remaining
squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
# squeue -t R -o "%.10i %R" > $dir/squeueR.$lid.$remaining
chmod a+r $dir/*
sleep $sample_interval
set RunTime = `sqs -f $jid | grep RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set RunTime = `sqs -f $jid | grep -F RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set runt_hours = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set runt_mins = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set runt_secs = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
Expand All @@ -69,4 +80,3 @@ $remaining $sample_interval
EOF2

end

42 changes: 26 additions & 16 deletions config/acme/machines/syslog.cori-knl
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#!/bin/csh -f
# cori-knl syslog script:
# mach_syslog <sampling interval (in seconds)> <job identifier> <time stamp> <run directory> <timing directory> <output directory>
# mach_syslog <sampling interval (in seconds)> <job identifier> <timestamp> <run directory> <timing directory> <output directory>

set sec = 0
set sample_interval = $1
set jid = $2
set lid = $3
Expand All @@ -12,14 +11,14 @@ set dir = $6

# wait until job mapping information is output before saving output file
# (note that calling script 'touch'es the acme log file before spawning this script, so that 'wc' does not fail)
set nnodes = `sqs -f $jid | grep NumNodes | sed 's/^ *NumNodes= *\([0-9]*\).*/\1/' `
set ncpus = `sqs -f $jid | grep -F NumCPUs | sed 's/^ *NumNodes= *\([0-9]*\) *NumCPUs=*\([0-9]*\).*/\2/' `
set outlth = 0
while ($outlth < $nnodes)
while ($outlth < $ncpus)
sleep 10
set outlth = `wc \-l $run/acme.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

set TimeLimit = `sqs -f $jid | grep TimeLimit | sed 's/^ *RunTime=.*TimeLimit=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set TimeLimit = `sqs -f $jid | grep -F TimeLimit | sed 's/^ *RunTime=.*TimeLimit=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set limit_hours = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set limit_mins = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set limit_secs = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
Expand All @@ -28,7 +27,7 @@ if ("X$limit_mins" == "X") set limit_mins = 0
if ("X$limit_secs" == "X") set limit_secs = 0
@ limit = 3600 * $limit_hours + 60 * $limit_mins + $limit_secs

set RunTime = `sqs -f $jid | grep RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set RunTime = `sqs -f $jid | grep -F RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set runt_hours = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set runt_mins = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set runt_secs = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
Expand All @@ -42,20 +41,32 @@ cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
/bin/cp --preserve=timestamps $run/acme.log.$lid $dir/acme.log.$lid.$remaining
squeue -t R -o "%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l" > $dir/squeuef.$lid.$remaining
squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
# squeue -t R -o "%.10i %R" > $dir/squeueR.$lid.$remaining

while ($remaining > 0)
grep -a -i -e "nstep" -e "model date" $run/*atm.log.$lid | tail > $dir/atm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/acme.log.$lid | tail > $dir/acme.log.$lid.nstep.$remaining
grep -a -i -e "timestep" -e "model date" $run/*lnd.log.$lid | tail > $dir/lnd.log.$lid.timestep.$remaining
grep -a -i -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail > $dir/ocn.log.$lid.stepnum.$remaining
grep -a -i -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail > $dir/ice.log.$lid.istep.$remaining
grep -a -i "model date" $run/*cpl.log.$lid | tail > $dir/cpl.log.$lid.modeldata.$remaining
echo "Wallclock time remaining: $remaining" >> $dir/atm.log.$lid.step
grep -Fa -e "nstep" -e "model date" $run/*atm.log.$lid | tail -n 4 >> $dir/atm.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/lnd.log.$lid.step
grep -Fa -e "timestep" -e "model date" $run/*lnd.log.$lid | tail -n 4 >> $dir/lnd.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ocn.log.$lid.step
grep -Fa -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail -n 4 >> $dir/ocn.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ice.log.$lid.step
grep -Fa -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail -n 4 >> $dir/ice.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/rof.log.$lid.step
grep -Fa "model date" $run/*rof.log.$lid | tail -n 4 >> $dir/rof.log.$lid.step
grep -Fa "model date" $run/*cpl.log.$lid > $dir/cpl.log.$lid.step-all
echo "Wallclock time remaining: $remaining" >> $dir/cpl.log.$lid.step
tail -n 4 $dir/cpl.log.$lid.step-all >> $dir/cpl.log.$lid.step
/bin/cp --preserve=timestamps -u $timing/* $dir
# xtnodestat > $dir/xtnodestat.$lid.$remaining
sqs -w -a > $dir/sqsw.$lid.$remaining
# sqs -w -a | grep "^[0-9]* *R *"> $dir/sqswr.$lid.$remaining
squeue -t R -o "%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l" > $dir/squeuef.$lid.$remaining
squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
# squeue -t R -o "%.10i %R" > $dir/squeueR.$lid.$remaining
chmod a+r $dir/*
sleep $sample_interval
set RunTime = `sqs -f $jid | grep RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set RunTime = `sqs -f $jid | grep -F RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set runt_hours = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set runt_mins = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set runt_secs = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
Expand All @@ -69,4 +80,3 @@ $remaining $sample_interval
EOF2

end

Loading

0 comments on commit c37feff

Please sign in to comment.