-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcondor_node_check
executable file
·82 lines (61 loc) · 2.04 KB
/
condor_node_check
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# Worker node health check script, to run from condor startd
from mc_util import *
import sys, os, stat
import memcache
import socket
import subprocess
## get host info
# Host name of the machine this script runs on.
hostname = socket.gethostname()
# shortname() is not defined in this file; presumably it is provided by the
# `from mc_util import *` star import -- TODO confirm. The result is used
# below to build the key passed to mc_get().
node = shortname(hostname)
## Define directories to test
# NOTE(review): SCRATCH_DIR and TMP_DIR are not referenced by any check
# visible in this file.
SCRATCH_DIR = "/scratch"
TMP_DIR = "/tmp"
ROOT_DIR = "/"
# Name of the throw-away file handed to can_write() together with EXE_DIR.
TMP_FILE = "node_check"
EXE_DIR = "/var/lib/condor/execute"
## Status reporting subroutines (print NodeOnline attributes on stdout)
def setOffline(msg):
    """Report the node as offline, with a reason, on standard output.

    Prints two ``Attribute = value`` lines:
    ``NodeOnline = false`` and ``NodeOnlineReason = '<msg>'``.

    msg -- human-readable reason; inserted verbatim, without any escaping.
    """
    # The single-argument print(...) form emits the same bytes under
    # Python 2 and Python 3; the bare print statement is Python 2 only.
    print("NodeOnline = false")
    # NOTE(review): the value is wrapped in single quotes; ClassAd string
    # literals are normally double-quoted -- confirm the consumer accepts
    # this before changing the emitted text.
    # (msg,) keeps the formatting correct even if msg is itself a tuple.
    print("NodeOnlineReason = '%s'" % (msg,))
def setOnline():
    """Report the node as online by printing ``NodeOnline = true`` on stdout."""
    # The single-argument print(...) form emits the same bytes under
    # Python 2 and Python 3; the bare print statement is Python 2 only.
    print("NodeOnline = true")
## Manual override: look up "<node>.manualstatus" before running any check
STATUS = mc_get(node + ".manualstatus")
if STATUS == 'offline':
    # Stop here without printing any NodeOnline attribute and without
    # running the checks below.
    sys.exit()
## Tests
def can_write(mypath, myfile):
    """Check that a file can be created in (and removed from) a directory.

    mypath -- directory to probe; if it does not exist the check is skipped.
    myfile -- name of the throw-away probe file created inside mypath.

    If a regular file with that name already exists it is left untouched and
    the check is skipped, so existing data is never clobbered.

    On failure the node is reported offline via setOffline() and the script
    exits (SystemExit).  Returns None when the check passes or is skipped.
    """
    if not os.path.exists(mypath):
        return
    probe = os.path.join(mypath, myfile)
    if os.path.isfile(probe):
        return
    try:
        # open() raises when the directory is not writable (read-only or full
        # file system, wrong permissions); its return value says nothing, so
        # the failure has to be detected through the exception.
        with open(probe, 'w'):
            pass
        # Clean up inside the try: a probe that cannot be removed also means
        # the directory is not usable.
        os.remove(probe)
    except (IOError, OSError):
        setOffline("Cannot write to %s" % mypath)
        sys.exit()
def correct_perms(mypath, perms):
    """Check that a path carries exactly the expected permission bits.

    mypath -- path to inspect; if it does not exist the check is skipped.
    perms  -- expected mode written with octal digits, e.g. 1777 (sticky bit
              plus rwx for everyone).  The digits are interpreted as octal,
              so the plain integer 1777 means mode 0o1777.

    On mismatch the node is reported offline via setOffline() and the script
    exits (SystemExit).  Returns None when the check passes or is skipped.
    """
    if not os.path.exists(mypath):
        return
    expected = int(str(perms), 8)
    # S_IMODE keeps the permission bits including setuid/setgid/sticky, so a
    # mode such as 1777 can actually be matched.
    actual = stat.S_IMODE(os.stat(mypath).st_mode)
    if actual != expected:
        setOffline("Permissions on %s should be %04o, are %04o"
                   % (mypath, expected, actual))
        sys.exit()
def disk_free(mypath, amount):
    """Check that the file system holding a path has enough free space.

    mypath -- path on the file system to inspect; if it does not exist the
              check is skipped.
    amount -- minimum free space required, in whole GB (1024**3 bytes).

    The space counted is what is available to unprivileged users.  If it is
    below `amount`, the node is reported offline via setOffline() and the
    script exits (SystemExit).  Returns None when the check passes or is
    skipped.
    """
    if not os.path.exists(mypath):
        return
    # Ask the kernel directly instead of parsing human-readable `df` output,
    # whose unit suffix (K/M/G/T) and line wrapping vary.
    fs = os.statvfs(mypath)
    available = (fs.f_bavail * fs.f_frsize) // (1024 ** 3)
    if available < int(amount):
        setOffline("Disk space available on %s is %s GB" % (mypath, available))
        sys.exit()
## run the checks
# can_write() takes (directory, file name): probe the execute directory with
# the throw-away file, not the other way round.
can_write(EXE_DIR, TMP_FILE)
correct_perms(EXE_DIR, 1777)
disk_free(ROOT_DIR, 5)  # unit is GB
disk_free(EXE_DIR, 5)  # unit is GB
## if we pass all the checks, set the node to online status
setOnline()
sys.exit()