Skip to content

Commit

Permalink
Remove lrms_config_hook and lrms_shutdown_hook from orte_lib.
Browse files Browse the repository at this point in the history
ORTE_LIB LM relies on the ORTE LaunchMethod's lrms_config_hook and
lrms_shutdown_hook. These are "always" called, as even in the ORTE_LIB
case we use ORTE for the sub-agent launch.
  • Loading branch information
Mark Santcroos committed Jan 9, 2017
1 parent e79fa66 commit 1968501
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 142 deletions.
4 changes: 0 additions & 4 deletions src/radical/pilot/agent/lm/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,12 @@ def lrms_config_hook(cls, name, cfg, lrms, logger):

from .fork import Fork
from .orte import ORTE
from .orte_lib import ORTELib
from .yarn import Yarn
from .spark import Spark

impl = {
LM_NAME_FORK : Fork,
LM_NAME_ORTE : ORTE,
LM_NAME_ORTE_LIB : ORTELib,
LM_NAME_YARN : Yarn,
LM_NAME_SPARK : Spark
}.get(name)
Expand All @@ -177,13 +175,11 @@ def lrms_shutdown_hook(cls, name, cfg, lrms, lm_info, logger):
raise TypeError("LaunchMethod shutdown hook only available to base class!")

from .orte import ORTE
from .orte_lib import ORTELib
from .yarn import Yarn
from .spark import Spark

impl = {
LM_NAME_ORTE : ORTE,
LM_NAME_ORTE_LIB : ORTELib,
LM_NAME_YARN : Yarn,
LM_NAME_SPARK : Spark
}.get(name)
Expand Down
142 changes: 4 additions & 138 deletions src/radical/pilot/agent/lm/orte_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,145 +31,11 @@ def __init__(self, cfg, session):

# --------------------------------------------------------------------------
#
@classmethod
def lrms_config_hook(cls, name, cfg, lrms, logger):
"""
FIXME: this config hook will manipulate the LRMS nodelist. Not a nice
thing to do, but hey... :P
What really should be happening is that the LRMS digs information
on node reservation out of the config and configures the node
list accordingly. This config hook should be limited to starting
the DVM.
"""

dvm_command = cls._which('orte-dvm')
if not dvm_command:
raise Exception("Couldn't find orte-dvm")

# Now that we found the orte-dvm, get ORTE version
orte_info = {}
oi_output = subprocess.check_output(['orte-info|grep "Open RTE"'], shell=True)
oi_lines = oi_output.split('\n')
for line in oi_lines:
if not line:
continue
key, val = line.split(':')
if 'Open RTE' == key.strip():
orte_info['version'] = val.strip()
elif 'Open RTE repo revision' == key.strip():
orte_info['version_detail'] = val.strip()
logger.info("Found Open RTE: %s / %s",
orte_info['version'], orte_info['version_detail'])

# Use (g)stdbuf to disable buffering.
# We need this to get the "DVM ready",
# without waiting for orte-dvm to complete.
# The command seems to be generally available on our Cray's,
# if not, we can code some home-coooked pty stuff.
stdbuf_cmd = cls._find_executable(['stdbuf', 'gstdbuf'])
if not stdbuf_cmd:
raise Exception("Couldn't find (g)stdbuf")
stdbuf_arg = "-oL"

# Base command = (g)stdbuf <args> + orte-dvm + debug_args
dvm_args = [stdbuf_cmd, stdbuf_arg, dvm_command]

# Additional (debug) arguments to orte-dvm
debug_strings = [
#'--debug-devel',
#'--mca odls_base_verbose 100',
#'--mca rml_base_verbose 100',
]
# Split up the debug strings into args and add them to the dvm_args
[dvm_args.extend(ds.split()) for ds in debug_strings]

vm_size = len(lrms.node_list)
logger.info("Starting ORTE DVM on %d nodes with '%s' ...", vm_size, ' '.join(dvm_args))
dvm_process = subprocess.Popen(dvm_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

dvm_uri = None
while True:

line = dvm_process.stdout.readline().strip()

if line.startswith('VMURI:'):

if len(line.split(' ')) != 2:
raise Exception("Unknown VMURI format: %s" % line)

label, dvm_uri = line.split(' ', 1)

if label != 'VMURI:':
raise Exception("Unknown VMURI format: %s" % line)

logger.info("ORTE DVM URI: %s" % dvm_uri)

elif line == 'DVM ready':

if not dvm_uri:
raise Exception("VMURI not found!")

logger.info("ORTE DVM startup successful!")
break

else:

# Check if the process is still around,
# and log output in debug mode.
if None == dvm_process.poll():
logger.debug("ORTE: %s" % line)
else:
# Process is gone: fatal!
raise Exception("ORTE DVM process disappeared")

# ----------------------------------------------------------------------
def _watch_dvm(dvm_process):

logger.info('starting DVM watcher')

while dvm_process.poll() is None:
line = dvm_process.stdout.readline().strip()
if line:
logger.debug('dvm output: %s' % line)
else:
time.sleep(1.0)

logger.info('DVM stopped (%d)' % dvm_process.returncode)
# TODO: Tear down everything?
# ----------------------------------------------------------------------

dvm_watcher = threading.Thread(target=_watch_dvm, args=(dvm_process,),
name="DVMWatcher")
dvm_watcher.daemon = True
dvm_watcher.start()

lm_info = {'dvm_uri' : dvm_uri,
'version_info': {name: orte_info}}

# we need to inform the actual LM instance about the DVM URI. So we
# pass it back to the LRMS which will keep it in an 'lm_info', which
# will then be passed as part of the opaque_slots via the scheduler
return lm_info


# --------------------------------------------------------------------------
# NOTE: ORTE_LIB LM relies on the ORTE LaunchMethod's lrms_config_hook and
# lrms_shutdown_hook. These are "always" called, as even in the ORTE_LIB
# case we use ORTE for the sub-agent launch.
#
@classmethod
def lrms_shutdown_hook(cls, name, cfg, lrms, lm_info, logger):
"""
This hook is symmetric to the config hook above, and is called during
shutdown sequence, for the sake of freeing allocated resources.
"""

if 'dvm_uri' in lm_info:
try:
logger.info('terminating dvm')
orterun = cls._which('orterun')
if not orterun:
raise Exception("Couldn't find orterun")
subprocess.Popen([orterun, "--hnp", lm_info['dvm_uri'], "--terminate"])
except Exception as e:
logger.exception('dmv termination failed')
# --------------------------------------------------------------------------


# --------------------------------------------------------------------------
Expand Down

0 comments on commit 1968501

Please sign in to comment.