From 50d18577af28b6b053c0d446c959b42b5300a0f8 Mon Sep 17 00:00:00 2001
From: Chris McDonough <chrism@plope.com>
Date: Sun, 3 Aug 2014 23:35:43 -0400
Subject: [PATCH] work towards faster start and stop (#131)

---
 supervisor/rpcinterface.py             | 183 ++++++++++++++-----------
 supervisor/tests/base.py               |   3 +
 supervisor/tests/test_rpcinterfaces.py |  20 +--
 3 files changed, 118 insertions(+), 88 deletions(-)

diff --git a/supervisor/rpcinterface.py b/supervisor/rpcinterface.py
index b76d841dd..0cb402c00 100644
--- a/supervisor/rpcinterface.py
+++ b/supervisor/rpcinterface.py
@@ -2,6 +2,7 @@
 import time
 import datetime
 import errno
+import types
 
 from supervisor.compat import as_string
 from supervisor.compat import unicode
@@ -27,7 +28,10 @@
 from supervisor.states import getSupervisorStateDescription
 from supervisor.states import ProcessStates
 from supervisor.states import getProcessStateDescription
-from supervisor.states import RUNNING_STATES
+from supervisor.states import (
+    RUNNING_STATES,
+    STOPPED_STATES,
+    )
 
 API_VERSION  = '3.0'
 
@@ -273,49 +277,56 @@ def startProcess(self, name, wait=True):
         except (NotExecutable, NoPermission) as why:
             raise RPCError(Faults.NOT_EXECUTABLE, why.args[0])
 
-        started = []
+        if process.get_state() in RUNNING_STATES:
+            raise RPCError(Faults.ALREADY_STARTED, name)
 
-        startsecs = process.config.startsecs
+        process.spawn()
 
-        def startit():
-            if not started:
+        # We call reap() in order to more quickly obtain the side effects of
+        # process.finish(), which reap() eventually ends up calling.  This
+        # might be the case if the spawn() was successful but then the process
+        # died before its startsecs elapsed or it exited with an unexpected
+        # exit code. In particular, finish() may set spawnerr, which we can
+        # check and immediately raise an RPCError, avoiding the need to
+        # defer by returning a callback.
 
-                if process.get_state() in RUNNING_STATES:
-                    raise RPCError(Faults.ALREADY_STARTED, name)
+        self.supervisord.reap()
 
-                process.spawn()
+        if process.spawnerr:
+            raise RPCError(Faults.SPAWN_ERROR, name)
 
+        # We call process.transition() in order to more quickly obtain its
+        # side effects.  In particular, it might set the process' state from
+        # STARTING->RUNNING if the process has a startsecs==0.
+        process.transition()
+
+        if wait and process.get_state() != ProcessStates.RUNNING:
+            # by default, this branch will almost always be hit for processes
+            # with default startsecs configurations, because the default number
+            # of startsecs for a process is "1", and the process will not have
+            # entered the RUNNING state yet even though we've called
+            # transition() on it.  This is because a process is not considered
+            # RUNNING until it has stayed up > startsecs.
+
+            def onwait():
                 if process.spawnerr:
                     raise RPCError(Faults.SPAWN_ERROR, name)
 
-                # we use a list here to fake out lexical scoping;
-                # using a direct assignment to 'started' in the
-                # function appears to not work (symptom: 2nd or 3rd
-                # call through, it forgets about 'started', claiming
-                # it's undeclared).
-                started.append(time.time())
+                state = process.get_state()
 
-            if not wait or not startsecs:
-                return True
+                if state not in (ProcessStates.STARTING, ProcessStates.RUNNING):
+                    raise RPCError(Faults.ABNORMAL_TERMINATION, name)
 
-            t = time.time()
-            runtime = (t - started[0])
-            state = process.get_state()
-
-            if state not in (ProcessStates.STARTING, ProcessStates.RUNNING):
-                raise RPCError(Faults.ABNORMAL_TERMINATION, name)
+                if state == ProcessStates.RUNNING:
+                    return True
 
-            if runtime < startsecs:
                 return NOT_DONE_YET
 
-            if state == ProcessStates.RUNNING:
-                return True
-
-            raise RPCError(Faults.ABNORMAL_TERMINATION, name)
+            onwait.delay = 0.05
+            onwait.rpcinterface = self
+            return onwait # deferred
 
-        startit.delay = 0.05
-        startit.rpcinterface = self
-        return startit # deferred
+        return True
 
     def startProcessGroup(self, name, wait=True):
         """ Start all processes in the group named 'name'
@@ -373,36 +384,43 @@ def stopProcess(self, name, wait=True):
             group_name, process_name = split_namespec(name)
             return self.stopProcessGroup(group_name, wait)
 
-        stopped = []
-        called  = []
-
-        def killit():
-            if not called:
-                if process.get_state() not in RUNNING_STATES:
-                    raise RPCError(Faults.NOT_RUNNING)
-                # use a mutable for lexical scoping; see startProcess
-                called.append(1)
-
-            if not stopped:
-                msg = process.stop()
-                if msg is not None:
-                    raise RPCError(Faults.FAILED, msg)
-                stopped.append(1)
-
-                if wait:
+        if process.get_state() not in RUNNING_STATES:
+            raise RPCError(Faults.NOT_RUNNING)
+
+        msg = process.stop()
+        if msg is not None:
+            raise RPCError(Faults.FAILED, msg)
+
+        # We'll try to reap any killed child. FWIW, reap calls waitpid, and
+        # then, if waitpid returns a pid, calls finish() on the process with
+        # that pid, which drains any I/O from the process' dispatchers and
+        # changes the process' state.  I chose to call reap without once=True
+        # because we don't really care if we reap more than one child.  Even if
+        # we only reap one child. we may not even be reaping the child that we
+        # just stopped (this is all async, and process.stop() may not work, and
+        # we'll need to wait for SIGKILL during process.transition() as the
+        # result of normal select looping).
+
+        self.supervisord.reap()
+
+        if wait and process.get_state() not in STOPPED_STATES:
+
+            def onwait():
+                # process will eventually enter a stopped state by
+                # virtue of the supervisord.reap() method being called
+                # during normal operations
+                self.supervisord.options.logger.info(
+                    'waiting for %s to stop' % process.config.name
+                    )
+                if process.get_state() not in STOPPED_STATES:
                     return NOT_DONE_YET
-                else:
-                    return True
-
-            if process.get_state() not in (ProcessStates.STOPPED,
-                                           ProcessStates.EXITED):
-                return NOT_DONE_YET
-            else:
                 return True
 
-        killit.delay = 0.2
-        killit.rpcinterface = self
-        return killit # deferred
+            onwait.delay = 0
+            onwait.rpcinterface = self
+            return onwait # deferred
+
+        return True
 
     def stopProcessGroup(self, name, wait=True):
         """ Stop all processes in the process group named 'name'
@@ -792,6 +810,7 @@ def allfunc(
         callbacks=callbacks, # used only to fool scoping, never passed by caller
         results=results, # used only to fool scoping, never passed by caller
         ):
+
         if not callbacks:
 
             for group, process in processes:
@@ -799,39 +818,47 @@ def allfunc(
                 if predicate(process):
                     try:
                         callback = func(name, **extra_kwargs)
-                        callbacks.append((group, process, callback))
                     except RPCError as e:
                         results.append({'name':process.config.name,
                                         'group':group.config.name,
                                         'status':e.code,
                                         'description':e.text})
                         continue
+                    if isinstance(callback, types.FunctionType):
+                        callbacks.append((group, process, callback))
+                    else:
+                        results.append(
+                            {'name':process.config.name,
+                             'group':group.config.name,
+                             'status':Faults.SUCCESS,
+                             'description':'OK'}
+                            )
 
         if not callbacks:
             return results
 
-        group, process, callback = callbacks.pop(0)
+        for struct in callbacks[:]:
 
-        try:
-            value = callback()
-        except RPCError as e:
-            results.append(
-                {'name':process.config.name,
-                 'group':group.config.name,
-                 'status':e.code,
-                 'description':e.text})
-            return NOT_DONE_YET
+            group, process, cb = struct
 
-        if value is NOT_DONE_YET:
-            # push it back into the queue; it will finish eventually
-            callbacks.append((group, process, callback))
-        else:
-            results.append(
-                {'name':process.config.name,
-                 'group':group.config.name,
-                 'status':Faults.SUCCESS,
-                 'description':'OK'}
-                )
+            try:
+                value = cb()
+            except RPCError as e:
+                results.append(
+                    {'name':process.config.name,
+                     'group':group.config.name,
+                     'status':e.code,
+                     'description':e.text})
+                value = None
+
+            if value is not NOT_DONE_YET:
+                results.append(
+                    {'name':process.config.name,
+                     'group':group.config.name,
+                     'status':Faults.SUCCESS,
+                     'description':'OK'}
+                    )
+                callbacks.remove(struct)
 
         if callbacks:
             return NOT_DONE_YET
diff --git a/supervisor/tests/base.py b/supervisor/tests/base.py
index c6321765b..1ecdd6895 100644
--- a/supervisor/tests/base.py
+++ b/supervisor/tests/base.py
@@ -1022,6 +1022,9 @@ def set_procattr(self, process_name, attr_name, val, group_name=None):
         process = self.process_groups[group_name].processes[process_name]
         setattr(process, attr_name, val)
 
+    def reap(self):
+        self.reaped = True
+
 class DummyDispatcher:
     write_event_handled = False
     read_event_handled = False
diff --git a/supervisor/tests/test_rpcinterfaces.py b/supervisor/tests/test_rpcinterfaces.py
index a0f479f4e..1f680ede1 100644
--- a/supervisor/tests/test_rpcinterfaces.py
+++ b/supervisor/tests/test_rpcinterfaces.py
@@ -316,9 +316,10 @@ def test_startProcess_already_started(self):
         supervisord = PopulatedDummySupervisor(options, 'foo', pconfig)
         supervisord.set_procattr('foo', 'pid', 10)
         interface = self._makeOne(supervisord)
-        callback = interface.startProcess('foo')
-        self._assertRPCError(xmlrpc.Faults.ALREADY_STARTED,
-                             callback)
+        self._assertRPCError(
+            xmlrpc.Faults.ALREADY_STARTED,
+            interface.startProcess, 'foo'
+            )
 
     def test_startProcess_bad_group_name(self):
         options = DummyOptions()
@@ -372,11 +373,13 @@ def test_startProcess_spawnerr(self):
         process = supervisord.process_groups['foo'].processes['foo']
         process.spawnerr = 'abc'
         interface = self._makeOne(supervisord)
-        callback = interface.startProcess('foo')
-        self._assertRPCError(xmlrpc.Faults.SPAWN_ERROR, callback)
+        self._assertRPCError(
+            xmlrpc.Faults.SPAWN_ERROR,
+            interface.startProcess,
+            'foo'
+            )
 
     def test_startProcess(self):
-        from supervisor import http
         options = DummyOptions()
         pconfig = DummyPConfig(options, 'foo', __file__, autostart=False,
                                startsecs=.01)
@@ -384,14 +387,11 @@ def test_startProcess(self):
         supervisord = PopulatedDummySupervisor(options, 'foo', pconfig)
         supervisord.set_procattr('foo', 'state', ProcessStates.STOPPED)
         interface = self._makeOne(supervisord)
-        callback = interface.startProcess('foo')
-        self.assertEqual(callback(), http.NOT_DONE_YET)
+        result = interface.startProcess('foo')
         process = supervisord.process_groups['foo'].processes['foo']
         self.assertEqual(process.spawned, True)
         self.assertEqual(interface.update_text, 'startProcess')
         process.state = ProcessStates.RUNNING
-        time.sleep(.02)
-        result = callback()
         self.assertEqual(result, True)
 
     def test_startProcess_nowait(self):