Skip to content

Commit

Permalink
work around new Popen.send_signal behavior in Python 3.9
Browse files Browse the repository at this point in the history
In Python 3.9, Popen.send_signal (which is called by Popen.kill, which
we call in SharedChild.kill) now calls Popen.poll first, as a
best-effort check to make sure the child's PID hasn't already been freed
for reuse. If the child has exited, this may reap the child. Previously
we didn't check for this, and we called os.waitid after kill assuming
the child was still avilable. Now in Python 3.9 this raises an uncaught
"no child found" exception. It's a race condition that only triggers if
the child has exited before kill is called. We got lucky and caught this
crash in a doctest in CI. An example failing run:
https://github.com/oconnor663/duct.py/runs/1488376578

The upstream bug tracking the send_signal change is
https://bugs.python.org/issue38630.

This commit does a few things:

1. Add a test that reliably triggers the crash.
2. Read Popen.returncode directly, instead of tracking our own
   SharedChild._status. This makes the test pass, but it's unclear
   whether it's robust to all possible race conditions.
3. Stop calling Popen.kill, and instead reimplement its previous
   non-polling behavior. Technically this means we don't need the
   returncode fix above, but the code is simpler that way anyway, and
   it's some insurance against more changes like this in future Python
   versions.
  • Loading branch information
oconnor663-zoom committed Dec 3, 2020
1 parent 07010e2 commit 5dfae70
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 24 deletions.
70 changes: 46 additions & 24 deletions duct.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
import io
import os
import shutil
import signal
import subprocess
import threading

Expand Down Expand Up @@ -1118,6 +1119,10 @@ def maybe_canonicalize_exe_path(exe_name, iocontext):
popen_lock = threading.Lock()


def is_windows():
return os.name == "nt"


# This wrapper works around two major deadlock issues to do with pipes. The
# first is that, before Python 3.2 on POSIX systems, os.pipe() creates
# inheritable file descriptors, which leak to all child processes and prevent
Expand All @@ -1139,7 +1144,7 @@ def maybe_canonicalize_exe_path(exe_name, iocontext):
# subprocess.Popen. That type works around another race condition to do with
# signaling children.
def safe_popen(*args, **kwargs):
close_fds = (os.name != 'nt')
close_fds = not is_windows()
with popen_lock:
return SharedChild(*args, close_fds=close_fds, **kwargs)

Expand All @@ -1164,7 +1169,7 @@ def encode_with_universal_newlines(s):
# removals in that copy won't interact properly with the inherited parent
# environment.
def convert_env_var_name(var):
if os.name == 'nt':
if is_windows():
return var.upper()
return var

Expand Down Expand Up @@ -1194,26 +1199,24 @@ def convert_env_var_name(var):
class SharedChild:
def __init__(self, *args, **kwargs):
self._child = subprocess.Popen(*args, **kwargs)
self._status = None
# The status lock is only held long enough to read or write the status,
# or to make non-blocking calls like Popen.poll(). Threads making a
# blocking call to os.waitid() release the status lock first. This
# The child lock is only held for non-blocking calls. Threads making a
# blocking call to os.waitid() release the child lock first. This
# ensures that one thread can call try_wait() while another thread is
# blocked on wait().
self._status_lock = threading.Lock()
self._child_lock = threading.Lock()
self._wait_lock = threading.Lock()

def wait(self):
with self._wait_lock:
# See if another thread already waited. If so, return the status we
# got before. If not, immediately release the status lock, and move
# got before. If not, immediately release the child lock, and move
# on to call wait ourselves.
with self._status_lock:
if self._status is not None:
return self._status
with self._child_lock:
if self._child.returncode is not None:
return self._child.returncode

# No other thread has waited, we're holding the wait lock, and
# we've released the status lock. It's now our job to wait. As
# we've released the child lock. It's now our job to wait. As
# documented above, if os.waitid is defined, use that function to
# await the child without reaping it. Otherwise we do an ordinary
# Popen.wait and accept the race condition on some platforms.
Expand All @@ -1225,20 +1228,20 @@ def wait(self):
# races with kill(), which is what all of this is about.
self._child.wait()

# Finally, while still holding the wait lock, re-acquire the status
# Finally, while still holding the wait lock, re-acquire the child
# lock to reap the child and write the result. Since we know the
# child has already exited, this won't block. Any other waiting
# threads that were blocked on us will see our result.
with self._status_lock:
with self._child_lock:
# If the child was already reaped above in the !HAS_WAITID
# branch, this will just return the same status again.
self._status = self._child.wait()
return self._status
# branch, this second wait will be a no-op with a cached
# returncode.
return self._child.wait()

def try_wait(self):
with self._status_lock:
if self._status is not None:
return self._status
with self._child_lock:
if self._child.returncode is not None:
return self._child.returncode

# The child hasn't been waited on yet, so we need to do a
# non-blocking check to see if it's still running. The Popen type
Expand All @@ -1256,16 +1259,35 @@ def try_wait(self):

# If either of the poll approaches above returned non-None, do a full
# wait to reap the child, which will not block. Note that we've
# released the status lock here, because wait() will re-acquire it.
# released the child lock here, because wait() will re-acquire it.
if poll_result is not None:
return self.wait()
else:
return None

def kill(self):
with self._status_lock:
if self._status is None:
self._child.kill()
with self._child_lock:
if self._child.returncode is None:
# Previously we just used Popen.kill here. However, as of Python 3.9,
# Popen.send_signal (which is called by Popen.kill) calls Popen.poll first, as a
# best-effort check for the same PID race that this class is designed around. That
# means that if the child has already exited, Popen.kill will reap it. Now that we
# check Popen.returncode throughout this class (as of the same commit that adds this
# comment), we'll see the non-None exit status there as a side effect if reaping has
# happened. That *might* mean we could still call Popen.kill here safely. However,
# there's also the question of how Popen.poll's call to os.waitpid would interact
# with our own blocking call to os.waitid from another thread. The worry is that the
# waitpid call might take effect first, causing waitid to return a "no child found"
# error. I can confirm that happens on Linux when both calls are blocking. Here
# though, the waitpid call is non-blocking, which *might* mean it can't happen
# first, but that's going to depend on the OS. We could assume that it can happen
# and try to catch the error from waitid, but that codepath would be impossible to
# test. So what we actually do here is reimplement the documented behavior of
# Popen.kill: os.kill(pid, SIGKILL) on Unix, and Popen.terminate on Windows.
if is_windows():
self._child.terminate()
else:
os.kill(self._child.pid, signal.SIGKILL)

def pid(self):
return self._child.pid
Expand Down
27 changes: 27 additions & 0 deletions test_duct.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,3 +694,30 @@ def test_pids():
assert type(reader.pids()[0]) is int
assert type(reader.pids()[1]) is int
reader.read()


# This test was added after the release of Python 3.9, which included a
# behavior change that caused a crash in this case. There wasn't previously a
# explicit test for this, but I got lucky and one of the doctests hit it.
# (Example run: https://github.com/oconnor663/duct.py/runs/1488376578)
#
# The behavior change in Python 3.9 is that Popen.send_signal (which is called
# by Popen.kill, which we call in SharedChild.kill) now calls Popen.poll first,
# as a best-effort check to make sure the child's PID hasn't already been freed
# for reuse. If the child has not yet exited, this is effectively no different
# from before. However, if the child has exited, this may reap the child, which
# was not previously possible. This test guarantees that the child has exited
# before kill, and then makes sure kill doesn't crash.
def test_kill_after_child_exit():
# Create a child process and wait for it to exit, without actually waiting
# on it and reaping it, by reading its output. We can't use the .read()
# method for this, because that would actually wait on it and reap it, so
# we create our own pipe manually.
pipe_reader, pipe_writer = os.pipe()
handle = echo_cmd("hi").stdout_file(pipe_writer).start()
os.close(pipe_writer)
reader_file = os.fdopen(pipe_reader)
assert reader_file.read() == "hi\n"

# The child has exited. Now just test that kill doesn't crash.
handle.kill()

0 comments on commit 5dfae70

Please sign in to comment.