Make process-wrapper exit as soon as a timed-out process handles SIGTERM.
Before this change, passing --timeout=X --kill_delay=Y would cause the
process-wrapper to send a SIGTERM to the subprocess after X seconds... and
then unnecessarily wait for Y seconds to send a SIGKILL. If the process
exited after X passed but before Y expired, we wouldn't notice.
The reason is that we were using kill(pid, 0) to check if the process had
already vanished after a SIGTERM, expecting this call to return an error
once the process is gone. The problem is: the process is never gone until
a wait() is done, as otherwise it remains as a zombie, and sending signals
to a zombie process doesn't return an error.
Given that we cannot wait in the signal handler, make the signal handler
install a second timer to send the SIGKILL once the Y seconds pass, and
then let the main logic in the process-wrapper await for the subprocess'
termination and finalization of the whole process tree.
Part of #7818.
RELNOTES: None.
PiperOrigin-RevId: 314905347
diff --git a/src/main/tools/process-tools.cc b/src/main/tools/process-tools.cc
index 985ff60..91ea2c9 100644
--- a/src/main/tools/process-tools.cc
+++ b/src/main/tools/process-tools.cc
@@ -33,6 +33,8 @@
#include "src/main/protobuf/execution_statistics.pb.h"
#include "src/main/tools/logging.h"
+static volatile sig_atomic_t child_pid_for_signal = 0;
+
int SwitchToEuid() {
int uid = getuid();
int euid = geteuid();
@@ -80,20 +82,31 @@
}
}
+static void ForciblyKillEverything(int signo) {
+ // We don't update the last_signal field tracked in process-wrapper-legacy.cc,
+ // which is used to report the signal back to the user, because we want to
+ // know (from the caller side) the original signal that caused us to stop.
+ kill(-child_pid_for_signal, SIGKILL);
+}
+
void KillEverything(pid_t pgrp, bool gracefully, double graceful_kill_delay) {
if (gracefully) {
+ // TODO(jmmv): If we truly want to offer graceful termination, we should
+ // probably only send SIGTERM to the process group leader and allow it to
+ // decide what to do. Terminating its subprocesses out of its control might
+ // not have the right effect.
kill(-pgrp, SIGTERM);
- // Round up fractional seconds in this polling implementation.
- int kill_delay = static_cast<int>(ceil(graceful_kill_delay));
-
- // If the process is still alive, give it some time to die gracefully.
- while (kill_delay-- > 0 && kill(-pgrp, 0) == 0) {
- sleep(1);
- }
+ // Previous versions of this code used to loop testing if the process had
+ // already died by sending it a 0 signal... but that loop would never
+ // terminate early because sending a signal to a zombie process succeeds
+ // (and we cannot collect the child's exit status here).
+ child_pid_for_signal = pgrp;
+ InstallSignalHandler(SIGALRM, ForciblyKillEverything);
+ SetTimeout(graceful_kill_delay);
+ } else {
+ kill(-pgrp, SIGKILL);
}
-
- kill(-pgrp, SIGKILL);
}
void InstallSignalHandler(int signum, void (*handler)(int)) {
diff --git a/src/test/shell/integration/process-wrapper_test.sh b/src/test/shell/integration/process-wrapper_test.sh
index 4aa4647..38d59ec 100755
--- a/src/test/shell/integration/process-wrapper_test.sh
+++ b/src/test/shell/integration/process-wrapper_test.sh
@@ -31,6 +31,8 @@
readonly OUT="${OUT_DIR}/outfile"
readonly ERR="${OUT_DIR}/errfile"
+readonly EXIT_STATUS_SIGABRT=$((128 + 6))
+readonly EXIT_STATUS_SIGKILL=$((128 + 9))
readonly EXIT_STATUS_SIGALRM=$((128 + 14))
function set_up() {
@@ -66,7 +68,7 @@
function test_signal_death() {
local code=0
$process_wrapper --stdout=$OUT --stderr=$ERR /bin/sh -c 'kill -ABRT $$' &> $TEST_log || code=$?
- assert_equals 134 "$code" # SIGNAL_BASE + SIGABRT = 128 + 6
+ assert_equals "${EXIT_STATUS_SIGABRT}" "$code"
}
function test_signal_catcher() {
@@ -79,35 +81,66 @@
function test_basic_timeout() {
$process_wrapper --timeout=1 --kill_delay=2 --stdout=$OUT --stderr=$ERR /bin/sh -c \
- "echo before; sleep 10; echo after" &> $TEST_log && fail
+ "echo before; sleep 10; echo after" &> $TEST_log || code=$?
+ assert_equals "${EXIT_STATUS_SIGALRM}" "$code"
assert_stdout "before"
}
-# Tests that process_wrapper sends a SIGTERM to a process on timeout, but gives
-# it a grace period of 10 seconds before killing it with SIGKILL.
-# In this variant we expect the trap (that runs on SIGTERM) to exit within the
-# grace period, thus printing "beforeafter".
+# Tests that the timeout causes the process to receive a SIGTERM, but with the
+# process exiting on its own without the need for a SIGKILL. To make sure that
+# this is the case, we pass a very large kill delay to cause the outer test to
+# fail if we violate this expectation.
function test_timeout_grace() {
local code=0
- $process_wrapper --timeout=1 --kill_delay=10 --stdout=$OUT --stderr=$ERR /bin/sh -c \
- 'trap "echo before; sleep 1; echo after; exit 0" INT TERM ALRM; sleep 10' \
+ $process_wrapper --timeout=1 --kill_delay=100000 --stdout=$OUT --stderr=$ERR \
+ /bin/sh -c \
+ 'trap "echo ignoring signal" INT TERM ARLM; \
+ for i in $(seq 5); do sleep 1; done; echo after' \
&> $TEST_log || code=$?
assert_equals "${EXIT_STATUS_SIGALRM}" "$code"
- assert_stdout 'before
+ assert_stdout 'ignoring signal
after'
}
-# Tests that process_wrapper sends a SIGTERM to a process on timeout, but gives
-# it a grace period of 2 seconds before killing it with SIGKILL.
-# In this variant, we expect the process to be killed with SIGKILL, because the
-# trap takes longer than the grace period, thus only printing "before".
-function test_timeout_kill() {
+# Tests that the timeout causes the process to receive a SIGTERM and waits until
+# the process has exited on its own, even if that takes a little bit of time. To
+# make sure that this is the case, we pass a very large kill delay to cause the
+# outer test to fail if we violate this expectation.
+#
+# In the past, even though we would terminate the process quickly, we would
+# get stuck until the kill delay passed (because we'd be stuck waiting for a
+# zombie process without us actually collecting it). So this is a regression
+# test for that subtle bug.
+function test_timeout_exits_as_soon_as_process_terminates() {
local code=0
- $process_wrapper --timeout=1 --kill_delay=2 --stdout=$OUT --stderr=$ERR /bin/sh -c \
- 'trap "echo before; sleep 10; echo after; exit 0" INT TERM ALRM; sleep 10' \
+ $process_wrapper --timeout=1 --kill_delay=100000 --stdout=$OUT --stderr=$ERR \
+ /bin/sh -c \
+ 'trap "" INT TERM ARLM; \
+ for i in $(seq 5); do echo sleeping $i; sleep 1; done' \
&> $TEST_log || code=$?
assert_equals "${EXIT_STATUS_SIGALRM}" "$code"
- assert_stdout "before"
+ assert_stdout 'sleeping 1
+sleeping 2
+sleeping 3
+sleeping 4
+sleeping 5'
+}
+
+# Tests that the timeout causes the process to receive a SIGTERM first and a
+# SIGKILL later once a kill delay has passed without the process exiting on
+# its own. We make the process get stuck indefinitely until killed, and we do
+# this with individual calls to sleep instead of a single one to ensure that a
+# single termination of the sleep subprocess doesn't cause us to spuriously
+# exit and thus pass this test.
+function test_timeout_kill() {
+ local code=0
+ $process_wrapper --timeout=1 --kill_delay=5 --stdout=$OUT --stderr=$ERR \
+ /bin/sh -c \
+ 'trap "echo ignoring signal" INT TERM ARLM; \
+ while :; do sleep 1; done; echo after' \
+ &> $TEST_log || code=$?
+ assert_equals "${EXIT_STATUS_SIGALRM}" "$code"
+ assert_stdout 'ignoring signal'
}
function test_execvp_error_message() {