Make process-wrapper exit as soon as a timed-out process handles SIGTERM.

Before this change, passing --timeout=X --kill_delay=Y would cause the
process-wrapper to send a SIGTERM to the subprocess after X seconds... and
then unnecessarily wait for Y seconds to send a SIGKILL.  If the process
exited after X passed but before Y expired, we wouldn't notice.

The reason is that we were using kill(pid, 0) to check if the process had
already vanished after a SIGTERM, expecting this call to return an error
once the process is gone. The problem is: the process is never gone until
a wait() is done, as otherwise it remains as a zombie, and sending signals
to a zombie process doesn't return an error.

Given that we cannot wait in the signal handler, make the signal handler
install a second timer to send the SIGKILL once the Y seconds pass, and
then let the main logic in the process-wrapper await for the subprocess'
termination and finalization of the whole process tree.

Part of #7818.

RELNOTES: None.
PiperOrigin-RevId: 314905347
diff --git a/src/main/tools/process-tools.cc b/src/main/tools/process-tools.cc
index 985ff60..91ea2c9 100644
--- a/src/main/tools/process-tools.cc
+++ b/src/main/tools/process-tools.cc
@@ -33,6 +33,8 @@
 #include "src/main/protobuf/execution_statistics.pb.h"
 #include "src/main/tools/logging.h"
 
+static volatile sig_atomic_t child_pid_for_signal = 0;
+
 int SwitchToEuid() {
   int uid = getuid();
   int euid = geteuid();
@@ -80,20 +82,31 @@
   }
 }
 
+static void ForciblyKillEverything(int signo) {
+  // We don't update the last_signal field tracked in process-wrapper-legacy.cc,
+  // which is used to report the signal back to the user, because we want to
+  // know (from the caller side) the original signal that caused us to stop.
+  kill(-child_pid_for_signal, SIGKILL);
+}
+
 void KillEverything(pid_t pgrp, bool gracefully, double graceful_kill_delay) {
   if (gracefully) {
+    // TODO(jmmv): If we truly want to offer graceful termination, we should
+    // probably only send SIGTERM to the process group leader and allow it to
+    // decide what to do. Terminating its subprocesses out of its control might
+    // not have the right effect.
     kill(-pgrp, SIGTERM);
 
-    // Round up fractional seconds in this polling implementation.
-    int kill_delay = static_cast<int>(ceil(graceful_kill_delay));
-
-    // If the process is still alive, give it some time to die gracefully.
-    while (kill_delay-- > 0 && kill(-pgrp, 0) == 0) {
-      sleep(1);
-    }
+    // Previous versions of this code used to loop testing if the process had
+    // already died by sending it a 0 signal... but that loop would never
+    // terminate early because sending a signal to a zombie process succeeds
+    // (and we cannot collect the child's exit status here).
+    child_pid_for_signal = pgrp;
+    InstallSignalHandler(SIGALRM, ForciblyKillEverything);
+    SetTimeout(graceful_kill_delay);
+  } else {
+    kill(-pgrp, SIGKILL);
   }
-
-  kill(-pgrp, SIGKILL);
 }
 
 void InstallSignalHandler(int signum, void (*handler)(int)) {
diff --git a/src/test/shell/integration/process-wrapper_test.sh b/src/test/shell/integration/process-wrapper_test.sh
index 4aa4647..38d59ec 100755
--- a/src/test/shell/integration/process-wrapper_test.sh
+++ b/src/test/shell/integration/process-wrapper_test.sh
@@ -31,6 +31,8 @@
 readonly OUT="${OUT_DIR}/outfile"
 readonly ERR="${OUT_DIR}/errfile"
 
+readonly EXIT_STATUS_SIGABRT=$((128 + 6))
+readonly EXIT_STATUS_SIGKILL=$((128 + 9))
 readonly EXIT_STATUS_SIGALRM=$((128 + 14))
 
 function set_up() {
@@ -66,7 +68,7 @@
 function test_signal_death() {
   local code=0
   $process_wrapper --stdout=$OUT --stderr=$ERR /bin/sh -c 'kill -ABRT $$' &> $TEST_log || code=$?
-  assert_equals 134 "$code" # SIGNAL_BASE + SIGABRT = 128 + 6
+  assert_equals "${EXIT_STATUS_SIGABRT}" "$code"
 }
 
 function test_signal_catcher() {
@@ -79,35 +81,66 @@
 
 function test_basic_timeout() {
   $process_wrapper --timeout=1 --kill_delay=2 --stdout=$OUT --stderr=$ERR /bin/sh -c \
-    "echo before; sleep 10; echo after" &> $TEST_log && fail
+    "echo before; sleep 10; echo after" &> $TEST_log || code=$?
+  assert_equals "${EXIT_STATUS_SIGALRM}" "$code"
   assert_stdout "before"
 }
 
-# Tests that process_wrapper sends a SIGTERM to a process on timeout, but gives
-# it a grace period of 10 seconds before killing it with SIGKILL.
-# In this variant we expect the trap (that runs on SIGTERM) to exit within the
-# grace period, thus printing "beforeafter".
+# Tests that the timeout causes the process to receive a SIGTERM, but with the
+# process exiting on its own without the need for a SIGKILL. To make sure that
+# this is the case, we pass a very large kill delay to cause the outer test to
+# fail if we violate this expectation.
 function test_timeout_grace() {
   local code=0
-  $process_wrapper --timeout=1 --kill_delay=10 --stdout=$OUT --stderr=$ERR /bin/sh -c \
-    'trap "echo before; sleep 1; echo after; exit 0" INT TERM ALRM; sleep 10' \
+  $process_wrapper --timeout=1 --kill_delay=100000 --stdout=$OUT --stderr=$ERR \
+    /bin/sh -c \
+    'trap "echo ignoring signal" INT TERM ARLM; \
+     for i in $(seq 5); do sleep 1; done; echo after' \
     &> $TEST_log || code=$?
   assert_equals "${EXIT_STATUS_SIGALRM}" "$code"
-  assert_stdout 'before
+  assert_stdout 'ignoring signal
 after'
 }
 
-# Tests that process_wrapper sends a SIGTERM to a process on timeout, but gives
-# it a grace period of 2 seconds before killing it with SIGKILL.
-# In this variant, we expect the process to be killed with SIGKILL, because the
-# trap takes longer than the grace period, thus only printing "before".
-function test_timeout_kill() {
+# Tests that the timeout causes the process to receive a SIGTERM and waits until
+# the process has exited on its own, even if that takes a little bit of time. To
+# make sure that this is the case, we pass a very large kill delay to cause the
+# outer test to fail if we violate this expectation.
+#
+# In the past, even though we would terminate the process quickly, we would
+# get stuck until the kill delay passed (because we'd be stuck waiting for a
+# zombie process without us actually collecting it). So this is a regression
+# test for that subtle bug.
+function test_timeout_exits_as_soon_as_process_terminates() {
   local code=0
-  $process_wrapper --timeout=1 --kill_delay=2 --stdout=$OUT --stderr=$ERR /bin/sh -c \
-    'trap "echo before; sleep 10; echo after; exit 0" INT TERM ALRM; sleep 10' \
+  $process_wrapper --timeout=1 --kill_delay=100000 --stdout=$OUT --stderr=$ERR \
+    /bin/sh -c \
+    'trap "" INT TERM ARLM; \
+     for i in $(seq 5); do echo sleeping $i; sleep 1; done' \
     &> $TEST_log || code=$?
   assert_equals "${EXIT_STATUS_SIGALRM}" "$code"
-  assert_stdout "before"
+  assert_stdout 'sleeping 1
+sleeping 2
+sleeping 3
+sleeping 4
+sleeping 5'
+}
+
+# Tests that the timeout causes the process to receive a SIGTERM first and a
+# SIGKILL later once a kill delay has passed without the process exiting on
+# its own. We make the process get stuck indefinitely until killed, and we do
+# this with individual calls to sleep instead of a single one to ensure that a
+# single termination of the sleep subprocess doesn't cause us to spuriously
+# exit and thus pass this test.
+function test_timeout_kill() {
+  local code=0
+  $process_wrapper --timeout=1 --kill_delay=5 --stdout=$OUT --stderr=$ERR \
+    /bin/sh -c \
+    'trap "echo ignoring signal" INT TERM ARLM; \
+     while :; do sleep 1; done; echo after' \
+    &> $TEST_log || code=$?
+  assert_equals "${EXIT_STATUS_SIGALRM}" "$code"
+  assert_stdout 'ignoring signal'
 }
 
 function test_execvp_error_message() {