| // Copyright 2016 The Bazel Authors. All rights reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| /** |
| * linux-sandbox runs commands in a restricted environment where they are |
| * subject to a few rules: |
| * |
| * - The entire filesystem is made read-only. |
| * - The working directory (-W) will be made read-write, though. |
| * - Individual files or directories can be made writable (but not deletable) |
| * (-w). |
| * - If the process takes longer than the timeout (-T), it will be killed with |
| * SIGTERM. If it does not exit within the grace period (-t), it and all of |
| * its children will be killed with SIGKILL. |
| * - tmpfs can be mounted on top of existing directories (-e). |
| * - If option -R is passed, the process will run as user 'root'. |
| * - If option -U is passed, the process will run as user 'nobody'. |
| * - Otherwise, the process runs using the current uid / gid. |
| * - If linux-sandbox itself gets killed, the process and all of its children |
| * will be killed. |
| * - If linux-sandbox's parent dies, it will kill itself, the process and all |
| * the children. |
| * - Network access is allowed, but can be disabled via -N. |
| * - The hostname and domainname will be set to "sandbox". |
| * - The process runs in its own PID namespace, so other processes on the |
| * system are invisible. |
| */ |
| |
| #include "src/main/tools/linux-sandbox.h" |
| |
| #include <ctype.h> |
| #include <dirent.h> |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <math.h> |
| #include <sched.h> |
| #include <signal.h> |
| #include <stdbool.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/prctl.h> |
| #include <sys/resource.h> |
| #include <sys/stat.h> |
| #include <sys/time.h> |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| #include <unistd.h> |
| |
| #include <atomic> |
| #include <string> |
| #include <vector> |
| |
| #include "src/main/tools/linux-sandbox-options.h" |
| #include "src/main/tools/linux-sandbox-pid1.h" |
| #include "src/main/tools/logging.h" |
| #include "src/main/tools/process-tools.h" |
| |
// uid / gid of the invoking user. main records them (via getuid / getgid)
// before the sandboxed child is spawned.
uid_t global_outer_uid;
gid_t global_outer_gid;

// PID of the process we spawned. Stored atomically because the signal handlers
// read it.
static std::atomic<pid_t> global_child_pid{0};

// PID of our parent as observed by main at startup; compared against getppid()
// later to notice that the original parent has exited.
pid_t initial_ppid;

// Non-zero while the child is still owed a polite SIGTERM before it gets a
// SIGKILL (once we want it to exit). Only ever holds zero or one.
static std::atomic<int> global_need_polite_sigterm{0};

#if __cplusplus >= 201703L
// Both atomics are touched from a signal handler, so they have to be lock-free.
static_assert(std::atomic<pid_t>::is_always_lock_free);
static_assert(std::atomic<int>::is_always_lock_free);
#endif
| |
| // Make sure the child process does not inherit any accidentally left open file |
| // handles from our parent. |
| static void CloseFds() { |
| DIR *fds = opendir("/proc/self/fd"); |
| if (fds == nullptr) { |
| DIE("opendir"); |
| } |
| |
| while (1) { |
| errno = 0; |
| struct dirent *dent = readdir(fds); |
| |
| if (dent == nullptr) { |
| if (errno != 0) { |
| DIE("readdir"); |
| } |
| break; |
| } |
| |
| if (isdigit(dent->d_name[0])) { |
| errno = 0; |
| int fd = strtol(dent->d_name, nullptr, 10); |
| |
| // (1) Skip unparseable entries. |
| // (2) Close everything except stdin, stdout, stderr and debug output. |
| // (3) Do not accidentally close our directory handle. |
| if (errno == 0 && fd > STDERR_FILENO && |
| (global_debug == NULL || fd != fileno(global_debug)) && |
| fd != dirfd(fds)) { |
| if (close(fd) < 0) { |
| DIE("close"); |
| } |
| } |
| } |
| } |
| |
| if (closedir(fds) < 0) { |
| DIE("closedir"); |
| } |
| } |
| |
| static void MaybeAddChildProcessToCgroup(const pid_t pid) { |
| for (const std::string &cgroups_dir : opt.cgroups_dirs) { |
| PRINT_DEBUG("Adding process %d to cgroups dir %s", pid, |
| cgroups_dir.c_str()); |
| WriteFile(cgroups_dir + "/cgroup.procs", "%d", pid); |
| } |
| } |
| |
| static void OnTimeoutOrTerm(int) { |
| // Find the PID of the child, which main set up before installing us as a |
| // signal handler. |
| const pid_t child_pid = global_child_pid.load(std::memory_order_relaxed); |
| |
| // Figure out whether we should send a SIGTERM here. If so, we won't want to |
| // next time we're called. |
| const bool need_polite_sigterm = |
| global_need_polite_sigterm.fetch_and(0, std::memory_order_relaxed); |
| |
| // If we're not supposed to ask politely, simply forcibly kill the child. |
| if (!need_polite_sigterm) { |
| kill(child_pid, SIGKILL); |
| return; |
| } |
| |
| // Otherwise make a polite request, then arrange to be called again after a |
| // delay, at which point we'll send SIGKILL. |
| // |
| // Note that main sets us up as the signal handler for SIGALRM, and arranges |
| // for this code path to be taken only if kill_delay_secs > 0. |
| kill(child_pid, SIGTERM); |
| alarm(opt.kill_delay_secs); |
| } |
| |
| static pid_t SpawnPid1() { |
| const int kStackSize = 1024 * 1024; |
| std::vector<char> child_stack(kStackSize); |
| |
| PRINT_DEBUG("calling pipe(2)..."); |
| |
| int pipe_from_child[2], pipe_to_child[2]; |
| if (pipe(pipe_from_child) < 0) { |
| DIE("pipe"); |
| } |
| if (pipe(pipe_to_child) < 0) { |
| DIE("pipe"); |
| } |
| |
| int clone_flags = |
| CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWPID | SIGCHLD; |
| PRINT_DEBUG("Netns is %d", opt.create_netns); |
| if (opt.create_netns != NO_NETNS) { |
| clone_flags |= CLONE_NEWNET; |
| } |
| if (opt.fake_hostname) { |
| clone_flags |= CLONE_NEWUTS; |
| } |
| |
| // We use clone instead of unshare, because unshare sometimes fails with |
| // EINVAL due to a race condition in the Linux kernel (see |
| // https://lkml.org/lkml/2015/7/28/833). |
| PRINT_DEBUG("calling clone(2)..."); |
| |
| Pid1Args pid1Args; |
| pid1Args.pipe_to_parent = pipe_from_child; |
| pid1Args.pipe_from_parent = pipe_to_child; |
| const pid_t child_pid = clone(Pid1Main, child_stack.data() + kStackSize, |
| clone_flags, &pid1Args); |
| |
| if (child_pid < 0) { |
| DIE("clone"); |
| } |
| |
| MaybeAddChildProcessToCgroup(child_pid); |
| // Signal the child that it can now proceed to spawn pid2. |
| SignalPipe(pipe_to_child); |
| |
| PRINT_DEBUG("linux-sandbox-pid1 has PID %d", child_pid); |
| |
| // Wait for a signal from the child linux-sandbox-pid1 process; this proves to |
| // the child process that we still existed after it ran |
| // prctl(PR_SET_PDEATHSIG, SIGKILL), thus preventing a race condition where |
| // the parent is killed before that call was made. |
| WaitPipe(pipe_from_child); |
| |
| PRINT_DEBUG("done manipulating pipes"); |
| |
| return child_pid; |
| } |
| |
| static int WaitForPid1(const pid_t child_pid) { |
| // Wait for the child to exit, obtaining usage information. Restart in the |
| // case of a signal interrupting us. |
| int child_status; |
| struct rusage child_rusage; |
| while (true) { |
| const int ret = wait4(child_pid, &child_status, 0, &child_rusage); |
| if (ret > 0) { |
| break; |
| } |
| |
| // We've been handed off to a reaper process and should die. |
| if (getppid() != initial_ppid) { |
| break; |
| } |
| |
| if (errno == EINTR) { |
| continue; |
| } |
| |
| DIE("wait4"); |
| } |
| |
| // If we're supposed to write stats to a file, do so now. |
| if (!opt.stats_path.empty()) { |
| WriteStatsToFile(&child_rusage, opt.stats_path); |
| } |
| |
| // We want to exit in the same manner as the child. |
| if (WIFSIGNALED(child_status)) { |
| const int signal = WTERMSIG(child_status); |
| PRINT_DEBUG("child exited due to receiving signal: %s", strsignal(signal)); |
| return 128 + signal; |
| } |
| |
| const int exit_code = WEXITSTATUS(child_status); |
| PRINT_DEBUG("child exited normally with code %d", exit_code); |
| return exit_code; |
| } |
| |
| int main(int argc, char *argv[]) { |
| // Ask the kernel to kill us with SIGKILL if our parent dies. |
| if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) { |
| DIE("prctl"); |
| } |
| |
| // Parse our command-line options. |
| ParseOptions(argc, argv); |
| |
| // Open the file PRINT_DEBUG writes to. |
| // Must happen early enough so we don't lose any debugging output. |
| if (!opt.debug_path.empty()) { |
| global_debug = fopen(opt.debug_path.c_str(), "w"); |
| if (!global_debug) { |
| DIE("fopen(%s)", opt.debug_path.c_str()); |
| } |
| } |
| |
| // Start with default signal actions and a clear signal mask. |
| ClearSignalMask(); |
| |
| // Ignore SIGTTIN and SIGTTOU, as we hand off the terminal to the child in |
| // SpawnChild. |
| IgnoreSignal(SIGTTIN); |
| IgnoreSignal(SIGTTOU); |
| |
| // Remember the parent pid so we can exit if the parent has exited. |
| // Doing this before prctl(PR_SET_PDEATHDIG, 0) ensures no race condition. |
| initial_ppid = getppid(); |
| |
| if (opt.persistent_process) { |
| if (prctl(PR_SET_PDEATHSIG, 0) < 0) { |
| DIE("prctl"); |
| } |
| } |
| |
| // Redirect output as requested. |
| Redirect(opt.stdout_path, STDOUT_FILENO); |
| Redirect(opt.stderr_path, STDERR_FILENO); |
| |
| // Set up two globals used by the child process. |
| global_outer_uid = getuid(); |
| global_outer_gid = getgid(); |
| |
| // Ensure we don't pass on any FDs from our parent to our child other than |
| // stdin, stdout, stderr and global_debug. |
| CloseFds(); |
| |
| // Spawn the child that will fork the sandboxed program with fresh |
| // namespaces etc. |
| const pid_t child_pid = SpawnPid1(); |
| |
| // Let the signal handlers installed below know the PID of the child. |
| global_child_pid.store(child_pid, std::memory_order_relaxed); |
| |
| // If a kill delay has been configured, let the signal handlers installed |
| // below know that it needs to be respected. |
| if (opt.kill_delay_secs > 0) { |
| global_need_polite_sigterm.store(1, std::memory_order_relaxed); |
| } |
| |
| // OnTimeoutOrTerm, which is used for other signals below, assumes that it |
| // handles SIGALRM. We also explicitly invoke it after the timeout using |
| // alarm(2). |
| InstallSignalHandler(SIGALRM, OnTimeoutOrTerm); |
| |
| // If requested, arrange for the child to be killed (optionally after being |
| // asked politely to terminate) once the timeout expires. |
| // |
| // Note that it's important to set this up before support for SIGTERM and |
| // SIGINT. Otherwise one of those signals could arrive before we get here, |
| // and then we would reset its opt.kill_delay_secs interval timer. |
| if (opt.timeout_secs > 0) { |
| alarm(opt.timeout_secs); |
| } |
| |
| // Also ask/tell the child to quit on SIGTERM, and optionally for SIGINT |
| // too. |
| InstallSignalHandler(SIGTERM, OnTimeoutOrTerm); |
| if (opt.sigint_sends_sigterm) { |
| InstallSignalHandler(SIGINT, OnTimeoutOrTerm); |
| } |
| |
| // Wait for the child to exit, returning an appropriate status. |
| return WaitForPid1(child_pid); |
| } |