|  | // Copyright 2016 The Bazel Authors. All rights reserved. | 
|  | // | 
|  | // Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | // you may not use this file except in compliance with the License. | 
|  | // You may obtain a copy of the License at | 
|  | // | 
|  | //    http://www.apache.org/licenses/LICENSE-2.0 | 
|  | // | 
|  | // Unless required by applicable law or agreed to in writing, software | 
|  | // distributed under the License is distributed on an "AS IS" BASIS, | 
|  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | // See the License for the specific language governing permissions and | 
|  | // limitations under the License. | 
|  |  | 
|  | /** | 
|  | * linux-sandbox runs commands in a restricted environment where they are | 
|  | * subject to a few rules: | 
|  | * | 
|  | *  - The entire filesystem is made read-only. | 
|  | *  - The working directory (-W) will be made read-write, though. | 
|  | *  - Individual files or directories can be made writable (but not deletable) | 
|  | *    (-w). | 
|  | *  - If the process takes longer than the timeout (-T), it will be killed with | 
|  | *    SIGTERM. If it does not exit within the grace period (-t), it and all of | 
|  | *    its children will be killed with SIGKILL. | 
|  | *  - tmpfs can be mounted on top of existing directories (-e). | 
|  | *  - If option -R is passed, the process will run as user 'root'. | 
|  | *  - If option -U is passed, the process will run as user 'nobody'. | 
|  | *  - Otherwise, the process runs using the current uid / gid. | 
|  | *  - If linux-sandbox itself gets killed, the process and all of its children | 
|  | *    will be killed. | 
|  | *  - If linux-sandbox's parent dies, it will kill itself, the process and all | 
|  | *    the children. | 
|  | *  - Network access is allowed, but can be disabled via -N. | 
|  | *  - The hostname and domainname will be set to "sandbox". | 
|  | *  - The process runs in its own PID namespace, so other processes on the | 
|  | *    system are invisible. | 
|  | */ | 
|  |  | 
|  | #include "src/main/tools/linux-sandbox.h" | 
|  |  | 
|  | #include <ctype.h> | 
|  | #include <dirent.h> | 
|  | #include <errno.h> | 
|  | #include <fcntl.h> | 
|  | #include <math.h> | 
|  | #include <sched.h> | 
|  | #include <signal.h> | 
|  | #include <stdbool.h> | 
|  | #include <stdio.h> | 
|  | #include <stdlib.h> | 
|  | #include <string.h> | 
|  | #include <sys/prctl.h> | 
|  | #include <sys/resource.h> | 
|  | #include <sys/stat.h> | 
|  | #include <sys/time.h> | 
|  | #include <sys/types.h> | 
|  | #include <sys/wait.h> | 
|  | #include <unistd.h> | 
|  |  | 
|  | #include <atomic> | 
|  | #include <vector> | 
|  |  | 
|  | #include "src/main/tools/linux-sandbox-options.h" | 
|  | #include "src/main/tools/linux-sandbox-pid1.h" | 
|  | #include "src/main/tools/logging.h" | 
|  | #include "src/main/tools/process-tools.h" | 
|  |  | 
// User and group IDs this process runs with outside the sandbox. main fills
// them in (from getuid / getgid) before the child is spawned, for use by the
// child process.
uid_t global_outer_uid;
gid_t global_outer_gid;

// PID of the spawned child. Written once by main and read by the signal
// handler, hence atomic.
static std::atomic<pid_t> global_child_pid{0};

// Flag that only ever holds 0 or 1: when 1, the child is first asked to exit
// with SIGTERM before SIGKILL is used. The signal handler clears it the first
// time it runs.
static std::atomic<int> global_need_polite_sigterm{0};

// Both atomics are accessed from a signal handler, so insist at compile time
// (where the language version allows it) that they never need a lock.
#if __cplusplus >= 201703L
static_assert(global_child_pid.is_always_lock_free);
static_assert(global_need_polite_sigterm.is_always_lock_free);
#endif
|  |  | 
|  | // Make sure the child process does not inherit any accidentally left open file | 
|  | // handles from our parent. | 
|  | static void CloseFds() { | 
|  | DIR *fds = opendir("/proc/self/fd"); | 
|  | if (fds == nullptr) { | 
|  | DIE("opendir"); | 
|  | } | 
|  |  | 
|  | while (1) { | 
|  | errno = 0; | 
|  | struct dirent *dent = readdir(fds); | 
|  |  | 
|  | if (dent == nullptr) { | 
|  | if (errno != 0) { | 
|  | DIE("readdir"); | 
|  | } | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (isdigit(dent->d_name[0])) { | 
|  | errno = 0; | 
|  | int fd = strtol(dent->d_name, nullptr, 10); | 
|  |  | 
|  | // (1) Skip unparseable entries. | 
|  | // (2) Close everything except stdin, stdout and stderr. | 
|  | // (3) Do not accidentally close our directory handle. | 
|  | if (errno == 0 && fd > STDERR_FILENO && fd != dirfd(fds)) { | 
|  | if (close(fd) < 0) { | 
|  | DIE("close"); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | if (closedir(fds) < 0) { | 
|  | DIE("closedir"); | 
|  | } | 
|  | } | 
|  |  | 
|  | static void OnTimeoutOrTerm(int) { | 
|  | // Find the PID of the child, which main set up before installing us as a | 
|  | // signal handler. | 
|  | const pid_t child_pid = global_child_pid.load(std::memory_order_relaxed); | 
|  |  | 
|  | // Figure out whether we should send a SIGTERM here. If so, we won't want to | 
|  | // next time we're called. | 
|  | const bool need_polite_sigterm = | 
|  | global_need_polite_sigterm.fetch_and(0, std::memory_order_relaxed); | 
|  |  | 
|  | // If we're not supposed to ask politely, simply forcibly kill the child. | 
|  | if (!need_polite_sigterm) { | 
|  | kill(child_pid, SIGKILL); | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Otherwise make a polite request, then arrange to be called again after a | 
|  | // delay, at which point we'll send SIGKILL. | 
|  | // | 
|  | // Note that main sets us up as the signal handler for SIGALRM, and arranges | 
|  | // for this code path to be taken only if kill_delay_secs > 0. | 
|  | kill(child_pid, SIGTERM); | 
|  | alarm(opt.kill_delay_secs); | 
|  | } | 
|  |  | 
|  | static pid_t SpawnPid1() { | 
|  | const int kStackSize = 1024 * 1024; | 
|  | std::vector<char> child_stack(kStackSize); | 
|  |  | 
|  | PRINT_DEBUG("calling pipe(2)..."); | 
|  |  | 
|  | int sync_pipe[2]; | 
|  | if (pipe(sync_pipe) < 0) { | 
|  | DIE("pipe"); | 
|  | } | 
|  |  | 
|  | int clone_flags = | 
|  | CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWPID | SIGCHLD; | 
|  | if (opt.create_netns) { | 
|  | clone_flags |= CLONE_NEWNET; | 
|  | } | 
|  | if (opt.fake_hostname) { | 
|  | clone_flags |= CLONE_NEWUTS; | 
|  | } | 
|  |  | 
|  | // We use clone instead of unshare, because unshare sometimes fails with | 
|  | // EINVAL due to a race condition in the Linux kernel (see | 
|  | // https://lkml.org/lkml/2015/7/28/833). | 
|  | PRINT_DEBUG("calling clone(2)..."); | 
|  |  | 
|  | const pid_t child_pid = | 
|  | clone(Pid1Main, child_stack.data() + kStackSize, clone_flags, sync_pipe); | 
|  |  | 
|  | if (child_pid < 0) { | 
|  | DIE("clone"); | 
|  | } | 
|  |  | 
|  | PRINT_DEBUG("linux-sandbox-pid1 has PID %d", child_pid); | 
|  |  | 
|  | // We close the write end of the sync pipe, read a byte and then close the | 
|  | // pipe. This proves to the linux-sandbox-pid1 process that we still existed | 
|  | // after it ran prctl(PR_SET_PDEATHSIG, SIGKILL), thus preventing a race | 
|  | // condition where the parent is killed before that call was made. | 
|  | char buf; | 
|  | if (close(sync_pipe[1]) < 0) { | 
|  | DIE("close"); | 
|  | } | 
|  | if (read(sync_pipe[0], &buf, 1) < 0) { | 
|  | DIE("read"); | 
|  | } | 
|  | if (close(sync_pipe[0]) < 0) { | 
|  | DIE("close"); | 
|  | } | 
|  |  | 
|  | PRINT_DEBUG("done manipulating pipes"); | 
|  |  | 
|  | return child_pid; | 
|  | } | 
|  |  | 
|  | static int WaitForPid1(const pid_t child_pid) { | 
|  | // Wait for the child to exit, obtaining usage information. Restart in the | 
|  | // case of a signal interrupting us. | 
|  | int child_status; | 
|  | struct rusage child_rusage; | 
|  | while (true) { | 
|  | const int ret = wait4(child_pid, &child_status, 0, &child_rusage); | 
|  | if (ret > 0) { | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (errno == EINTR) { | 
|  | continue; | 
|  | } | 
|  |  | 
|  | DIE("wait4"); | 
|  | } | 
|  |  | 
|  | // If we're supposed to write stats to a file, do so now. | 
|  | if (!opt.stats_path.empty()) { | 
|  | WriteStatsToFile(&child_rusage, opt.stats_path); | 
|  | } | 
|  |  | 
|  | // We want to exit in the same manner as the child. | 
|  | if (WIFSIGNALED(child_status)) { | 
|  | const int signal = WTERMSIG(child_status); | 
|  | PRINT_DEBUG("child exited due to receiving signal: %s", strsignal(signal)); | 
|  | return 128 + signal; | 
|  | } | 
|  |  | 
|  | const int exit_code = WEXITSTATUS(child_status); | 
|  | PRINT_DEBUG("child exited normally with code %d", exit_code); | 
|  | return exit_code; | 
|  | } | 
|  |  | 
|  | int main(int argc, char *argv[]) { | 
|  | // Ask the kernel to kill us with SIGKILL if our parent dies. | 
|  | if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) { | 
|  | DIE("prctl"); | 
|  | } | 
|  |  | 
|  | // Start with default signal actions and a clear signal mask. | 
|  | ClearSignalMask(); | 
|  |  | 
|  | // Ignore SIGTTIN and SIGTTOU, as we hand off the terminal to the child in | 
|  | // SpawnChild. | 
|  | IgnoreSignal(SIGTTIN); | 
|  | IgnoreSignal(SIGTTOU); | 
|  |  | 
|  | // Parse our command-line options and set up a global variable used by | 
|  | // PRINT_DEBUG. | 
|  | ParseOptions(argc, argv); | 
|  | global_debug = opt.debug; | 
|  |  | 
|  | // Redirect output as requested. | 
|  | Redirect(opt.stdout_path, STDOUT_FILENO); | 
|  | Redirect(opt.stderr_path, STDERR_FILENO); | 
|  |  | 
|  | // Set up two globals used by the child process. | 
|  | global_outer_uid = getuid(); | 
|  | global_outer_gid = getgid(); | 
|  |  | 
|  | // Ensure we don't pass on any FDs from our parent to our child. | 
|  | CloseFds(); | 
|  |  | 
|  | // Spawn the child that will fork the sandboxed program with fresh namespaces | 
|  | // etc. | 
|  | const pid_t child_pid = SpawnPid1(); | 
|  |  | 
|  | // Let the signal handlers installed below know the PID of the child. | 
|  | global_child_pid.store(child_pid, std::memory_order_relaxed); | 
|  |  | 
|  | // If a kill delay has been configured, let the signal handlers installed | 
|  | // below know that it needs to be respected. | 
|  | if (opt.kill_delay_secs > 0) { | 
|  | global_need_polite_sigterm.store(1, std::memory_order_relaxed); | 
|  | } | 
|  |  | 
|  | // OnTimeoutOrTerm, which is used for other signals below, assumes that it | 
|  | // handles SIGALRM. We also explicitly invoke it after the timeout using | 
|  | // alarm(2). | 
|  | InstallSignalHandler(SIGALRM, OnTimeoutOrTerm); | 
|  |  | 
|  | // If requested, arrange for the child to be killed (optionally after being | 
|  | // asked politely to terminate) once the timeout expires. | 
|  | // | 
|  | // Note that it's important to set this up before support for SIGTERM and | 
|  | // SIGINT. Otherwise one of those signals could arrive before we get here, and | 
|  | // then we would reset its opt.kill_delay_secs interval timer. | 
|  | if (opt.timeout_secs > 0) { | 
|  | alarm(opt.timeout_secs); | 
|  | } | 
|  |  | 
|  | // Also ask/tell the child to quit on SIGTERM, and optionally for SIGINT too. | 
|  | InstallSignalHandler(SIGTERM, OnTimeoutOrTerm); | 
|  | if (opt.sigint_sends_sigterm) { | 
|  | InstallSignalHandler(SIGINT, OnTimeoutOrTerm); | 
|  | } | 
|  |  | 
|  | // Wait for the child to exit, returning an appropriate status. | 
|  | return WaitForPid1(child_pid); | 
|  | } |