blob: 01e01fdc30d4ba1417b7ed3b91b27c00b428bd9f [file] [log] [blame]
// Copyright 2016 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* linux-sandbox runs commands in a restricted environment where they are
* subject to a few rules:
*
* - The entire filesystem is made read-only.
* - The working directory (-W) will be made read-write, though.
* - Individual files or directories can be made writable (but not deletable)
* (-w).
* - Individual files or directories can be made inaccessible / unreadable
* (-i).
* - tmpfs will be mounted on /tmp.
* - tmpfs can be mounted on top of existing directories (-e), too.
* - If the process takes longer than the timeout (-T), it will be killed with
* SIGTERM. If it does not exit within the grace period (-t), it all of its
* children will be killed with SIGKILL.
* - If linux-sandbox itself gets killed, the process and all of its children
* will be killed.
* - If linux-sandbox's parent dies, it will kill itself, the process and all
* the children.
* - Network access is allowed, but can be disabled via -N.
* - The process runs as user "nobody", unless fakeroot is enabled (-R).
* - The hostname and domainname will be set to "sandbox".
* - The process runs in its own PID namespace, so other processes on the
* system are invisible.
*/
#include "linux-sandbox-options.h"
#include "linux-sandbox-pid1.h"
#include "linux-sandbox-utils.h"
#define DIE(args...) \
{ \
fprintf(stderr, __FILE__ ":" S__LINE__ ": \"" args); \
fprintf(stderr, "\": "); \
perror(NULL); \
exit(EXIT_FAILURE); \
}
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <math.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <vector>
int global_outer_uid;
int global_outer_gid;
static char global_sandbox_root[] = "/tmp/sandbox.XXXXXX";
static int global_child_pid;
// The signal that will be sent to the child when a timeout occurs.
static volatile sig_atomic_t global_next_timeout_signal = SIGTERM;
// The signal that caused us to kill the child (e.g. on timeout).
static volatile sig_atomic_t global_signal;
static void CloseFds() {
DIR *fds = opendir("/proc/self/fd");
if (fds == NULL) {
DIE("opendir");
}
while (1) {
errno = 0;
struct dirent *dent = readdir(fds);
if (dent == NULL) {
if (errno != 0) {
DIE("readdir");
}
break;
}
if (isdigit(dent->d_name[0])) {
errno = 0;
int fd = strtol(dent->d_name, nullptr, 10);
// (1) Skip unparseable entries.
// (2) Close everything except stdin, stdout and stderr.
// (3) Do not accidentally close our directory handle.
if (errno == 0 && fd > STDERR_FILENO && fd != dirfd(fds)) {
if (close(fd) < 0) {
DIE("close");
}
}
}
}
if (closedir(fds) < 0) {
DIE("closedir");
}
}
static void RemoveSandboxRoot() {
if (rmdir(global_sandbox_root) < 0) {
DIE("rmdir(%s)", global_sandbox_root);
}
}
static void SetupSandboxRoot() {
if (opt.sandbox_root_dir == NULL) {
if (mkdtemp(global_sandbox_root) == NULL) {
DIE("mkdtemp(%s)", global_sandbox_root);
}
atexit(RemoveSandboxRoot);
opt.sandbox_root_dir = global_sandbox_root;
}
}
static void HandleSignal(int signum, void (*handler)(int)) {
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = handler;
if (sigemptyset(&sa.sa_mask) < 0) {
DIE("sigemptyset");
}
if (sigaction(signum, &sa, NULL) < 0) {
DIE("sigaction");
}
}
static void OnTimeout(int sig) {
global_signal = sig;
kill(global_child_pid, global_next_timeout_signal);
if (global_next_timeout_signal == SIGTERM && opt.kill_delay_secs > 0) {
global_next_timeout_signal = SIGKILL;
alarm(opt.kill_delay_secs);
}
}
static void SpawnPid1() {
const int kStackSize = 1024 * 1024;
std::vector<char> child_stack(kStackSize);
int sync_pipe[2];
if (pipe(sync_pipe) < 0) {
DIE("pipe");
}
int clone_flags = CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | SIGCHLD;
if (opt.create_netns) {
clone_flags |= CLONE_NEWNET;
}
// We use clone instead of unshare, because unshare sometimes fails with
// EINVAL due to a race condition in the Linux kernel (see
// https://lkml.org/lkml/2015/7/28/833).
global_child_pid =
clone(Pid1Main, child_stack.data() + kStackSize, clone_flags, sync_pipe);
if (global_child_pid < 0) {
DIE("clone");
}
PRINT_DEBUG("linux-sandbox-pid1 has PID %d", global_child_pid);
// We close the write end of the sync pipe, read a byte and then close the
// pipe. This proves to the linux-sandbox-pid1 process that we still existed
// after it ran prctl(PR_SET_PDEATHSIG, SIGKILL), thus preventing a race
// condition where the parent is killed before that call was made.
char buf;
if (close(sync_pipe[1]) < 0) {
DIE("close");
}
if (read(sync_pipe[0], &buf, 1) < 0) {
DIE("read");
}
if (close(sync_pipe[0]) < 0) {
DIE("close");
}
}
static int WaitForPid1() {
int err, status;
do {
err = waitpid(global_child_pid, &status, 0);
} while (err < 0 && errno == EINTR);
if (err < 0) {
DIE("waitpid");
}
if (global_signal > 0) {
// The child exited because we killed it due to receiving a signal
// ourselves. Do not trust the exitcode in this case, just calculate it from
// the signal.
PRINT_DEBUG("child exited due to us catching signal: %s",
strsignal(global_signal));
return 128 + global_signal;
} else if (WIFSIGNALED(status)) {
PRINT_DEBUG("child exited due to receiving signal: %s",
strsignal(WTERMSIG(status)));
return 128 + WTERMSIG(status);
} else {
PRINT_DEBUG("child exited normally with exitcode %d", WEXITSTATUS(status));
return WEXITSTATUS(status);
}
}
static void Redirect(const char *target_path, int fd, const char *name) {
if (target_path != NULL && strcmp(target_path, "-") != 0) {
const int flags = O_WRONLY | O_CREAT | O_TRUNC | O_APPEND;
int fd_out = open(target_path, flags, 0666);
if (fd_out < 0) {
DIE("open(%s)", target_path);
}
// If we were launched with less than 3 fds (stdin, stdout, stderr) open,
// but redirection is still requested via a command-line flag, something is
// wacky and the following code would not do what we intend to do, so let's
// bail.
if (fd_out < 3) {
DIE("open(%s) returned a handle that is reserved for stdin / stdout / "
"stderr",
target_path);
}
if (dup2(fd_out, fd) < 0) {
DIE("dup2()");
}
if (close(fd_out) < 0) {
DIE("close()");
}
}
}
static void RedirectStdout(const char *stdout_path) {
Redirect(stdout_path, STDOUT_FILENO, "stdout");
}
static void RedirectStderr(const char *stderr_path) {
Redirect(stderr_path, STDERR_FILENO, "stderr");
}
int main(int argc, char *argv[]) {
// Ask the kernel to kill us with SIGKILL if our parent dies.
if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
DIE("prctl");
}
ParseOptions(argc, argv);
RedirectStdout(opt.stdout_path);
RedirectStderr(opt.stderr_path);
// This should never be called as a setuid binary, drop privileges just in
// case. We don't need to be root, because we use user namespaces anyway.
if (setuid(getuid()) < 0) {
DIE("setuid");
}
global_outer_uid = getuid();
global_outer_gid = getgid();
// Make sure the sandboxed process does not inherit any accidentally left open
// file handles from our parent.
CloseFds();
SetupSandboxRoot();
HandleSignal(SIGALRM, OnTimeout);
if (opt.timeout_secs > 0) {
alarm(opt.timeout_secs);
}
SpawnPid1();
return WaitForPid1();
}