blob: 7c7a2f540142006a5c88eed9a18f3bfc7d015e87 [file] [log] [blame]
// Copyright 2016 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* This is PID 1 inside the sandbox environment and runs in a separate user,
* mount, UTS, IPC and PID namespace.
*/
#include "src/main/tools/linux-sandbox-pid1.h"
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <libgen.h>
#include <math.h>
#include <mntent.h>
#include <net/if.h>
#include <pwd.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <string>
#include <unordered_set>
#ifndef MS_REC
// Some systems do not define MS_REC in sys/mount.h. We might be able to grab it
// from linux/fs.h instead (cf. #2667).
#include <linux/fs.h>
#endif
#ifndef TEMP_FAILURE_RETRY
// Some C standard libraries like musl do not define this macro, so we'll
// include our own version for compatibility.
//
// Evaluates 'exp' again and again for as long as it yields -1 while errno is
// EINTR, and yields the first result for which that is no longer the case.
// 'exp' is re-evaluated on every retry, so it must be safe to repeat.
// The implementation relies on decltype and on a ({ ... }) statement
// expression, which is a compiler extension rather than standard C++.
#define TEMP_FAILURE_RETRY(exp) \
({ \
decltype(exp) _rc; \
do { \
_rc = (exp); \
} while (_rc == -1 && errno == EINTR); \
_rc; \
})
#endif // TEMP_FAILURE_RETRY
#include "src/main/tools/linux-sandbox-options.h"
#include "src/main/tools/linux-sandbox.h"
#include "src/main/tools/logging.h"
#include "src/main/tools/process-tools.h"
// Result of the fork() call in SpawnChild(). It is read by ForwardSignal() to
// address the child's process group and by WaitForChild() to recognize the
// child among the processes it reaps.
static int global_child_pid;
// Helper methods
// Creates a brand-new empty file at 'path'. Because of O_EXCL the call fails
// if something already exists there. Any failure of open() or close() is
// reported through DIE.
static void CreateFile(const char *path) {
  const int fd = open(path, O_CREAT | O_WRONLY | O_EXCL, 0666);
  if (fd < 0) {
    DIE("open");
  }
  const int close_rc = close(fd);
  if (close_rc < 0) {
    DIE("close");
  }
}
// Produces an empty file at 'path' by hard-linking the pre-made
// "tmp/empty_file" instead of calling open(). On certain filesystems
// (e.g. XFS) this is more than twice as fast as creating the file from
// scratch. A failing link() is reported through DIE.
static void LinkFile(const char *path) {
  const int rc = link("tmp/empty_file", path);
  if (rc < 0) {
    DIE("link %s", path);
  }
}
// Makes sure that 'path' exists as a directory (is_directory == true) or as a
// regular file (is_directory == false), creating missing parent directories
// recursively. New files are created via LinkFile().
//
// Returns 0 if the target already existed with the requested kind or was
// created. Returns -1 with errno set to:
//   EINVAL   path is null
//   ENOTDIR  a directory was requested, but path exists and is not one
//   EEXIST   a file was requested, but path exists and is not a regular file
//   (other)  whatever stat() reported, if it failed for a reason other than
//            ENOENT
// Failures while actually creating something (strdup, parent creation, mkdir,
// link) are reported through DIE instead of being returned.
static int CreateTarget(const char *path, bool is_directory) {
  if (path == NULL) {
    errno = EINVAL;
    return -1;
  }

  struct stat st;
  if (stat(path, &st) == 0) {
    // Something is already there: accept it only if it has the right kind.
    const bool kind_matches =
        is_directory ? S_ISDIR(st.st_mode) : S_ISREG(st.st_mode);
    if (kind_matches) {
      return 0;
    }
    errno = is_directory ? ENOTDIR : EEXIST;
    return -1;
  }
  if (errno != ENOENT) {
    // stat() failed for a reason other than "does not exist".
    return -1;
  }

  // dirname() may modify its argument, so hand it a private copy of the path.
  char *path_copy = strdup(path);
  if (path_copy == NULL) {
    DIE("strdup");
  }
  char *parent = dirname(path_copy);
  if (CreateTarget(parent, true) < 0) {
    DIE("CreateTarget %s", parent);
  }
  free(path_copy);

  if (!is_directory) {
    LinkFile(path);
    return 0;
  }
  if (mkdir(path, 0755) < 0) {
    DIE("mkdir(%s)", path);
  }
  return 0;
}
// Arranges for this process to be killed when its parent dies, detaches it
// from the parent's process group, and finally signals the parent through
// 'pipe_to_parent'.
static void SetupSelfDestruction(int *pipe_to_parent) {
  // Ask the kernel to deliver SIGKILL to us once the parent goes away.
  // Alternatives would be poll()ing the pipe fd (relying on SIGCHLD to
  // interrupt it, which might need a trivial SIGCHLD handler), turning the
  // pipe close into SIGIO via O_ASYNC, or signalfd -- although the latter is
  // almost as obscure as this prctl.
  const int prctl_rc = prctl(PR_SET_PDEATHSIG, SIGKILL);
  if (prctl_rc < 0) {
    DIE("prctl");
  }

  // Our inherited process group still refers to the outer PID namespace, so
  // something like `kill(0, sig)` could accidentally hit our parent. Move to
  // a process group of our own.
  const int setpgid_rc = setpgid(0, 0);
  if (setpgid_rc < 0) {
    DIE("setpgid");
  }

  // Verify that the parent still lives.
  SignalPipe(pipe_to_parent);
}
// Recursively marks every mount below "/" as private, so that mount events in
// the outside environment do not propagate into the sandbox.
static void SetupMountNamespace() {
  const unsigned long flags = MS_REC | MS_PRIVATE;
  const int rc = mount(nullptr, "/", nullptr, flags, nullptr);
  if (rc < 0) {
    DIE("mount");
  }
}
// Configures the user namespace: denies setgroups (where the kernel offers
// that switch) and writes single-entry uid/gid maps that map the outer
// uid/gid to the identity selected by the options:
//   opt.fake_root      -> uid 0 / gid 0
//   opt.fake_username  -> the uid/gid of the "nobody" passwd entry
//   otherwise          -> the unchanged outer uid/gid
// With opt.enable_pty the inner gid is replaced by the gid of group "tty".
// All failures are reported through DIE.
static void SetupUserNamespace() {
  // Disable needs for CAP_SETGID.
  struct stat sb;
  if (stat("/proc/self/setgroups", &sb) == 0) {
    WriteFile("/proc/self/setgroups", "deny");
  } else {
    // Ignore ENOENT, because older Linux versions do not have this file (but
    // also do not require writing to it).
    if (errno != ENOENT) {
      DIE("stat(/proc/self/setgroups)");
    }
  }

  uid_t inner_uid;
  gid_t inner_gid;
  if (opt.fake_root) {
    // Change our username to 'root'.
    inner_uid = 0;
    inner_gid = 0;
  } else if (opt.fake_username) {
    // Change our username to 'nobody'.
    struct passwd *pwd = getpwnam("nobody");
    if (pwd == nullptr) {
      DIE("unable to find passwd entry for user nobody");
    }
    inner_uid = pwd->pw_uid;
    inner_gid = pwd->pw_gid;
  } else {
    // Do not change the username inside the sandbox.
    inner_uid = global_outer_uid;
    inner_gid = global_outer_gid;
  }

  if (opt.enable_pty) {
    // Change the group to "tty" regardless of what was previously set.
    struct group grp;
    char buf[256];
    struct group *result = nullptr;
    const int lookup_err = getgrnam_r("tty", &grp, buf, sizeof(buf), &result);
    if (result == nullptr) {
      // getgrnam_r() reports errors through its return value rather than
      // through errno, and returns 0 with a null result when the group simply
      // does not exist. Put a meaningful value into errno so that an
      // errno-based failure report is not based on a stale value.
      errno = (lookup_err != 0) ? lookup_err : ENOENT;
      DIE("getgrnam_r");
    }
    inner_gid = grp.gr_gid;
  }

  WriteFile("/proc/self/uid_map", "%u %u 1\n", inner_uid, global_outer_uid);
  WriteFile("/proc/self/gid_map", "%u %u 1\n", inner_gid, global_outer_gid);
}
// Replaces the host name with "localhost" and the domain name with
// "localdomain". Both calls pass the string length without the terminating
// NUL.
static void SetupUtsNamespace() {
  static const char kHostName[] = "localhost";
  static const char kDomainName[] = "localdomain";

  if (sethostname(kHostName, sizeof(kHostName) - 1) < 0) {
    DIE("sethostname");
  }
  if (setdomainname(kDomainName, sizeof(kDomainName) - 1) < 0) {
    DIE("setdomainname");
  }
}
// Performs all mounts for the non-hermetic sandbox, in this order:
//   1. the requested bind mounts (source i onto target i, recursively),
//   2. a tmpfs on every entry of opt.tmpfs_dirs,
//   3. every writable file bind-mounted onto itself (unless it was already
//      used as a bind mount source),
//   4. the working directory bind-mounted onto itself.
// Before mounting anything it rejects configurations in which the working
// directory lies at or below a tmpfs path. Failures are reported through DIE.
static void MountFilesystems() {
  // An attempt to mount the sandbox in tmpfs will always fail, so this block is
  // slightly redundant with the next mount() check, but dumping the mount()
  // syscall is incredibly cryptic, so we explicitly check against and warn
  // about attempts to use tmpfs.
  for (const std::string &tmpfs_dir : opt.tmpfs_dirs) {
    const size_t prefix_len = tmpfs_dir.size();
    const bool has_prefix =
        opt.working_dir.compare(0, prefix_len, tmpfs_dir) == 0;
    // A plain string-prefix match is not enough: "/tmpfoo" is not below
    // "/tmp". The prefix must end exactly on a path-component boundary. (An
    // empty tmpfs path keeps matching everything, as it always did.)
    const bool on_component_boundary =
        has_prefix &&
        (prefix_len == 0 || tmpfs_dir[prefix_len - 1] == '/' ||
         opt.working_dir.size() == prefix_len ||
         opt.working_dir[prefix_len] == '/');
    if (on_component_boundary) {
      DIE("The sandbox working directory cannot be below a path where we mount "
          "tmpfs (you requested mounting %s in %s). Is your --output_base= "
          "below one of your --sandbox_tmpfs_path values?",
          opt.working_dir.c_str(), tmpfs_dir.c_str());
    }
  }

  // Remember every bind mount source so that writable files which are also
  // bind mount sources are not mounted a second time below.
  std::unordered_set<std::string> bind_mount_sources;
  for (size_t i = 0; i < opt.bind_mount_sources.size(); i++) {
    const std::string &source = opt.bind_mount_sources.at(i);
    bind_mount_sources.insert(source);
    const std::string &target = opt.bind_mount_targets.at(i);
    PRINT_DEBUG("bind mount: %s -> %s", source.c_str(), target.c_str());
    if (mount(source.c_str(), target.c_str(), nullptr, MS_BIND | MS_REC,
              nullptr) < 0) {
      DIE("mount(%s, %s, nullptr, MS_BIND | MS_REC, nullptr)", source.c_str(),
          target.c_str());
    }
  }

  for (const std::string &tmpfs_dir : opt.tmpfs_dirs) {
    PRINT_DEBUG("tmpfs: %s", tmpfs_dir.c_str());
    if (mount("tmpfs", tmpfs_dir.c_str(), "tmpfs",
              MS_NOSUID | MS_NODEV | MS_NOATIME, nullptr) < 0) {
      DIE("mount(tmpfs, %s, tmpfs, MS_NOSUID | MS_NODEV | MS_NOATIME, nullptr)",
          tmpfs_dir.c_str());
    }
  }

  for (const std::string &writable_file : opt.writable_files) {
    PRINT_DEBUG("writable: %s", writable_file.c_str());
    if (bind_mount_sources.find(writable_file) != bind_mount_sources.end()) {
      // Bind mount sources contained in writable_files will be kept writable in
      // MakeFileSystemMostlyReadOnly, but have already been mounted at this
      // point.
      continue;
    }
    if (mount(writable_file.c_str(), writable_file.c_str(), nullptr,
              MS_BIND | MS_REC, nullptr) < 0) {
      DIE("mount(%s, %s, nullptr, MS_BIND | MS_REC, nullptr)",
          writable_file.c_str(), writable_file.c_str());
    }
  }

  // Make sure that the working directory is writable (unlike most of the rest
  // of the file system, which is read-only by default). The easiest way to do
  // this is by bind-mounting it upon itself.
  PRINT_DEBUG("working dir: %s", opt.working_dir.c_str());
  if (mount(opt.working_dir.c_str(), opt.working_dir.c_str(), nullptr, MS_BIND,
            nullptr) < 0) {
    DIE("mount(%s, %s, nullptr, MS_BIND, nullptr)", opt.working_dir.c_str(),
        opt.working_dir.c_str());
  }
}
// Decides whether the mount point 'mnt_dir' is exempt from the later
// read-only remount. That is the case for the working directory, for
// "/dev/pts" when opt.enable_pty is set, and for every path listed in
// opt.writable_files or opt.tmpfs_dirs. Paths are compared for exact
// equality.
static bool ShouldBeWritable(const std::string &mnt_dir) {
  if (mnt_dir == opt.working_dir ||
      (opt.enable_pty && mnt_dir == "/dev/pts")) {
    return true;
  }

  bool listed = false;
  for (const std::string &candidate : opt.writable_files) {
    if (candidate == mnt_dir) {
      listed = true;
      break;
    }
  }
  if (!listed) {
    for (const std::string &candidate : opt.tmpfs_dirs) {
      if (candidate == mnt_dir) {
        listed = true;
        break;
      }
    }
  }
  return listed;
}
// Walks over all entries of /proc/self/mounts and remounts each of them
// read-only, except for those mount points for which ShouldBeWritable()
// returns true (these are remounted too, but without MS_RDONLY). A fixed set
// of errno values from the remount is tolerated; any other error is reported
// through DIE.
static void MakeFilesystemMostlyReadOnly() {
  // MS_REMOUNT does not allow us to change certain flags. This means we have
  // to read them out of the mount options first and pass them back in. There
  // seems to be no better way than this (an API for just getting the mount
  // flags of a mount entry as a bitmask would be great).
  static const struct {
    const char *option;
    int flag;
  } kPreservedOptions[] = {
      {"nodev", MS_NODEV},           {"noexec", MS_NOEXEC},
      {"nosuid", MS_NOSUID},         {"noatime", MS_NOATIME},
      {"nodiratime", MS_NODIRATIME}, {"relatime", MS_RELATIME},
  };

  FILE *mount_table = setmntent("/proc/self/mounts", "r");
  if (mount_table == nullptr) {
    DIE("setmntent");
  }

  for (struct mntent *entry = getmntent(mount_table); entry != nullptr;
       entry = getmntent(mount_table)) {
    int mountFlags = MS_BIND | MS_REMOUNT;
    for (const auto &preserved : kPreservedOptions) {
      if (hasmntopt(entry, preserved.option) != nullptr) {
        mountFlags |= preserved.flag;
      }
    }
    if (!ShouldBeWritable(entry->mnt_dir)) {
      mountFlags |= MS_RDONLY;
    }

    PRINT_DEBUG("remount %s: %s", (mountFlags & MS_RDONLY) ? "ro" : "rw",
                entry->mnt_dir);
    if (mount(nullptr, entry->mnt_dir, nullptr, mountFlags, nullptr) >= 0) {
      continue;
    }

    // Tolerated failures:
    //  - EACCES / EPERM: possibly a mount point we have no read access to.
    //    Nothing we can do about it, but it does no harm either.
    //  - EINVAL / ENOENT: raised when a later mount overlaps an earlier one.
    //    Consider /proc, /proc/sys/fs/binfmt_misc and /proc again, the
    //    latter being mounted by an outer sandbox on top of its parent
    //    /proc: /proc/sys/fs/binfmt_misc is hidden then and cannot be
    //    remounted.
    //  - ESTALE: a broken NFS mount. Ideally the user fixes or removes it,
    //    but where that is not possible we just ignore it.
    //  - ENODEV: can occur on autofs/automount failure.
    switch (errno) {
      case EACCES:
      case EPERM:
      case EINVAL:
      case ENOENT:
      case ESTALE:
      case ENODEV:
        PRINT_DEBUG(
            "remount(nullptr, %s, nullptr, %d, nullptr) failure (%m) ignored",
            entry->mnt_dir, mountFlags);
        break;
      default:
        DIE("remount(nullptr, %s, nullptr, %d, nullptr)", entry->mnt_dir,
            mountFlags);
    }
  }

  endmntent(mount_table);
}
// Mounts a fresh proc filesystem over /proc. The previously visible one still
// describes the parent PID namespace.
static void MountProc() {
  const unsigned long flags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
  const int rc = mount("/proc", "/proc", "proc", flags, nullptr);
  if (rc < 0) {
    DIE("mount");
  }
}
// Brings up the loopback interface "lo" when the sandbox was asked to run in
// its own network namespace with loopback (NETNS_WITH_LOOPBACK), because some
// applications may want to use it. Does nothing for any other setting of
// opt.create_netns.
static void SetupNetworking() {
  if (opt.create_netns != NETNS_WITH_LOOPBACK) {
    return;
  }

  // Any socket will do; it only serves as a handle for the interface ioctl.
  const int sock = socket(AF_INET, SOCK_DGRAM, 0);
  if (sock < 0) {
    DIE("socket");
  }

  struct ifreq request = {};
  strncpy(request.ifr_name, "lo", IF_NAMESIZE);

  // Verify that name is valid.
  if (if_nametoindex(request.ifr_name) == 0) {
    DIE("if_nametoindex");
  }

  // Enable the interface.
  request.ifr_flags |= IFF_UP;
  if (ioctl(sock, SIOCSIFFLAGS, &request) < 0) {
    DIE("ioctl");
  }

  if (close(sock) < 0) {
    DIE("close");
  }
}
// Changes the current directory to the working directory. In hermetic mode
// the leading sandbox root and the separator character following it are cut
// off opt.working_dir first, which turns it into a relative path.
static void EnterWorkingDirectory() {
  const std::string destination =
      opt.hermetic ? opt.working_dir.substr(opt.sandbox_root.size() + 1)
                   : opt.working_dir;
  if (chdir(destination.c_str()) < 0) {
    DIE("chdir(%s)", destination.c_str());
  }
}
// Signal handler that relays 'signum' to the process group whose id equals
// global_child_pid (the child makes itself a group leader in SpawnChild()).
static void ForwardSignal(int signum) {
  // A handler can run between a failing call and the interrupted code's
  // inspection of errno (e.g. the EINTR test inside TEMP_FAILURE_RETRY).
  // kill() may overwrite errno, so preserve the interrupted code's value.
  const int saved_errno = errno;
  kill(-global_child_pid, signum);
  errno = saved_errno;
}
// Forks and stores the result in global_child_pid. The parent only logs the
// child's PID and returns. The child moves into its own process group, tries
// to take over the terminal, resets its signal state, closes the debug log,
// sets a umask of 022 and then replaces itself with the program described by
// opt.args via execvp().
static void SpawnChild() {
  PRINT_DEBUG("calling fork...");
  global_child_pid = fork();

  if (global_child_pid > 0) {
    PRINT_DEBUG("child started with PID %d", global_child_pid);
  } else if (global_child_pid < 0) {
    DIE("fork()");
  } else {
    // We are the child from here on.

    // Put the child into its own process group.
    if (setpgid(0, 0) < 0) {
      DIE("setpgid");
    }

    // Try to assign our terminal to the child process. Not having a terminal
    // at all (ENOTTY) is fine.
    const int tty_rc = tcsetpgrp(STDIN_FILENO, getpgrp());
    if (tty_rc < 0 && errno != ENOTTY) {
      DIE("tcsetpgrp");
    }

    // Unblock all signals, restore default handlers.
    ClearSignalMask();

    // Close the file PRINT_DEBUG writes to. This must happen late enough so
    // that no debugging output gets lost.
    if (global_debug) {
      fclose(global_debug);
      global_debug = nullptr;
    }

    // Force umask to include read and execute for everyone, to make output
    // permissions predictable.
    umask(022);

    // argv[] passed to execve() must be a null-terminated array.
    opt.args.push_back(nullptr);
    if (execvp(opt.args[0], opt.args.data()) < 0) {
      DIE("execvp(%s, %p)", opt.args[0], opt.args.data());
    }
  }
}
// Reaps terminated processes until the one with PID global_child_pid shows
// up. Returns 128 plus the signal number if that child was terminated by a
// signal, and its exit code otherwise. A failing wait() is reported through
// DIE.
static int WaitForChild() {
  for (;;) {
    // Wait for some process to exit. This includes reparented processes in
    // our PID namespace.
    int status;
    const pid_t reaped = TEMP_FAILURE_RETRY(wait(&status));
    if (reaped < 0) {
      // We don't expect any errors besides EINTR. In particular, ECHILD
      // should be impossible because we haven't yet seen global_child_pid
      // exit.
      DIE("wait");
    }
    PRINT_DEBUG("wait returned pid=%d, status=0x%02x", reaped, status);

    if (reaped == global_child_pid) {
      if (WIFSIGNALED(status)) {
        // Killed by a signal: log that fact and mirror it in our own result.
        const int term_signal = WTERMSIG(status);
        PRINT_DEBUG("child exited due to signal %d", term_signal);
        return 128 + term_signal;
      }

      // Otherwise it must have exited normally.
      const int exit_code = WEXITSTATUS(status);
      PRINT_DEBUG("child exited normally with code %d", exit_code);
      return exit_code;
    }

    // Some other process: we have successfully reaped a zombie and there is
    // nothing further to do for it.
  }
}
// Bind-mounts the sandbox root onto itself (with MS_NOSUID) and makes it the
// current directory.
static void MountSandboxAndGoThere() {
  const char *root = opt.sandbox_root.c_str();
  if (mount(root, root, nullptr, MS_BIND | MS_NOSUID, nullptr) < 0) {
    DIE("mount");
  }
  if (chdir(root) < 0) {
    DIE("chdir(%s)", root);
  }
}
// Ensures the relative directory "tmp" exists and creates "tmp/empty_file"
// inside it. LinkFile() hard-links from that file, so it serves as the base
// for bind mount targets.
static void CreateEmptyFile() {
  const int rc = CreateTarget("tmp", true);
  if (rc < 0) {
    DIE("CreateTarget tmp");
  }
  CreateFile("tmp/empty_file");
}
// Populates a minimal "dev" directory relative to the current directory:
// null, random, urandom and zero are bind-mounted from the host's /dev onto
// freshly linked placeholder files, and "dev/fd" becomes a symlink to
// /proc/self/fd.
static void MountDev() {
  if (CreateTarget("dev", true) < 0) {
    DIE("CreateTarget /dev");
  }

  static const char *const kDevices[] = {"/dev/null", "/dev/random",
                                         "/dev/urandom", "/dev/zero"};
  for (const char *device : kDevices) {
    // Skipping the leading '/' yields the path relative to the current
    // directory.
    const char *relative = device + 1;
    LinkFile(relative);
    if (mount(device, relative, NULL, MS_BIND, NULL) < 0) {
      DIE("mount");
    }
  }

  if (symlink("/proc/self/fd", "dev/fd") < 0) {
    DIE("symlink");
  }
}
// Performs the mounts for the hermetic sandbox, in this order:
//   1. a tmpfs on every entry of opt.tmpfs_dirs,
//   2. the working directory bind-mounted onto itself,
//   3. every bind mount source onto <sandbox_root><target>, creating the
//      target file or directory first,
//   4. every writable file bind-mounted onto itself.
// Failures are reported through DIE.
static void MountAllMounts() {
  for (const std::string &tmpfs_dir : opt.tmpfs_dirs) {
    const char *tmpfs_path = tmpfs_dir.c_str();
    PRINT_DEBUG("tmpfs: %s", tmpfs_path);
    if (mount("tmpfs", tmpfs_path, "tmpfs", MS_NOSUID | MS_NODEV | MS_NOATIME,
              nullptr) < 0) {
      DIE("mount(tmpfs, %s, tmpfs, MS_NOSUID | MS_NODEV | MS_NOATIME, nullptr)",
          tmpfs_path);
    }
  }

  // Make sure that the working directory is writable (unlike most of the rest
  // of the file system, which is read-only by default). The easiest way to do
  // this is by bind-mounting it upon itself.
  const char *work_dir = opt.working_dir.c_str();
  if (mount(work_dir, work_dir, nullptr, MS_BIND, nullptr) < 0) {
    DIE("mount(%s, %s, nullptr, MS_BIND, nullptr)", work_dir, work_dir);
  }

  const size_t mount_count = opt.bind_mount_sources.size();
  for (size_t idx = 0; idx < mount_count; ++idx) {
    const std::string &source = opt.bind_mount_sources[idx];
    const std::string &target = opt.bind_mount_targets[idx];

    if (global_debug) {
      if (strcmp(source.c_str(), target.c_str()) != 0) {
        // The file is mounted to a custom location inside the sandbox.
        // Create a user-friendly string for the sandboxed path and show it.
        const std::string shown_target("<sandbox>" + target);
        PRINT_DEBUG("mount: %s -> %s\n", source.c_str(), shown_target.c_str());
      } else {
        // The file is mounted to the same path inside the sandbox as outside
        // (e.g. /home/user -> <sandbox>/home/user), so a simplified version
        // of the mount command is enough.
        PRINT_DEBUG("mount: %s\n", source.c_str());
      }
    }

    const std::string sandboxed_target(opt.sandbox_root + target);

    // The mount target has to be of the same kind (directory or file) as the
    // source.
    struct stat source_stat;
    if (stat(source.c_str(), &source_stat) < 0) {
      DIE("stat");
    }
    const bool source_is_dir = S_ISDIR(source_stat.st_mode);
    if (CreateTarget(sandboxed_target.c_str(), source_is_dir) < 0) {
      DIE("CreateTarget %s", sandboxed_target.c_str());
    }

    if (mount(source.c_str(), sandboxed_target.c_str(), NULL,
              MS_REC | MS_BIND | MS_RDONLY, NULL) != 0) {
      DIE("mount");
    }
  }

  for (const std::string &writable_file : opt.writable_files) {
    const char *writable_path = writable_file.c_str();
    PRINT_DEBUG("writable: %s", writable_path);
    if (mount(writable_path, writable_path, nullptr, MS_BIND | MS_REC,
              nullptr) < 0) {
      DIE("mount(%s, %s, nullptr, MS_BIND | MS_REC, nullptr)", writable_path,
          writable_path);
    }
  }
}
// Makes the current directory the new root of the filesystem: the previous
// root is moved into a temporary directory via pivot_root, lazily unmounted
// and the temporary directory is removed again.
static void ChangeRoot() {
  // Template for mkdtemp(); the XXXXXX part is replaced in place.
  char previous_root[] = "old-root-XXXXXX";
  if (mkdtemp(previous_root) == NULL) {
    perror("mkdtemp");
    DIE("mkdtemp returned NULL\n");
  }

  // pivot_root has no wrapper in libc, so we need syscall().
  const long pivot_rc = syscall(SYS_pivot_root, ".", previous_root);
  if (pivot_rc < 0) {
    DIE("syscall");
  }

  const int chroot_rc = chroot(".");
  if (chroot_rc < 0) {
    DIE("chroot");
  }

  // Detach the old root and drop the now empty mount point.
  const int umount_rc = umount2(previous_root, MNT_DETACH);
  if (umount_rc < 0) {
    DIE("umount2");
  }

  const int rmdir_rc = rmdir(previous_root);
  if (rmdir_rc < 0) {
    DIE("rmdir");
  }
}
// Entry point of the process that acts as PID 1 inside the sandbox.
//
// 'args' must point to a Pid1Args object; it is copied before use.
//
// The function verifies that it really runs as PID 1, synchronizes with the
// parent through the two pipes in Pid1Args, sets up the namespaces and the
// filesystem view (hermetic or non-hermetic, depending on opt.hermetic),
// spawns the sandboxed child and waits for it.
//
// Returns the result of WaitForChild(): the child's exit code, or 128 plus
// the signal number if the child was terminated by a signal.
int Pid1Main(void *args) {
PRINT_DEBUG("Pid1Main started");
Pid1Args pid1Args = *(static_cast<Pid1Args *>(args));
if (getpid() != 1) {
DIE("Using PID namespaces, but we are not PID 1");
}
// Before pid1 spawns a child pid2, we want to wait for the parent process to
// move pid1 to the cgroup so that pid2 will be created in the same cgroup.
WaitPipe(pid1Args.pipe_from_parent);
// Start with default signal handlers and an empty signal mask.
ClearSignalMask();
SetupSelfDestruction(pid1Args.pipe_to_parent);
// Sandbox ourselves.
SetupMountNamespace();
SetupUserNamespace();
if (opt.fake_hostname) {
SetupUtsNamespace();
}
if (opt.hermetic) {
// Hermetic mode: populate a new root below opt.sandbox_root (empty-file
// template, dev, proc, requested mounts) and then switch the root to it.
MountSandboxAndGoThere();
CreateEmptyFile();
MountDev();
MountProc();
MountAllMounts();
ChangeRoot();
} else {
// Non-hermetic mode: keep the existing root, add the requested mounts and
// remount everything read-only except the explicitly writable paths.
MountFilesystems();
MakeFilesystemMostlyReadOnly();
MountProc();
}
SetupNetworking();
EnterWorkingDirectory();
// Ignore terminal signals; we hand off the terminal to the child in
// SpawnChild below.
IgnoreSignal(SIGTTIN);
IgnoreSignal(SIGTTOU);
// Fork the child process.
SpawnChild();
// Forward requests to shut down gracefully to the child.
InstallSignalHandler(SIGTERM, ForwardSignal);
// Note that there's no need to kill any remaining descendant processes; they
// are in our PID namespace and the kernel will send them SIGKILL
// automatically once we exit.
return WaitForChild();
}