| // Copyright 2016 The Bazel Authors. All rights reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| /** |
| * This is PID 1 inside the sandbox environment and runs in a separate user, |
| * mount, UTS, IPC and PID namespace. |
| */ |
| |
| #include "linux-sandbox-options.h" |
| #include "linux-sandbox-utils.h" |
| #include "linux-sandbox.h" |
| |
| // Note that we define DIE() here and not in a shared header, because we want to |
| // use _exit() in the |
| // pid1 child, but exit() in the parent. |
| #define DIE(args...) \ |
| { \ |
| fprintf(stderr, __FILE__ ":" S__LINE__ ": \"" args); \ |
| fprintf(stderr, "\": "); \ |
| perror(NULL); \ |
| _exit(EXIT_FAILURE); \ |
| } |
| |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <libgen.h> |
| #include <math.h> |
| #include <mntent.h> |
| #include <net/if.h> |
| #include <pwd.h> |
| #include <signal.h> |
| #include <stdarg.h> |
| #include <stdbool.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/ioctl.h> |
| #include <sys/mount.h> |
| #include <sys/prctl.h> |
| #include <sys/stat.h> |
| #include <sys/syscall.h> |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| #include <unistd.h> |
| |
| static int global_child_pid; |
| static char global_inaccessible_directory[] = "tmp/empty.XXXXXX"; |
| static char global_inaccessible_file[] = "tmp/empty.XXXXXX"; |
| |
| static void SetupSelfDestruction(int *sync_pipe) { |
| // We could also poll() on the pipe fd to find out when the parent goes away, |
| // and rely on SIGCHLD interrupting that otherwise. That might require us to |
| // install some trivial handler for SIGCHLD. Using O_ASYNC to turn the pipe |
| // close into SIGIO may also work. Another option is signalfd, although that's |
| // almost as obscure as this prctl. |
| if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) { |
| DIE("prctl"); |
| } |
| |
| // Verify that the parent still lives. |
| char buf = 0; |
| if (close(sync_pipe[0]) < 0) { |
| DIE("close"); |
| } |
| if (write(sync_pipe[1], &buf, 1) < 0) { |
| DIE("write"); |
| } |
| if (close(sync_pipe[1]) < 0) { |
| DIE("close"); |
| } |
| } |
| |
| static void SetupMountNamespace() { |
| // Fully isolate our mount namespace private from outside events, so that |
| // mounts in the outside environment do not affect our sandbox. |
| if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) { |
| DIE("mount"); |
| } |
| } |
| |
| static void WriteFile(const char *filename, const char *fmt, ...) { |
| FILE *stream = fopen(filename, "w"); |
| if (stream == NULL) { |
| DIE("fopen(%s)", filename); |
| } |
| |
| va_list ap; |
| va_start(ap, fmt); |
| int r = vfprintf(stream, fmt, ap); |
| va_end(ap); |
| |
| if (r < 0) { |
| DIE("vfprintf"); |
| } |
| |
| if (fclose(stream) != 0) { |
| DIE("fclose(%s)", filename); |
| } |
| } |
| |
| static void SetupUserNamespace() { |
| // Disable needs for CAP_SETGID. |
| struct stat sb; |
| if (stat("/proc/self/setgroups", &sb) == 0) { |
| WriteFile("/proc/self/setgroups", "deny"); |
| } else { |
| // Ignore ENOENT, because older Linux versions do not have this file (but |
| // also do not require writing to it). |
| if (errno != ENOENT) { |
| DIE("stat(/proc/self/setgroups"); |
| } |
| } |
| |
| int inner_uid = 0, inner_gid = 0; |
| if (!opt.fake_root) { |
| struct passwd *pwd = getpwnam("nobody"); |
| if (pwd == NULL) { |
| DIE("unable to find passwd entry for user nobody") |
| } |
| |
| inner_uid = pwd->pw_uid; |
| inner_gid = pwd->pw_gid; |
| } |
| |
| WriteFile("/proc/self/uid_map", "%d %d 1\n", inner_uid, global_outer_uid); |
| WriteFile("/proc/self/gid_map", "%d %d 1\n", inner_gid, global_outer_gid); |
| } |
| |
| static void SetupUtsNamespace() { |
| if (sethostname("sandbox", 7) < 0) { |
| DIE("sethostname"); |
| } |
| |
| if (setdomainname("sandbox", 7) < 0) { |
| DIE("setdomainname"); |
| } |
| } |
| |
| static void SetupHelperFiles() { |
| if (mkdtemp(global_inaccessible_directory) == NULL) { |
| DIE("mkdtemp(%s)", global_inaccessible_directory); |
| } |
| if (chmod(global_inaccessible_directory, 0) < 0) { |
| DIE("chmod(%s, 0)", global_inaccessible_directory); |
| } |
| |
| int handle = mkstemp(global_inaccessible_file); |
| if (handle < 0) { |
| DIE("mkstemp(%s)", global_inaccessible_file); |
| } |
| if (fchmod(handle, 0)) { |
| DIE("fchmod(%s, 0)", global_inaccessible_file); |
| } |
| if (close(handle) < 0) { |
| DIE("close(%s)", global_inaccessible_file); |
| } |
| } |
| |
| static bool IsDirectory(const char *path) { |
| struct stat sb; |
| if (stat(path, &sb) < 0) { |
| DIE("stat(%s)", path); |
| } |
| return S_ISDIR(sb.st_mode); |
| } |
| |
| // Recursively creates the file or directory specified in "path" and its parent |
| // directories. |
| static int CreateTarget(const char *path, bool is_directory) { |
| PRINT_DEBUG("CreateTarget(%s, %s)", path, is_directory ? "true" : "false"); |
| if (path == NULL) { |
| errno = EINVAL; |
| return -1; |
| } |
| |
| struct stat sb; |
| // If the path already exists... |
| if (stat(path, &sb) == 0) { |
| if (is_directory && S_ISDIR(sb.st_mode)) { |
| // and it's a directory and supposed to be a directory, we're done here. |
| return 0; |
| } else if (!is_directory && S_ISREG(sb.st_mode)) { |
| // and it's a regular file and supposed to be one, we're done here. |
| return 0; |
| } else { |
| // otherwise something is really wrong. |
| errno = is_directory ? ENOTDIR : EEXIST; |
| return -1; |
| } |
| } else { |
| // If stat failed because of any error other than "the path does not exist", |
| // this is an error. |
| if (errno != ENOENT) { |
| return -1; |
| } |
| } |
| |
| // Create the parent directory. |
| if (CreateTarget(dirname(strdupa(path)), true) < 0) { |
| DIE("CreateTarget(%s, true)", dirname(strdupa(path))); |
| } |
| |
| if (is_directory) { |
| if (mkdir(path, 0755) < 0) { |
| DIE("mkdir(%s, 0755)", path); |
| } |
| } else { |
| int handle; |
| if ((handle = open(path, O_CREAT | O_WRONLY | O_EXCL, 0666)) < 0) { |
| DIE("open(%s, O_CREAT | O_WRONLY | O_EXCL, 0666)", path); |
| } |
| if (close(handle) < 0) { |
| DIE("close(%d)", handle); |
| } |
| } |
| |
| return 0; |
| } |
| |
| static void MountFilesystems() { |
| if (mount("/", opt.sandbox_root_dir, NULL, MS_BIND | MS_REC, NULL) < 0) { |
| DIE("mount(/, %s, NULL, MS_BIND | MS_REC, NULL)", opt.sandbox_root_dir); |
| } |
| |
| if (chdir(opt.sandbox_root_dir) < 0) { |
| DIE("chdir(%s)", opt.sandbox_root_dir); |
| } |
| |
| for (const char *tmpfs_dir : opt.tmpfs_dirs) { |
| PRINT_DEBUG("tmpfs: %s", tmpfs_dir); |
| if (mount("tmpfs", tmpfs_dir + 1, "tmpfs", |
| MS_NOSUID | MS_NODEV | MS_NOATIME, NULL) < 0) { |
| DIE("mount(tmpfs, %s, tmpfs, MS_NOSUID | MS_NODEV | MS_NOATIME, NULL)", |
| tmpfs_dir + 1); |
| } |
| } |
| |
| // Make sure that our working directory is a mount point. The easiest way to |
| // do this is by bind-mounting it upon itself. |
| PRINT_DEBUG("working dir: %s", opt.working_dir); |
| CreateTarget(opt.working_dir + 1, true); |
| if (mount(opt.working_dir, opt.working_dir + 1, NULL, MS_BIND, NULL) < 0) { |
| DIE("mount(%s, %s, NULL, MS_BIND, NULL)", opt.working_dir, |
| opt.working_dir + 1); |
| } |
| |
| for (const char *bind_mount : opt.bind_mounts) { |
| PRINT_DEBUG("bind mount: %s", bind_mount); |
| CreateTarget(bind_mount + 1, IsDirectory(bind_mount)); |
| if (mount(bind_mount, bind_mount + 1, NULL, MS_BIND, NULL) < 0) { |
| DIE("mount(%s, %s, NULL, MS_BIND, NULL)", bind_mount, bind_mount + 1); |
| } |
| } |
| |
| for (const char *writable_file : opt.writable_files) { |
| PRINT_DEBUG("writable: %s", writable_file); |
| if (mount(writable_file, writable_file + 1, NULL, MS_BIND, NULL) < 0) { |
| DIE("mount(%s, %s, NULL, MS_BIND, NULL)", writable_file, |
| writable_file + 1); |
| } |
| } |
| |
| SetupHelperFiles(); |
| |
| for (const char *inaccessible_file : opt.inaccessible_files) { |
| struct stat sb; |
| if (stat(inaccessible_file, &sb) < 0) { |
| DIE("stat(%s)", inaccessible_file); |
| } |
| |
| if (S_ISDIR(sb.st_mode)) { |
| PRINT_DEBUG("inaccessible dir: %s", inaccessible_file); |
| if (mount(global_inaccessible_directory, inaccessible_file + 1, NULL, |
| MS_BIND, NULL) < 0) { |
| DIE("mount(%s, %s, NULL, MS_BIND, NULL)", global_inaccessible_directory, |
| inaccessible_file + 1); |
| } |
| } else { |
| PRINT_DEBUG("inaccessible file: %s", inaccessible_file); |
| if (mount(global_inaccessible_file, inaccessible_file + 1, NULL, MS_BIND, |
| NULL) < 0) { |
| DIE("mount(%s, %s, NULL, MS_BIND, NULL", global_inaccessible_file, |
| inaccessible_file + 1); |
| } |
| } |
| } |
| } |
| |
| // We later remount everything read-only, except the paths for which this method |
| // returns true. |
| static bool ShouldBeWritable(char *mnt_dir) { |
| mnt_dir += strlen(opt.sandbox_root_dir); |
| |
| if (strcmp(mnt_dir, opt.working_dir) == 0) { |
| return true; |
| } |
| |
| for (const char *writable_file : opt.writable_files) { |
| if (strcmp(mnt_dir, writable_file) == 0) { |
| return true; |
| } |
| } |
| |
| for (const char *tmpfs_dir : opt.tmpfs_dirs) { |
| if (strcmp(mnt_dir, tmpfs_dir) == 0) { |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| static bool IsUnderTmpDir(const char *mnt_dir) { |
| for (const char *tmpfs_dir : opt.tmpfs_dirs) { |
| if (strstr(mnt_dir, tmpfs_dir) == mnt_dir) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // Makes the whole filesystem read-only, except for the paths for which |
| // ShouldBeWritable returns true. |
| static void MakeFilesystemMostlyReadOnly() { |
| FILE *mounts = setmntent("/proc/self/mounts", "r"); |
| if (mounts == NULL) { |
| DIE("setmntent"); |
| } |
| |
| struct mntent *ent; |
| while ((ent = getmntent(mounts)) != NULL) { |
| // Skip mounts that do not belong to our sandbox. |
| if (strstr(ent->mnt_dir, opt.sandbox_root_dir) != ent->mnt_dir) { |
| continue; |
| } |
| // Skip mounts that are under tmpfs directories because we've already |
| // replaced such directories with new tmpfs instances. |
| // mount() would fail with ENOENT if we tried to remount such mount points. |
| if (IsUnderTmpDir(ent->mnt_dir + strlen(opt.sandbox_root_dir))) { |
| continue; |
| } |
| |
| int mountFlags = MS_BIND | MS_REMOUNT; |
| |
| // MS_REMOUNT does not allow us to change certain flags. This means, we have |
| // to first read them out and then pass them in back again. There seems to |
| // be no better way than this (an API for just getting the mount flags of a |
| // mount entry as a bitmask would be great). |
| if (hasmntopt(ent, "nodev") != NULL) { |
| mountFlags |= MS_NODEV; |
| } |
| if (hasmntopt(ent, "noexec") != NULL) { |
| mountFlags |= MS_NOEXEC; |
| } |
| if (hasmntopt(ent, "nosuid") != NULL) { |
| mountFlags |= MS_NOSUID; |
| } |
| if (hasmntopt(ent, "noatime") != NULL) { |
| mountFlags |= MS_NOATIME; |
| } |
| if (hasmntopt(ent, "nodiratime") != NULL) { |
| mountFlags |= MS_NODIRATIME; |
| } |
| if (hasmntopt(ent, "relatime") != NULL) { |
| mountFlags |= MS_RELATIME; |
| } |
| |
| if (!ShouldBeWritable(ent->mnt_dir)) { |
| mountFlags |= MS_RDONLY; |
| } |
| |
| PRINT_DEBUG("remount %s: %s", (mountFlags & MS_RDONLY) ? "ro" : "rw", |
| ent->mnt_dir); |
| if (mount(NULL, ent->mnt_dir, NULL, mountFlags, NULL) < 0) { |
| // If we get EACCES, this might be a mount-point for which we don't have |
| // read access. Not much we can do about this, but it also won't do any |
| // harm, so let's go on. The same goes for EINVAL, which is fired in case |
| // a later mount overlaps an earlier mount, e.g. consider the case of |
| // /proc, /proc/sys/fs/binfmt_misc and /proc, with the latter /proc being |
| // the one that an outer sandbox has mounted on top of its parent /proc. |
| // In that case, we're not allowed to remount /proc/sys/fs/binfmt_misc, |
| // because it is hidden. |
| if (errno != EACCES && errno != EINVAL) { |
| DIE("remount(NULL, %s, NULL, %d, NULL)", ent->mnt_dir, mountFlags); |
| } |
| } |
| } |
| |
| endmntent(mounts); |
| } |
| |
| static void MountProc() { |
| // Mount a new proc on top of the old one, because the old one still refers to |
| // our parent PID namespace. |
| if (mount("proc", "proc", "proc", MS_NODEV | MS_NOEXEC | MS_NOSUID, NULL) < |
| 0) { |
| DIE("mount"); |
| } |
| } |
| |
| static void SetupNetworking() { |
| // When running in a separate network namespace, enable the loopback interface |
| // because some application may want to use it. |
| if (opt.create_netns) { |
| int fd; |
| fd = socket(AF_INET, SOCK_DGRAM, 0); |
| if (fd < 0) { |
| DIE("socket"); |
| } |
| |
| struct ifreq ifr; |
| memset(&ifr, 0, sizeof(ifr)); |
| strncpy(ifr.ifr_name, "lo", IF_NAMESIZE); |
| |
| // Verify that name is valid. |
| if (if_nametoindex(ifr.ifr_name) == 0) { |
| DIE("if_nametoindex"); |
| } |
| |
| // Enable the interface. |
| ifr.ifr_flags |= IFF_UP; |
| if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0) { |
| DIE("ioctl"); |
| } |
| |
| if (close(fd) < 0) { |
| DIE("close"); |
| } |
| } |
| } |
| |
| static void EnterSandbox() { |
| // Move the real root to old_root, then detach it. |
| char old_root[] = "tmp/old-root-XXXXXX"; |
| if (mkdtemp(old_root) == NULL) { |
| DIE("mkdtemp(%s)", old_root); |
| } |
| |
| // pivot_root has no wrapper in libc, so we need syscall() |
| if (syscall(SYS_pivot_root, ".", old_root) < 0) { |
| DIE("pivot_root(., %s)", old_root); |
| } |
| |
| if (chroot(".") < 0) { |
| DIE("chroot(.)"); |
| } |
| |
| if (umount2(old_root, MNT_DETACH) < 0) { |
| DIE("umount2(%s, MNT_DETACH)", old_root); |
| } |
| |
| if (rmdir(old_root) < 0) { |
| DIE("rmdir(%s)", old_root); |
| } |
| |
| if (chdir(opt.working_dir) < 0) { |
| DIE("chdir(%s)", opt.working_dir); |
| } |
| } |
| |
| static void InstallSignalHandler(int signum, void (*handler)(int)) { |
| struct sigaction sa; |
| memset(&sa, 0, sizeof(sa)); |
| sa.sa_handler = handler; |
| if (handler == SIG_IGN || handler == SIG_DFL) { |
| // No point in blocking signals when using the default handler or ignoring |
| // the signal. |
| if (sigemptyset(&sa.sa_mask) < 0) { |
| DIE("sigemptyset"); |
| } |
| } else { |
| // When using a custom handler, block all signals from firing while the |
| // handler is running. |
| if (sigfillset(&sa.sa_mask) < 0) { |
| DIE("sigfillset"); |
| } |
| } |
| // sigaction may fail for certain reserved signals. Ignore failure in this |
| // case, but report it in debug mode, just in case. |
| if (sigaction(signum, &sa, NULL) < 0) { |
| PRINT_DEBUG("sigaction(%d, &sa, NULL) failed", signum); |
| } |
| } |
| |
| static void IgnoreSignal(int signum) { InstallSignalHandler(signum, SIG_IGN); } |
| |
| // Reset the signal mask and restore the default handler for all signals. |
| static void RestoreSignalHandlersAndMask() { |
| // Use an empty signal mask for the process (= unblock all signals). |
| sigset_t empty_set; |
| if (sigemptyset(&empty_set) < 0) { |
| DIE("sigemptyset"); |
| } |
| if (sigprocmask(SIG_SETMASK, &empty_set, nullptr) < 0) { |
| DIE("sigprocmask(SIG_SETMASK, <empty set>, nullptr)"); |
| } |
| |
| // Set the default signal handler for all signals. |
| struct sigaction sa; |
| memset(&sa, 0, sizeof(sa)); |
| if (sigemptyset(&sa.sa_mask) < 0) { |
| DIE("sigemptyset"); |
| } |
| sa.sa_handler = SIG_DFL; |
| for (int i = 1; i < NSIG; ++i) { |
| // Ignore possible errors, because we might not be allowed to set the |
| // handler for certain signals, but we still want to try. |
| sigaction(i, &sa, nullptr); |
| } |
| } |
| |
| static void ForwardSignal(int signum) { |
| PRINT_DEBUG("ForwardSignal(%d)", signum); |
| kill(-global_child_pid, signum); |
| } |
| |
| static void SetupSignalHandlers() { |
| RestoreSignalHandlersAndMask(); |
| |
| for (int signum = 1; signum < NSIG; signum++) { |
| switch (signum) { |
| // Some signals should indeed kill us and not be forwarded to the child, |
| // thus we can use the default handler. |
| case SIGABRT: |
| case SIGBUS: |
| case SIGFPE: |
| case SIGILL: |
| case SIGSEGV: |
| case SIGSYS: |
| case SIGTRAP: |
| break; |
| // It's fine to use the default handler for SIGCHLD, because we use |
| // waitpid() in the main loop to wait for children to die anyway. |
| case SIGCHLD: |
| break; |
| // One does not simply install a signal handler for these two signals |
| case SIGKILL: |
| case SIGSTOP: |
| break; |
| // Ignore SIGTTIN and SIGTTOU, as we hand off the terminal to the child in |
| // SpawnChild(). |
| case SIGTTIN: |
| case SIGTTOU: |
| IgnoreSignal(signum); |
| break; |
| // All other signals should be forwarded to the child. |
| default: |
| InstallSignalHandler(signum, ForwardSignal); |
| break; |
| } |
| } |
| } |
| |
| static void SpawnChild() { |
| global_child_pid = fork(); |
| |
| if (global_child_pid < 0) { |
| DIE("fork()"); |
| } else if (global_child_pid == 0) { |
| // Put the child into its own process group. |
| if (setpgid(0, 0) < 0) { |
| DIE("setpgid"); |
| } |
| |
| // Try to assign our terminal to the child process. |
| if (tcsetpgrp(STDIN_FILENO, getpgrp()) < 0 && errno != ENOTTY) { |
| DIE("tcsetpgrp") |
| } |
| |
| // Unblock all signals, restore default handlers. |
| RestoreSignalHandlersAndMask(); |
| |
| // Force umask to include read and execute for everyone, to make output |
| // permissions predictable. |
| umask(022); |
| |
| // argv[] passed to execve() must be a null-terminated array. |
| opt.args.push_back(nullptr); |
| |
| if (execvp(opt.args[0], opt.args.data()) < 0) { |
| DIE("execvp(%s, %p)", opt.args[0], opt.args.data()); |
| } |
| } |
| } |
| |
| static void WaitForChild() { |
| while (1) { |
| // Check for zombies to be reaped and exit, if our own child exited. |
| int status; |
| pid_t killed_pid = waitpid(-1, &status, 0); |
| PRINT_DEBUG("waitpid returned %d", killed_pid); |
| |
| if (killed_pid < 0) { |
| // Our PID1 process got a signal that interrupted the waitpid() call and |
| // that was either ignored or forwared to the child. This is expected & |
| // fine, just continue waiting. |
| if (errno == EINTR) { |
| continue; |
| } |
| DIE("waitpid") |
| } else { |
| if (killed_pid == global_child_pid) { |
| // If the child process we spawned earlier terminated, we'll also |
| // terminate. We can simply _exit() here, because the Linux kernel will |
| // kindly SIGKILL all remaining processes in our PID namespace once we |
| // exit. |
| if (WIFSIGNALED(status)) { |
| PRINT_DEBUG("child died due to signal %d", WTERMSIG(status)); |
| _exit(128 + WTERMSIG(status)); |
| } else { |
| PRINT_DEBUG("child exited with code %d", WEXITSTATUS(status)); |
| _exit(WEXITSTATUS(status)); |
| } |
| } |
| } |
| } |
| } |
| |
| int Pid1Main(void *sync_pipe_param) { |
| if (getpid() != 1) { |
| DIE("Using PID namespaces, but we are not PID 1"); |
| } |
| |
| SetupSelfDestruction(reinterpret_cast<int *>(sync_pipe_param)); |
| SetupMountNamespace(); |
| SetupUserNamespace(); |
| SetupUtsNamespace(); |
| MountFilesystems(); |
| MakeFilesystemMostlyReadOnly(); |
| MountProc(); |
| SetupNetworking(); |
| EnterSandbox(); |
| SetupSignalHandlers(); |
| SpawnChild(); |
| WaitForChild(); |
| _exit(EXIT_FAILURE); |
| } |