| // Copyright 2016 The Bazel Authors. All rights reserved. | 
 | // | 
 | // Licensed under the Apache License, Version 2.0 (the "License"); | 
 | // you may not use this file except in compliance with the License. | 
 | // You may obtain a copy of the License at | 
 | // | 
 | //    http://www.apache.org/licenses/LICENSE-2.0 | 
 | // | 
 | // Unless required by applicable law or agreed to in writing, software | 
 | // distributed under the License is distributed on an "AS IS" BASIS, | 
 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
 | // See the License for the specific language governing permissions and | 
 | // limitations under the License. | 
 |  | 
 | /** | 
 |  * This is PID 1 inside the sandbox environment and runs in a separate user, | 
 |  * mount, UTS, IPC and PID namespace. | 
 |  */ | 
 |  | 
 | #include "src/main/tools/linux-sandbox-pid1.h" | 
 |  | 
 | #include <errno.h> | 
 | #include <fcntl.h> | 
 | #include <libgen.h> | 
 | #include <math.h> | 
 | #include <mntent.h> | 
 | #include <net/if.h> | 
 | #include <pwd.h> | 
 | #include <signal.h> | 
 | #include <stdarg.h> | 
 | #include <stdbool.h> | 
 | #include <stdio.h> | 
 | #include <stdlib.h> | 
 | #include <string.h> | 
 | #include <sys/ioctl.h> | 
 | #include <sys/mount.h> | 
 | #include <sys/prctl.h> | 
 | #include <sys/stat.h> | 
 | #include <sys/syscall.h> | 
 | #include <sys/types.h> | 
 | #include <sys/wait.h> | 
 | #include <unistd.h> | 
 |  | 
 | #include <string> | 
 |  | 
 | #ifndef MS_REC | 
 | // Some systems do not define MS_REC in sys/mount.h. We might be able to grab it | 
 | // from linux/fs.h instead (cf. #2667). | 
 | #include <linux/fs.h> | 
 | #endif | 
 |  | 
 | #ifndef TEMP_FAILURE_RETRY | 
 | // Some C standard libraries like musl do not define this macro, so we'll | 
 | // include our own version for compatibility. | 
 | #define TEMP_FAILURE_RETRY(exp)            \ | 
 |   ({                                       \ | 
 |     decltype(exp) _rc;                     \ | 
 |     do {                                   \ | 
 |       _rc = (exp);                         \ | 
 |     } while (_rc == -1 && errno == EINTR); \ | 
 |     _rc;                                   \ | 
 |   }) | 
 | #endif  // TEMP_FAILURE_RETRY | 
 |  | 
 | #include "src/main/tools/linux-sandbox-options.h" | 
 | #include "src/main/tools/linux-sandbox.h" | 
 | #include "src/main/tools/logging.h" | 
 | #include "src/main/tools/process-tools.h" | 
 |  | 
 | static void WriteFile(const std::string &filename, const char *fmt, ...) { | 
 |   FILE *stream = fopen(filename.c_str(), "w"); | 
 |   if (stream == nullptr) { | 
 |     DIE("fopen(%s)", filename.c_str()); | 
 |   } | 
 |  | 
 |   va_list ap; | 
 |   va_start(ap, fmt); | 
 |   int r = vfprintf(stream, fmt, ap); | 
 |   va_end(ap); | 
 |  | 
 |   if (r < 0) { | 
 |     DIE("vfprintf"); | 
 |   } | 
 |  | 
 |   if (fclose(stream) != 0) { | 
 |     DIE("fclose(%s)", filename.c_str()); | 
 |   } | 
 | } | 
 |  | 
 | static int global_child_pid; | 
 |  | 
 | // Helper methods | 
 | static void CreateFile(const char *path) { | 
 |   int handle = open(path, O_CREAT | O_WRONLY | O_EXCL, 0666); | 
 |   if (handle < 0) { | 
 |     DIE("open"); | 
 |   } | 
 |   if (close(handle) < 0) { | 
 |     DIE("close"); | 
 |   } | 
 | } | 
 |  | 
 | // Creates an empty file at 'path' by hard linking it from a known empty file. | 
 | // This is over two times faster than creating empty files via open() on | 
 | // certain filesystems (e.g. XFS). | 
 | static void LinkFile(const char *path) { | 
 |   if (link("tmp/empty_file", path) < 0) { | 
 |     DIE("link %s", path); | 
 |   } | 
 | } | 
 |  | 
 | // Recursively creates the file or directory specified in "path" and its parent | 
 | // directories. | 
 | // Return -1 on failure and sets errno to: | 
 | //    EINVAL   path is null | 
 | //    ENOTDIR  path exists and is not a directory | 
 | //    EEXIST   path exists and is a directory | 
 | //    ENOENT   stat call with the path failed | 
 | static int CreateTarget(const char *path, bool is_directory) { | 
 |   if (path == NULL) { | 
 |     errno = EINVAL; | 
 |     return -1; | 
 |   } | 
 |  | 
 |   struct stat sb; | 
 |   // If the path already exists... | 
 |  | 
 |   if (stat(path, &sb) == 0) { | 
 |     if (is_directory && S_ISDIR(sb.st_mode)) { | 
 |       // and it's a directory and supposed to be a directory, we're done here. | 
 |       return 0; | 
 |     } else if (!is_directory && S_ISREG(sb.st_mode)) { | 
 |       // and it's a regular file and supposed to be one, we're done here. | 
 |       return 0; | 
 |     } else { | 
 |       // otherwise something is really wrong. | 
 |       errno = is_directory ? ENOTDIR : EEXIST; | 
 |       return -1; | 
 |     } | 
 |   } else { | 
 |     // If stat failed because of any error other than "the path does not exist", | 
 |     // this is an error. | 
 |     if (errno != ENOENT) { | 
 |       return -1; | 
 |     } | 
 |   } | 
 |  | 
 |   // Create the parent directory. | 
 |   if (CreateTarget(dirname(strdupa(path)), true) < 0) { | 
 |     DIE("CreateTarget %s", dirname(strdupa(path))); | 
 |   } | 
 |  | 
 |   if (is_directory) { | 
 |     if (mkdir(path, 0755) < 0) { | 
 |       DIE("mkdir"); | 
 |     } | 
 |   } else { | 
 |     LinkFile(path); | 
 |   } | 
 |  | 
 |   return 0; | 
 | } | 
 |  | 
 | static void SetupSelfDestruction(int *sync_pipe) { | 
 |   // We could also poll() on the pipe fd to find out when the parent goes away, | 
 |   // and rely on SIGCHLD interrupting that otherwise. That might require us to | 
 |   // install some trivial handler for SIGCHLD. Using O_ASYNC to turn the pipe | 
 |   // close into SIGIO may also work. Another option is signalfd, although that's | 
 |   // almost as obscure as this prctl. | 
 |   if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) { | 
 |     DIE("prctl"); | 
 |   } | 
 |  | 
 |   // Switch to a new process group, otherwise our process group will still refer | 
 |   // to the outer PID namespace. We might then accidentally kill our parent by a | 
 |   // call to e.g. `kill(0, sig)`. | 
 |   if (setpgid(0, 0) < 0) { | 
 |     DIE("setpgid"); | 
 |   } | 
 |  | 
 |   // Verify that the parent still lives. | 
 |   char buf = 0; | 
 |   if (close(sync_pipe[0]) < 0) { | 
 |     DIE("close"); | 
 |   } | 
 |   if (write(sync_pipe[1], &buf, 1) < 0) { | 
 |     DIE("write"); | 
 |   } | 
 |   if (close(sync_pipe[1]) < 0) { | 
 |     DIE("close"); | 
 |   } | 
 | } | 
 |  | 
 | static void SetupMountNamespace() { | 
 |   // Fully isolate our mount namespace private from outside events, so that | 
 |   // mounts in the outside environment do not affect our sandbox. | 
 |   if (mount(nullptr, "/", nullptr, MS_REC | MS_PRIVATE, nullptr) < 0) { | 
 |     DIE("mount"); | 
 |   } | 
 | } | 
 |  | 
 | static void SetupUserNamespace() { | 
 |   // Disable needs for CAP_SETGID. | 
 |   struct stat sb; | 
 |   if (stat("/proc/self/setgroups", &sb) == 0) { | 
 |     WriteFile("/proc/self/setgroups", "deny"); | 
 |   } else { | 
 |     // Ignore ENOENT, because older Linux versions do not have this file (but | 
 |     // also do not require writing to it). | 
 |     if (errno != ENOENT) { | 
 |       DIE("stat(/proc/self/setgroups"); | 
 |     } | 
 |   } | 
 |  | 
 |   uid_t inner_uid; | 
 |   gid_t inner_gid; | 
 |   if (opt.fake_root) { | 
 |     // Change our username to 'root'. | 
 |     inner_uid = 0; | 
 |     inner_gid = 0; | 
 |   } else if (opt.fake_username) { | 
 |     // Change our username to 'nobody'. | 
 |     struct passwd *pwd = getpwnam("nobody"); | 
 |     if (pwd == nullptr) { | 
 |       DIE("unable to find passwd entry for user nobody") | 
 |     } | 
 |  | 
 |     inner_uid = pwd->pw_uid; | 
 |     inner_gid = pwd->pw_gid; | 
 |   } else { | 
 |     // Do not change the username inside the sandbox. | 
 |     inner_uid = global_outer_uid; | 
 |     inner_gid = global_outer_gid; | 
 |   } | 
 |   WriteFile("/proc/self/uid_map", "%u %u 1\n", inner_uid, global_outer_uid); | 
 |   WriteFile("/proc/self/gid_map", "%u %u 1\n", inner_gid, global_outer_gid); | 
 | } | 
 |  | 
 | static void SetupUtsNamespace() { | 
 |   if (sethostname("localhost", 9) < 0) { | 
 |     DIE("sethostname"); | 
 |   } | 
 |  | 
 |   if (setdomainname("localdomain", 11) < 0) { | 
 |     DIE("setdomainname"); | 
 |   } | 
 | } | 
 |  | 
 | static void MountFilesystems() { | 
 |   for (const std::string &tmpfs_dir : opt.tmpfs_dirs) { | 
 |     PRINT_DEBUG("tmpfs: %s", tmpfs_dir.c_str()); | 
 |     if (mount("tmpfs", tmpfs_dir.c_str(), "tmpfs", | 
 |               MS_NOSUID | MS_NODEV | MS_NOATIME, nullptr) < 0) { | 
 |       DIE("mount(tmpfs, %s, tmpfs, MS_NOSUID | MS_NODEV | MS_NOATIME, nullptr)", | 
 |           tmpfs_dir.c_str()); | 
 |     } | 
 |   } | 
 |  | 
 |   // Make sure that our working directory is a mount point. The easiest way to | 
 |   // do this is by bind-mounting it upon itself. | 
 |   PRINT_DEBUG("working dir: %s", opt.working_dir.c_str()); | 
 |  | 
 |   // An attempt to mount the sandbox in tmpfs will always fail, so this block is | 
 |   // slightly redundant with the next mount() check, but dumping the mount() | 
 |   // syscall is incredibly cryptic, so we explicitly check against and warn | 
 |   // about attempts to use tmpfs. | 
 |   for (const std::string &tmpfs_dir : opt.tmpfs_dirs) { | 
 |     if (opt.working_dir.find(tmpfs_dir) == 0) { | 
 |       DIE("The sandbox working directory cannot be below a path where we mount " | 
 |           "tmpfs (you requested mounting %s in %s). Is your --output_base= " | 
 |           "below one of your --sandbox_tmpfs_path values?", | 
 |           opt.working_dir.c_str(), tmpfs_dir.c_str()); | 
 |     } | 
 |   } | 
 |  | 
 |   if (mount(opt.working_dir.c_str(), opt.working_dir.c_str(), nullptr, MS_BIND, | 
 |             nullptr) < 0) { | 
 |     DIE("mount(%s, %s, nullptr, MS_BIND, nullptr)", opt.working_dir.c_str(), | 
 |         opt.working_dir.c_str()); | 
 |   } | 
 |  | 
 |   for (size_t i = 0; i < opt.bind_mount_sources.size(); i++) { | 
 |     const std::string &source = opt.bind_mount_sources.at(i); | 
 |     const std::string &target = opt.bind_mount_targets.at(i); | 
 |     PRINT_DEBUG("bind mount: %s -> %s", source.c_str(), target.c_str()); | 
 |     if (mount(source.c_str(), target.c_str(), nullptr, MS_BIND, nullptr) < 0) { | 
 |       DIE("mount(%s, %s, nullptr, MS_BIND, nullptr)", source.c_str(), | 
 |           target.c_str()); | 
 |     } | 
 |   } | 
 |  | 
 |   for (const std::string &writable_file : opt.writable_files) { | 
 |     PRINT_DEBUG("writable: %s", writable_file.c_str()); | 
 |     if (mount(writable_file.c_str(), writable_file.c_str(), nullptr, | 
 |               MS_BIND | MS_REC, nullptr) < 0) { | 
 |       DIE("mount(%s, %s, nullptr, MS_BIND | MS_REC, nullptr)", | 
 |           writable_file.c_str(), writable_file.c_str()); | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | // We later remount everything read-only, except the paths for which this method | 
 | // returns true. | 
 | static bool ShouldBeWritable(const std::string &mnt_dir) { | 
 |   if (mnt_dir == opt.working_dir) { | 
 |     return true; | 
 |   } | 
 |  | 
 |   for (const std::string &writable_file : opt.writable_files) { | 
 |     if (mnt_dir == writable_file) { | 
 |       return true; | 
 |     } | 
 |   } | 
 |  | 
 |   for (const std::string &tmpfs_dir : opt.tmpfs_dirs) { | 
 |     if (mnt_dir == tmpfs_dir) { | 
 |       return true; | 
 |     } | 
 |   } | 
 |  | 
 |   return false; | 
 | } | 
 |  | 
 | // Makes the whole filesystem read-only, except for the paths for which | 
 | // ShouldBeWritable returns true. | 
 | static void MakeFilesystemMostlyReadOnly() { | 
 |   FILE *mounts = setmntent("/proc/self/mounts", "r"); | 
 |   if (mounts == nullptr) { | 
 |     DIE("setmntent"); | 
 |   } | 
 |  | 
 |   struct mntent *ent; | 
 |   while ((ent = getmntent(mounts)) != nullptr) { | 
 |     int mountFlags = MS_BIND | MS_REMOUNT; | 
 |  | 
 |     // MS_REMOUNT does not allow us to change certain flags. This means, we have | 
 |     // to first read them out and then pass them in back again. There seems to | 
 |     // be no better way than this (an API for just getting the mount flags of a | 
 |     // mount entry as a bitmask would be great). | 
 |     if (hasmntopt(ent, "nodev") != nullptr) { | 
 |       mountFlags |= MS_NODEV; | 
 |     } | 
 |     if (hasmntopt(ent, "noexec") != nullptr) { | 
 |       mountFlags |= MS_NOEXEC; | 
 |     } | 
 |     if (hasmntopt(ent, "nosuid") != nullptr) { | 
 |       mountFlags |= MS_NOSUID; | 
 |     } | 
 |     if (hasmntopt(ent, "noatime") != nullptr) { | 
 |       mountFlags |= MS_NOATIME; | 
 |     } | 
 |     if (hasmntopt(ent, "nodiratime") != nullptr) { | 
 |       mountFlags |= MS_NODIRATIME; | 
 |     } | 
 |     if (hasmntopt(ent, "relatime") != nullptr) { | 
 |       mountFlags |= MS_RELATIME; | 
 |     } | 
 |  | 
 |     if (!ShouldBeWritable(ent->mnt_dir)) { | 
 |       mountFlags |= MS_RDONLY; | 
 |     } | 
 |  | 
 |     PRINT_DEBUG("remount %s: %s", (mountFlags & MS_RDONLY) ? "ro" : "rw", | 
 |                 ent->mnt_dir); | 
 |     if (mount(nullptr, ent->mnt_dir, nullptr, mountFlags, nullptr) < 0) { | 
 |       // If we get EACCES or EPERM, this might be a mount-point for which we | 
 |       // don't have read access. Not much we can do about this, but it also | 
 |       // won't do any harm, so let's go on. The same goes for EINVAL or ENOENT, | 
 |       // which are fired in case a later mount overlaps an earlier mount, e.g. | 
 |       // consider the case of /proc, /proc/sys/fs/binfmt_misc and /proc, with | 
 |       // the latter /proc being the one that an outer sandbox has mounted on | 
 |       // top of its parent /proc. In that case, we're not allowed to remount | 
 |       // /proc/sys/fs/binfmt_misc, because it is hidden. If we get ESTALE, the | 
 |       // mount is a broken NFS mount. In the ideal case, the user would either | 
 |       // fix or remove that mount, but in cases where that's not possible, we | 
 |       // should just ignore it. | 
 |       switch (errno) { | 
 |         case EACCES: | 
 |         case EPERM: | 
 |         case EINVAL: | 
 |         case ENOENT: | 
 |         case ESTALE: | 
 |           PRINT_DEBUG( | 
 |               "remount(nullptr, %s, nullptr, %d, nullptr) failure (%m) ignored", | 
 |               ent->mnt_dir, mountFlags); | 
 |           break; | 
 |         default: | 
 |           DIE("remount(nullptr, %s, nullptr, %d, nullptr)", ent->mnt_dir, | 
 |               mountFlags); | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |   endmntent(mounts); | 
 | } | 
 |  | 
 | static void MountProc() { | 
 |   // Mount a new proc on top of the old one, because the old one still refers to | 
 |   // our parent PID namespace. | 
 |   if (mount("/proc", "/proc", "proc", MS_NODEV | MS_NOEXEC | MS_NOSUID, | 
 |             nullptr) < 0) { | 
 |     DIE("mount"); | 
 |   } | 
 | } | 
 |  | 
 | static void SetupNetworking() { | 
 |   // When running in a separate network namespace, enable the loopback interface | 
 |   // because some application may want to use it. | 
 |   if (opt.create_netns) { | 
 |     int fd; | 
 |     fd = socket(AF_INET, SOCK_DGRAM, 0); | 
 |     if (fd < 0) { | 
 |       DIE("socket"); | 
 |     } | 
 |  | 
 |     struct ifreq ifr = {}; | 
 |     strncpy(ifr.ifr_name, "lo", IF_NAMESIZE); | 
 |  | 
 |     // Verify that name is valid. | 
 |     if (if_nametoindex(ifr.ifr_name) == 0) { | 
 |       DIE("if_nametoindex"); | 
 |     } | 
 |  | 
 |     // Enable the interface. | 
 |     ifr.ifr_flags |= IFF_UP; | 
 |     if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0) { | 
 |       DIE("ioctl"); | 
 |     } | 
 |  | 
 |     if (close(fd) < 0) { | 
 |       DIE("close"); | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | static void EnterWorkingDirectory() { | 
 |   std::string path = opt.working_dir; | 
 |   if (opt.hermetic) { | 
 |     path = path.substr(opt.sandbox_root.size() + 1); | 
 |   } | 
 |  | 
 |   if (chdir(path.c_str()) < 0) { | 
 |     DIE("chdir(%s)", path.c_str()); | 
 |   } | 
 | } | 
 |  | 
 | static void ForwardSignal(int signum) { | 
 |   kill(-global_child_pid, signum); | 
 | } | 
 |  | 
 | static void SpawnChild() { | 
 |   PRINT_DEBUG("calling fork..."); | 
 |   global_child_pid = fork(); | 
 |  | 
 |   if (global_child_pid < 0) { | 
 |     DIE("fork()"); | 
 |   } else if (global_child_pid == 0) { | 
 |     // Put the child into its own process group. | 
 |     if (setpgid(0, 0) < 0) { | 
 |       DIE("setpgid"); | 
 |     } | 
 |  | 
 |     // Try to assign our terminal to the child process. | 
 |     if (tcsetpgrp(STDIN_FILENO, getpgrp()) < 0 && errno != ENOTTY) { | 
 |       DIE("tcsetpgrp"); | 
 |     } | 
 |  | 
 |     // Unblock all signals, restore default handlers. | 
 |     ClearSignalMask(); | 
 |  | 
 |     // Force umask to include read and execute for everyone, to make output | 
 |     // permissions predictable. | 
 |     umask(022); | 
 |  | 
 |     // argv[] passed to execve() must be a null-terminated array. | 
 |     opt.args.push_back(nullptr); | 
 |  | 
 |     if (execvp(opt.args[0], opt.args.data()) < 0) { | 
 |       DIE("execvp(%s, %p)", opt.args[0], opt.args.data()); | 
 |     } | 
 |   } else { | 
 |     PRINT_DEBUG("child started with PID %d", global_child_pid); | 
 |   } | 
 | } | 
 |  | 
 | static int WaitForChild() { | 
 |   while (true) { | 
 |     // Wait for some process to exit. This includes reparented processes in our | 
 |     // PID namespace. | 
 |     int status; | 
 |     const pid_t pid = TEMP_FAILURE_RETRY(wait(&status)); | 
 |  | 
 |     if (pid < 0) { | 
 |       // We don't expect any errors besides EINTR. In particular, ECHILD should | 
 |       // be impossible because we haven't yet seen global_child_pid exit. | 
 |       DIE("wait"); | 
 |     } | 
 |  | 
 |     PRINT_DEBUG("wait returned pid=%d, status=0x%02x", pid, status); | 
 |  | 
 |     // If this isn't our child's PID, there's nothing further to do; we've | 
 |     // successfully reaped a zombie. | 
 |     if (pid != global_child_pid) { | 
 |       continue; | 
 |     } | 
 |  | 
 |     // If the child exited due to a signal, log that fact and exit with the same | 
 |     // status. | 
 |     if (WIFSIGNALED(status)) { | 
 |       const int signal = WTERMSIG(status); | 
 |       PRINT_DEBUG("child exited due to signal %d", WTERMSIG(status)); | 
 |       return 128 + signal; | 
 |     } | 
 |  | 
 |     // Otherwise it must have exited normally. | 
 |     const int exit_code = WEXITSTATUS(status); | 
 |     PRINT_DEBUG("child exited normally with code %d", exit_code); | 
 |     return exit_code; | 
 |   } | 
 | } | 
 |  | 
 | static void MountSandboxAndGoThere() { | 
 |   if (mount(opt.sandbox_root.c_str(), opt.sandbox_root.c_str(), nullptr, | 
 |             MS_BIND | MS_NOSUID, nullptr) < 0) { | 
 |     DIE("mount"); | 
 |   } | 
 |   if (chdir(opt.sandbox_root.c_str()) < 0) { | 
 |     DIE("chdir(%s)", opt.sandbox_root.c_str()); | 
 |   } | 
 | } | 
 |  | 
 | static void CreateEmptyFile() { | 
 |   // This is used as the base for bind mounting. | 
 |   if (CreateTarget("tmp", true) < 0) { | 
 |     DIE("CreateTarget tmp") | 
 |   } | 
 |   CreateFile("tmp/empty_file"); | 
 | } | 
 |  | 
 | static void MountDev() { | 
 |   if (CreateTarget("dev", true) < 0) { | 
 |     DIE("CreateTarget /dev"); | 
 |   } | 
 |   const char *devs[] = {"/dev/null", "/dev/random", "/dev/urandom", "/dev/zero", | 
 |                         NULL}; | 
 |   for (int i = 0; devs[i] != NULL; i++) { | 
 |     LinkFile(devs[i] + 1); | 
 |     if (mount(devs[i], devs[i] + 1, NULL, MS_BIND, NULL) < 0) { | 
 |       DIE("mount"); | 
 |     } | 
 |   } | 
 |   if (symlink("/proc/self/fd", "dev/fd") < 0) { | 
 |     DIE("symlink"); | 
 |   } | 
 | } | 
 |  | 
 | static void MountAllMounts() { | 
 |   for (const std::string &tmpfs_dir : opt.tmpfs_dirs) { | 
 |     PRINT_DEBUG("tmpfs: %s", tmpfs_dir.c_str()); | 
 |     if (mount("tmpfs", tmpfs_dir.c_str(), "tmpfs", | 
 |               MS_NOSUID | MS_NODEV | MS_NOATIME, nullptr) < 0) { | 
 |       DIE("mount(tmpfs, %s, tmpfs, MS_NOSUID | MS_NODEV | MS_NOATIME, nullptr)", | 
 |           tmpfs_dir.c_str()); | 
 |     } | 
 |   } | 
 |  | 
 |   // Make sure that our working directory is a mount point. The easiest way to | 
 |   // do this is by bind-mounting it upon itself. | 
 |   if (mount(opt.working_dir.c_str(), opt.working_dir.c_str(), nullptr, MS_BIND, | 
 |             nullptr) < 0) { | 
 |     DIE("mount(%s, %s, nullptr, MS_BIND, nullptr)", opt.working_dir.c_str(), | 
 |         opt.working_dir.c_str()); | 
 |   } | 
 |   for (int i = 0; i < (signed)opt.bind_mount_sources.size(); i++) { | 
 |     if (opt.debug) { | 
 |       if (strcmp(opt.bind_mount_sources[i].c_str(), | 
 |                  opt.bind_mount_targets[i].c_str()) == 0) { | 
 |         // The file is mounted to the same path inside the sandbox, as outside | 
 |         // (e.g. /home/user -> <sandbox>/home/user), so we'll just show a | 
 |         // simplified version of the mount command. | 
 |         PRINT_DEBUG("mount: %s\n", opt.bind_mount_sources[i].c_str()); | 
 |       } else { | 
 |         // The file is mounted to a custom location inside the sandbox. | 
 |         // Create a user-friendly string for the sandboxed path and show it. | 
 |         const std::string user_friendly_mount_target("<sandbox>" + | 
 |                                                      opt.bind_mount_targets[i]); | 
 |         PRINT_DEBUG("mount: %s -> %s\n", opt.bind_mount_sources[i].c_str(), | 
 |                     user_friendly_mount_target.c_str()); | 
 |       } | 
 |     } | 
 |     const std::string full_sandbox_path(opt.sandbox_root + | 
 |                                         opt.bind_mount_targets[i]); | 
 |  | 
 |     struct stat sb; | 
 |     if (stat(opt.bind_mount_sources[i].c_str(), &sb) < 0) { | 
 |       DIE("stat"); | 
 |     } | 
 |     bool IsDirectory = S_ISDIR(sb.st_mode); | 
 |     if (CreateTarget(full_sandbox_path.c_str(), IsDirectory) < 0) { | 
 |       DIE("CreateTarget %s", full_sandbox_path.c_str()); | 
 |     } | 
 |     int result = | 
 |         mount(opt.bind_mount_sources[i].c_str(), full_sandbox_path.c_str(), | 
 |               NULL, MS_REC | MS_BIND | MS_RDONLY, NULL); | 
 |     if (result != 0) { | 
 |       DIE("mount"); | 
 |     } | 
 |   } | 
 |   for (const std::string &writable_file : opt.writable_files) { | 
 |     PRINT_DEBUG("writable: %s", writable_file.c_str()); | 
 |     if (mount(writable_file.c_str(), writable_file.c_str(), nullptr, | 
 |               MS_BIND | MS_REC, nullptr) < 0) { | 
 |       DIE("mount(%s, %s, nullptr, MS_BIND | MS_REC, nullptr)", | 
 |           writable_file.c_str(), writable_file.c_str()); | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | static void ChangeRoot() { | 
 |   // move the real root to old_root, then detach it | 
 |   char old_root[16] = "old-root-XXXXXX"; | 
 |   if (mkdtemp(old_root) == NULL) { | 
 |     perror("mkdtemp"); | 
 |     DIE("mkdtemp returned NULL\n"); | 
 |   } | 
 |   // pivot_root has no wrapper in libc, so we need syscall() | 
 |   if (syscall(SYS_pivot_root, ".", old_root) < 0) { | 
 |     DIE("syscall"); | 
 |   } | 
 |   if (chroot(".") < 0) { | 
 |     DIE("chroot"); | 
 |   } | 
 |   if (umount2(old_root, MNT_DETACH) < 0) { | 
 |     DIE("umount2"); | 
 |   } | 
 |   if (rmdir(old_root) < 0) { | 
 |     DIE("rmdir"); | 
 |   } | 
 | } | 
 |  | 
 | int Pid1Main(void *sync_pipe_param) { | 
 |   PRINT_DEBUG("Pid1Main started"); | 
 |  | 
 |   if (getpid() != 1) { | 
 |     DIE("Using PID namespaces, but we are not PID 1"); | 
 |   } | 
 |  | 
 |   // Start with default signal handlers and an empty signal mask. | 
 |   ClearSignalMask(); | 
 |  | 
 |   SetupSelfDestruction(reinterpret_cast<int *>(sync_pipe_param)); | 
 |  | 
 |   // Sandbox ourselves. | 
 |   SetupMountNamespace(); | 
 |   SetupUserNamespace(); | 
 |   if (opt.fake_hostname) { | 
 |     SetupUtsNamespace(); | 
 |   } | 
 |  | 
 |   if (opt.hermetic) { | 
 |     MountSandboxAndGoThere(); | 
 |     CreateEmptyFile(); | 
 |     MountDev(); | 
 |     MountProc(); | 
 |     MountAllMounts(); | 
 |     ChangeRoot(); | 
 |   } else { | 
 |     MountFilesystems(); | 
 |     MakeFilesystemMostlyReadOnly(); | 
 |     MountProc(); | 
 |   } | 
 |   SetupNetworking(); | 
 |   EnterWorkingDirectory(); | 
 |  | 
 |   // Ignore terminal signals; we hand off the terminal to the child in | 
 |   // SpawnChild below. | 
 |   IgnoreSignal(SIGTTIN); | 
 |   IgnoreSignal(SIGTTOU); | 
 |  | 
 |   // Fork the child process. | 
 |   SpawnChild(); | 
 |  | 
 |   // Forward requests to shut down gracefully to the child. | 
 |   InstallSignalHandler(SIGTERM, ForwardSignal); | 
 |  | 
 |   // Note that there's no need to kill any remaining descendant processes; they | 
 |   // are in our PID namespace and the kernel will send them SIGKILL | 
 |   // automatically once we exit. | 
 |   return WaitForChild(); | 
 | } |