src/main/tools/linux-sandbox-pid1.cc - bazel - Git at Google

 // Copyright 2016 The Bazel Authors. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 /**
  * This is PID 1 inside the sandbox environment and runs in a separate user,
  * mount, UTS, IPC and PID namespace.
  */

 #include "linux-sandbox-options.h"
 #include "linux-sandbox-utils.h"
 #include "linux-sandbox.h"

 // Note that we define DIE() here and not in a shared header, because we want to
 // use _exit() in the
 // pid1 child, but exit() in the parent.
 #define DIE(args...)                                     \
   {                                                      \
     fprintf(stderr, __FILE__ ":" S__LINE__ ": \"" args); \
     fprintf(stderr, "\": ");                             \
     perror(NULL);                                        \
     _exit(EXIT_FAILURE);                                 \
   }

 #include <errno.h>
 #include <fcntl.h>
 #include <libgen.h>
 #include <math.h>
 #include <mntent.h>
 #include <net/if.h>
 #include <pwd.h>
 #include <signal.h>
 #include <stdarg.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/ioctl.h>
 #include <sys/mount.h>
 #include <sys/prctl.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>

 static int global_child_pid;
 static char global_inaccessible_directory[] = "tmp/empty.XXXXXX";
 static char global_inaccessible_file[] = "tmp/empty.XXXXXX";

 static void SetupSelfDestruction(int *sync_pipe) {
   // We could also poll() on the pipe fd to find out when the parent goes away,
   // and rely on SIGCHLD interrupting that otherwise. That might require us to
   // install some trivial handler for SIGCHLD. Using O_ASYNC to turn the pipe
   // close into SIGIO may also work. Another option is signalfd, although that's
   // almost as obscure as this prctl.
   if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
     DIE("prctl");
   }

   // Verify that the parent still lives.
   char buf = 0;
   if (close(sync_pipe[0]) < 0) {
     DIE("close");
   }
   if (write(sync_pipe[1], &buf, 1) < 0) {
     DIE("write");
   }
   if (close(sync_pipe[1]) < 0) {
     DIE("close");
   }
 }

 static void SetupMountNamespace() {
   // Fully isolate our mount namespace private from outside events, so that
   // mounts in the outside environment do not affect our sandbox.
   if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
     DIE("mount");
   }
 }

 static void WriteFile(const char *filename, const char *fmt, ...) {
   FILE *stream = fopen(filename, "w");
   if (stream == NULL) {
     DIE("fopen(%s)", filename);
   }

   va_list ap;
   va_start(ap, fmt);
   int r = vfprintf(stream, fmt, ap);
   va_end(ap);

   if (r < 0) {
     DIE("vfprintf");
   }

   if (fclose(stream) != 0) {
     DIE("fclose(%s)", filename);
   }
 }

 static void SetupUserNamespace() {
   // Disable needs for CAP_SETGID.
   struct stat sb;
   if (stat("/proc/self/setgroups", &sb) == 0) {
     WriteFile("/proc/self/setgroups", "deny");
   } else {
     // Ignore ENOENT, because older Linux versions do not have this file (but
     // also do not require writing to it).
     if (errno != ENOENT) {
       DIE("stat(/proc/self/setgroups");
     }
   }

   int inner_uid = 0, inner_gid = 0;
   if (!opt.fake_root) {
     struct passwd *pwd = getpwnam("nobody");
     if (pwd == NULL) {
       DIE("unable to find passwd entry for user nobody")
     }

     inner_uid = pwd->pw_uid;
     inner_gid = pwd->pw_gid;
   }

   WriteFile("/proc/self/uid_map", "%d %d 1\n", inner_uid, global_outer_uid);
   WriteFile("/proc/self/gid_map", "%d %d 1\n", inner_gid, global_outer_gid);
 }

 static void SetupUtsNamespace() {
   if (sethostname("sandbox", 7) < 0) {
     DIE("sethostname");
   }

   if (setdomainname("sandbox", 7) < 0) {
     DIE("setdomainname");
   }
 }

 static void SetupHelperFiles() {
   if (mkdtemp(global_inaccessible_directory) == NULL) {
     DIE("mkdtemp(%s)", global_inaccessible_directory);
   }
   if (chmod(global_inaccessible_directory, 0) < 0) {
     DIE("chmod(%s, 0)", global_inaccessible_directory);
   }

   int handle = mkstemp(global_inaccessible_file);
   if (handle < 0) {
     DIE("mkstemp(%s)", global_inaccessible_file);
   }
   if (fchmod(handle, 0)) {
     DIE("fchmod(%s, 0)", global_inaccessible_file);
   }
   if (close(handle) < 0) {
     DIE("close(%s)", global_inaccessible_file);
   }
 }

 static bool IsDirectory(const char *path) {
   struct stat sb;
   if (stat(path, &sb) < 0) {
     DIE("stat(%s)", path);
   }
   return S_ISDIR(sb.st_mode);
 }

 // Recursively creates the file or directory specified in "path" and its parent
 // directories.
 static int CreateTarget(const char *path, bool is_directory) {
   PRINT_DEBUG("CreateTarget(%s, %s)", path, is_directory ? "true" : "false");
   if (path == NULL) {
     errno = EINVAL;
     return -1;
   }

   struct stat sb;
   // If the path already exists...
   if (stat(path, &sb) == 0) {
     if (is_directory && S_ISDIR(sb.st_mode)) {
       // and it's a directory and supposed to be a directory, we're done here.
       return 0;
     } else if (!is_directory && S_ISREG(sb.st_mode)) {
       // and it's a regular file and supposed to be one, we're done here.
       return 0;
     } else {
       // otherwise something is really wrong.
       errno = is_directory ? ENOTDIR : EEXIST;
       return -1;
     }
   } else {
     // If stat failed because of any error other than "the path does not exist",
     // this is an error.
     if (errno != ENOENT) {
       return -1;
     }
   }

   // Create the parent directory.
   if (CreateTarget(dirname(strdupa(path)), true) < 0) {
     DIE("CreateTarget(%s, true)", dirname(strdupa(path)));
   }

   if (is_directory) {
     if (mkdir(path, 0755) < 0) {
       DIE("mkdir(%s, 0755)", path);
     }
   } else {
     int handle;
     if ((handle = open(path, O_CREAT | O_WRONLY | O_EXCL, 0666)) < 0) {
       DIE("open(%s, O_CREAT | O_WRONLY | O_EXCL, 0666)", path);
     }
     if (close(handle) < 0) {
       DIE("close(%d)", handle);
     }
   }

   return 0;
 }

 static void MountFilesystems() {
   if (mount("/", opt.sandbox_root_dir, NULL, MS_BIND | MS_REC, NULL) < 0) {
     DIE("mount(/, %s, NULL, MS_BIND | MS_REC, NULL)", opt.sandbox_root_dir);
   }

   if (chdir(opt.sandbox_root_dir) < 0) {
     DIE("chdir(%s)", opt.sandbox_root_dir);
   }

   for (const char *tmpfs_dir : opt.tmpfs_dirs) {
     PRINT_DEBUG("tmpfs: %s", tmpfs_dir);
     if (mount("tmpfs", tmpfs_dir + 1, "tmpfs",
               MS_NOSUID | MS_NODEV | MS_NOATIME, NULL) < 0) {
       DIE("mount(tmpfs, %s, tmpfs, MS_NOSUID | MS_NODEV | MS_NOATIME, NULL)",
           tmpfs_dir + 1);
     }
   }

   // Make sure that our working directory is a mount point. The easiest way to
   // do this is by bind-mounting it upon itself.
   PRINT_DEBUG("working dir: %s", opt.working_dir);
   CreateTarget(opt.working_dir + 1, true);
   if (mount(opt.working_dir, opt.working_dir + 1, NULL, MS_BIND, NULL) < 0) {
     DIE("mount(%s, %s, NULL, MS_BIND, NULL)", opt.working_dir,
         opt.working_dir + 1);
   }

   for (const char *bind_mount : opt.bind_mounts) {
     PRINT_DEBUG("bind mount: %s", bind_mount);
     CreateTarget(bind_mount + 1, IsDirectory(bind_mount));
     if (mount(bind_mount, bind_mount + 1, NULL, MS_BIND, NULL) < 0) {
       DIE("mount(%s, %s, NULL, MS_BIND, NULL)", bind_mount, bind_mount + 1);
     }
   }

   for (const char *writable_file : opt.writable_files) {
     PRINT_DEBUG("writable: %s", writable_file);
     if (mount(writable_file, writable_file + 1, NULL, MS_BIND, NULL) < 0) {
       DIE("mount(%s, %s, NULL, MS_BIND, NULL)", writable_file,
           writable_file + 1);
     }
   }

   SetupHelperFiles();

   for (const char *inaccessible_file : opt.inaccessible_files) {
     struct stat sb;
     if (stat(inaccessible_file, &sb) < 0) {
       DIE("stat(%s)", inaccessible_file);
     }

     if (S_ISDIR(sb.st_mode)) {
       PRINT_DEBUG("inaccessible dir: %s", inaccessible_file);
       if (mount(global_inaccessible_directory, inaccessible_file + 1, NULL,
                 MS_BIND, NULL) < 0) {
         DIE("mount(%s, %s, NULL, MS_BIND, NULL)", global_inaccessible_directory,
             inaccessible_file + 1);
       }
     } else {
       PRINT_DEBUG("inaccessible file: %s", inaccessible_file);
       if (mount(global_inaccessible_file, inaccessible_file + 1, NULL, MS_BIND,
                 NULL) < 0) {
         DIE("mount(%s, %s, NULL, MS_BIND, NULL", global_inaccessible_file,
             inaccessible_file + 1);
       }
     }
   }
 }

 // We later remount everything read-only, except the paths for which this method
 // returns true.
 static bool ShouldBeWritable(char *mnt_dir) {
   mnt_dir += strlen(opt.sandbox_root_dir);

   if (strcmp(mnt_dir, opt.working_dir) == 0) {
     return true;
   }

   for (const char *writable_file : opt.writable_files) {
     if (strcmp(mnt_dir, writable_file) == 0) {
       return true;
     }
   }

   for (const char *tmpfs_dir : opt.tmpfs_dirs) {
     if (strcmp(mnt_dir, tmpfs_dir) == 0) {
       return true;
     }
   }

   return false;
 }

 static bool IsUnderTmpDir(const char *mnt_dir) {
   for (const char *tmpfs_dir : opt.tmpfs_dirs) {
     if (strstr(mnt_dir, tmpfs_dir) == mnt_dir) {
       return true;
     }
   }
   return false;
 }

 // Makes the whole filesystem read-only, except for the paths for which
 // ShouldBeWritable returns true.
 static void MakeFilesystemMostlyReadOnly() {
   FILE *mounts = setmntent("/proc/self/mounts", "r");
   if (mounts == NULL) {
     DIE("setmntent");
   }

   struct mntent *ent;
   while ((ent = getmntent(mounts)) != NULL) {
     // Skip mounts that do not belong to our sandbox.
     if (strstr(ent->mnt_dir, opt.sandbox_root_dir) != ent->mnt_dir) {
       continue;
     }
     // Skip mounts that are under tmpfs directories because we've already
     // replaced such directories with new tmpfs instances.
     // mount() would fail with ENOENT if we tried to remount such mount points.
     if (IsUnderTmpDir(ent->mnt_dir + strlen(opt.sandbox_root_dir))) {
       continue;
     }

     int mountFlags = MS_BIND | MS_REMOUNT;

     // MS_REMOUNT does not allow us to change certain flags. This means, we have
     // to first read them out and then pass them in back again. There seems to
     // be no better way than this (an API for just getting the mount flags of a
     // mount entry as a bitmask would be great).
     if (hasmntopt(ent, "nodev") != NULL) {
       mountFlags |= MS_NODEV;
     }
     if (hasmntopt(ent, "noexec") != NULL) {
       mountFlags |= MS_NOEXEC;
     }
     if (hasmntopt(ent, "nosuid") != NULL) {
       mountFlags |= MS_NOSUID;
     }
     if (hasmntopt(ent, "noatime") != NULL) {
       mountFlags |= MS_NOATIME;
     }
     if (hasmntopt(ent, "nodiratime") != NULL) {
       mountFlags |= MS_NODIRATIME;
     }
     if (hasmntopt(ent, "relatime") != NULL) {
       mountFlags |= MS_RELATIME;
     }

     if (!ShouldBeWritable(ent->mnt_dir)) {
       mountFlags |= MS_RDONLY;
     }

     PRINT_DEBUG("remount %s: %s", (mountFlags & MS_RDONLY) ? "ro" : "rw",
                 ent->mnt_dir);
     if (mount(NULL, ent->mnt_dir, NULL, mountFlags, NULL) < 0) {
       // If we get EACCES, this might be a mount-point for which we don't have
       // read access. Not much we can do about this, but it also won't do any
       // harm, so let's go on. The same goes for EINVAL, which is fired in case
       // a later mount overlaps an earlier mount, e.g. consider the case of
       // /proc, /proc/sys/fs/binfmt_misc and /proc, with the latter /proc being
       // the one that an outer sandbox has mounted on top of its parent /proc.
       // In that case, we're not allowed to remount /proc/sys/fs/binfmt_misc,
       // because it is hidden. If we get ESTALE, the mount is a broken NFS
       // mount. In the ideal case, the user would either fix or remove that
       // mount, but in cases where that's not possible, we should just ignore
       // it.
       if (errno != EACCES && errno != EINVAL && errno != ESTALE) {
         DIE("remount(NULL, %s, NULL, %d, NULL)", ent->mnt_dir, mountFlags);
       }
     }
   }

   endmntent(mounts);
 }

 static void MountProc() {
   // Mount a new proc on top of the old one, because the old one still refers to
   // our parent PID namespace.
   if (mount("proc", "proc", "proc", MS_NODEV | MS_NOEXEC | MS_NOSUID, NULL) <
       0) {
     DIE("mount");
   }
 }

 static void SetupNetworking() {
   // When running in a separate network namespace, enable the loopback interface
   // because some application may want to use it.
   if (opt.create_netns) {
     int fd;
     fd = socket(AF_INET, SOCK_DGRAM, 0);
     if (fd < 0) {
       DIE("socket");
     }

     struct ifreq ifr;
     memset(&ifr, 0, sizeof(ifr));
     strncpy(ifr.ifr_name, "lo", IF_NAMESIZE);

     // Verify that name is valid.
     if (if_nametoindex(ifr.ifr_name) == 0) {
       DIE("if_nametoindex");
     }

     // Enable the interface.
     ifr.ifr_flags |= IFF_UP;
     if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0) {
       DIE("ioctl");
     }

     if (close(fd) < 0) {
       DIE("close");
     }
   }
 }

 static void EnterSandbox() {
   // Move the real root to old_root, then detach it.
   char old_root[] = "tmp/old-root-XXXXXX";
   if (mkdtemp(old_root) == NULL) {
     DIE("mkdtemp(%s)", old_root);
   }

   // pivot_root has no wrapper in libc, so we need syscall()
   if (syscall(SYS_pivot_root, ".", old_root) < 0) {
     DIE("pivot_root(., %s)", old_root);
   }

   if (chroot(".") < 0) {
     DIE("chroot(.)");
   }

   if (umount2(old_root, MNT_DETACH) < 0) {
     DIE("umount2(%s, MNT_DETACH)", old_root);
   }

   if (rmdir(old_root) < 0) {
     DIE("rmdir(%s)", old_root);
   }

   if (chdir(opt.working_dir) < 0) {
     DIE("chdir(%s)", opt.working_dir);
   }
 }

 static void InstallSignalHandler(int signum, void (*handler)(int)) {
   struct sigaction sa;
   memset(&sa, 0, sizeof(sa));
   sa.sa_handler = handler;
   if (handler == SIG_IGN || handler == SIG_DFL) {
     // No point in blocking signals when using the default handler or ignoring
     // the signal.
     if (sigemptyset(&sa.sa_mask) < 0) {
       DIE("sigemptyset");
     }
   } else {
     // When using a custom handler, block all signals from firing while the
     // handler is running.
     if (sigfillset(&sa.sa_mask) < 0) {
       DIE("sigfillset");
     }
   }
   // sigaction may fail for certain reserved signals. Ignore failure in this
   // case, but report it in debug mode, just in case.
   if (sigaction(signum, &sa, NULL) < 0) {
     PRINT_DEBUG("sigaction(%d, &sa, NULL) failed", signum);
   }
 }

 static void IgnoreSignal(int signum) { InstallSignalHandler(signum, SIG_IGN); }

 // Reset the signal mask and restore the default handler for all signals.
 static void RestoreSignalHandlersAndMask() {
   // Use an empty signal mask for the process (= unblock all signals).
   sigset_t empty_set;
   if (sigemptyset(&empty_set) < 0) {
     DIE("sigemptyset");
   }
   if (sigprocmask(SIG_SETMASK, &empty_set, nullptr) < 0) {
     DIE("sigprocmask(SIG_SETMASK, <empty set>, nullptr)");
   }

   // Set the default signal handler for all signals.
   struct sigaction sa;
   memset(&sa, 0, sizeof(sa));
   if (sigemptyset(&sa.sa_mask) < 0) {
     DIE("sigemptyset");
   }
   sa.sa_handler = SIG_DFL;
   for (int i = 1; i < NSIG; ++i) {
     // Ignore possible errors, because we might not be allowed to set the
     // handler for certain signals, but we still want to try.
     sigaction(i, &sa, nullptr);
   }
 }

 static void ForwardSignal(int signum) {
   PRINT_DEBUG("ForwardSignal(%d)", signum);
   kill(-global_child_pid, signum);
 }

 static void SetupSignalHandlers() {
   RestoreSignalHandlersAndMask();

   for (int signum = 1; signum < NSIG; signum++) {
     switch (signum) {
       // Some signals should indeed kill us and not be forwarded to the child,
       // thus we can use the default handler.
       case SIGABRT:
       case SIGBUS:
       case SIGFPE:
       case SIGILL:
       case SIGSEGV:
       case SIGSYS:
       case SIGTRAP:
         break;
       // It's fine to use the default handler for SIGCHLD, because we use
       // waitpid() in the main loop to wait for children to die anyway.
       case SIGCHLD:
         break;
       // One does not simply install a signal handler for these two signals
       case SIGKILL:
       case SIGSTOP:
         break;
       // Ignore SIGTTIN and SIGTTOU, as we hand off the terminal to the child in
       // SpawnChild().
       case SIGTTIN:
       case SIGTTOU:
         IgnoreSignal(signum);
         break;
       // All other signals should be forwarded to the child.
       default:
         InstallSignalHandler(signum, ForwardSignal);
         break;
     }
   }
 }

 static void SpawnChild() {
   global_child_pid = fork();

   if (global_child_pid < 0) {
     DIE("fork()");
   } else if (global_child_pid == 0) {
     // Put the child into its own process group.
     if (setpgid(0, 0) < 0) {
       DIE("setpgid");
     }

     // Try to assign our terminal to the child process.
     if (tcsetpgrp(STDIN_FILENO, getpgrp()) < 0 && errno != ENOTTY) {
       DIE("tcsetpgrp")
     }

     // Unblock all signals, restore default handlers.
     RestoreSignalHandlersAndMask();

     // Force umask to include read and execute for everyone, to make output
     // permissions predictable.
     umask(022);

     // argv[] passed to execve() must be a null-terminated array.
     opt.args.push_back(nullptr);

     if (execvp(opt.args[0], opt.args.data()) < 0) {
       DIE("execvp(%s, %p)", opt.args[0], opt.args.data());
     }
   }
 }

 static void WaitForChild() {
   while (1) {
     // Check for zombies to be reaped and exit, if our own child exited.
     int status;
     pid_t killed_pid = waitpid(-1, &status, 0);
     PRINT_DEBUG("waitpid returned %d", killed_pid);

     if (killed_pid < 0) {
       // Our PID1 process got a signal that interrupted the waitpid() call and
       // that was either ignored or forwared to the child. This is expected &
       // fine, just continue waiting.
       if (errno == EINTR) {
         continue;
       }
       DIE("waitpid")
     } else {
       if (killed_pid == global_child_pid) {
         // If the child process we spawned earlier terminated, we'll also
         // terminate. We can simply _exit() here, because the Linux kernel will
         // kindly SIGKILL all remaining processes in our PID namespace once we
         // exit.
         if (WIFSIGNALED(status)) {
           PRINT_DEBUG("child died due to signal %d", WTERMSIG(status));
           _exit(128 + WTERMSIG(status));
         } else {
           PRINT_DEBUG("child exited with code %d", WEXITSTATUS(status));
           _exit(WEXITSTATUS(status));
         }
       }
     }
   }
 }

 int Pid1Main(void *sync_pipe_param) {
   if (getpid() != 1) {
     DIE("Using PID namespaces, but we are not PID 1");
   }

   SetupSelfDestruction(reinterpret_cast<int *>(sync_pipe_param));
   SetupMountNamespace();
   SetupUserNamespace();
   SetupUtsNamespace();
   MountFilesystems();
   MakeFilesystemMostlyReadOnly();
   MountProc();
   SetupNetworking();
   EnterSandbox();
   SetupSignalHandlers();
   SpawnChild();
   WaitForChild();
   _exit(EXIT_FAILURE);
 }
	// Copyright 2016 The Bazel Authors. All rights reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	/**
	* This is PID 1 inside the sandbox environment and runs in a separate user,
	* mount, UTS, IPC and PID namespace.
	*/

	#include "linux-sandbox-options.h"
	#include "linux-sandbox-utils.h"
	#include "linux-sandbox.h"

	// Note that we define DIE() here and not in a shared header, because we want to
	// use _exit() in the
	// pid1 child, but exit() in the parent.
	#define DIE(args...) \
	{ \
	fprintf(stderr, __FILE__ ":" S__LINE__ ": \"" args); \
	fprintf(stderr, "\": "); \
	perror(NULL); \
	_exit(EXIT_FAILURE); \
	}

	#include <errno.h>
	#include <fcntl.h>
	#include <libgen.h>
	#include <math.h>
	#include <mntent.h>
	#include <net/if.h>
	#include <pwd.h>
	#include <signal.h>
	#include <stdarg.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/mount.h>
	#include <sys/prctl.h>
	#include <sys/stat.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	static int global_child_pid;
	static char global_inaccessible_directory[] = "tmp/empty.XXXXXX";
	static char global_inaccessible_file[] = "tmp/empty.XXXXXX";

	static void SetupSelfDestruction(int *sync_pipe) {
	// We could also poll() on the pipe fd to find out when the parent goes away,
	// and rely on SIGCHLD interrupting that otherwise. That might require us to
	// install some trivial handler for SIGCHLD. Using O_ASYNC to turn the pipe
	// close into SIGIO may also work. Another option is signalfd, although that's
	// almost as obscure as this prctl.
	if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
	DIE("prctl");
	}

	// Verify that the parent still lives.
	char buf = 0;
	if (close(sync_pipe[0]) < 0) {
	DIE("close");
	}
	if (write(sync_pipe[1], &buf, 1) < 0) {
	DIE("write");
	}
	if (close(sync_pipe[1]) < 0) {
	DIE("close");
	}
	}

	static void SetupMountNamespace() {
	// Fully isolate our mount namespace private from outside events, so that
	// mounts in the outside environment do not affect our sandbox.
	if (mount(NULL, "/", NULL, MS_REC \| MS_PRIVATE, NULL) < 0) {
	DIE("mount");
	}
	}

	static void WriteFile(const char filename, const char fmt, ...) {
	FILE *stream = fopen(filename, "w");
	if (stream == NULL) {
	DIE("fopen(%s)", filename);
	}

	va_list ap;
	va_start(ap, fmt);
	int r = vfprintf(stream, fmt, ap);
	va_end(ap);

	if (r < 0) {
	DIE("vfprintf");
	}

	if (fclose(stream) != 0) {
	DIE("fclose(%s)", filename);
	}
	}

	static void SetupUserNamespace() {
	// Disable needs for CAP_SETGID.
	struct stat sb;
	if (stat("/proc/self/setgroups", &sb) == 0) {
	WriteFile("/proc/self/setgroups", "deny");
	} else {
	// Ignore ENOENT, because older Linux versions do not have this file (but
	// also do not require writing to it).
	if (errno != ENOENT) {
	DIE("stat(/proc/self/setgroups");
	}
	}

	int inner_uid = 0, inner_gid = 0;
	if (!opt.fake_root) {
	struct passwd *pwd = getpwnam("nobody");
	if (pwd == NULL) {
	DIE("unable to find passwd entry for user nobody")
	}

	inner_uid = pwd->pw_uid;
	inner_gid = pwd->pw_gid;
	}

	WriteFile("/proc/self/uid_map", "%d %d 1\n", inner_uid, global_outer_uid);
	WriteFile("/proc/self/gid_map", "%d %d 1\n", inner_gid, global_outer_gid);
	}

	static void SetupUtsNamespace() {
	if (sethostname("sandbox", 7) < 0) {
	DIE("sethostname");
	}

	if (setdomainname("sandbox", 7) < 0) {
	DIE("setdomainname");
	}
	}

	static void SetupHelperFiles() {
	if (mkdtemp(global_inaccessible_directory) == NULL) {
	DIE("mkdtemp(%s)", global_inaccessible_directory);
	}
	if (chmod(global_inaccessible_directory, 0) < 0) {
	DIE("chmod(%s, 0)", global_inaccessible_directory);
	}

	int handle = mkstemp(global_inaccessible_file);
	if (handle < 0) {
	DIE("mkstemp(%s)", global_inaccessible_file);
	}
	if (fchmod(handle, 0)) {
	DIE("fchmod(%s, 0)", global_inaccessible_file);
	}
	if (close(handle) < 0) {
	DIE("close(%s)", global_inaccessible_file);
	}
	}

	static bool IsDirectory(const char *path) {
	struct stat sb;
	if (stat(path, &sb) < 0) {
	DIE("stat(%s)", path);
	}
	return S_ISDIR(sb.st_mode);
	}

	// Recursively creates the file or directory specified in "path" and its parent
	// directories.
	static int CreateTarget(const char *path, bool is_directory) {
	PRINT_DEBUG("CreateTarget(%s, %s)", path, is_directory ? "true" : "false");
	if (path == NULL) {
	errno = EINVAL;
	return -1;
	}

	struct stat sb;
	// If the path already exists...
	if (stat(path, &sb) == 0) {
	if (is_directory && S_ISDIR(sb.st_mode)) {
	// and it's a directory and supposed to be a directory, we're done here.
	return 0;
	} else if (!is_directory && S_ISREG(sb.st_mode)) {
	// and it's a regular file and supposed to be one, we're done here.
	return 0;
	} else {
	// otherwise something is really wrong.
	errno = is_directory ? ENOTDIR : EEXIST;
	return -1;
	}
	} else {
	// If stat failed because of any error other than "the path does not exist",
	// this is an error.
	if (errno != ENOENT) {
	return -1;
	}
	}

	// Create the parent directory.
	if (CreateTarget(dirname(strdupa(path)), true) < 0) {
	DIE("CreateTarget(%s, true)", dirname(strdupa(path)));
	}

	if (is_directory) {
	if (mkdir(path, 0755) < 0) {
	DIE("mkdir(%s, 0755)", path);
	}
	} else {
	int handle;
	if ((handle = open(path, O_CREAT \| O_WRONLY \| O_EXCL, 0666)) < 0) {
	DIE("open(%s, O_CREAT \| O_WRONLY \| O_EXCL, 0666)", path);
	}
	if (close(handle) < 0) {
	DIE("close(%d)", handle);
	}
	}

	return 0;
	}

	static void MountFilesystems() {
	if (mount("/", opt.sandbox_root_dir, NULL, MS_BIND \| MS_REC, NULL) < 0) {
	DIE("mount(/, %s, NULL, MS_BIND \| MS_REC, NULL)", opt.sandbox_root_dir);
	}

	if (chdir(opt.sandbox_root_dir) < 0) {
	DIE("chdir(%s)", opt.sandbox_root_dir);
	}

	for (const char *tmpfs_dir : opt.tmpfs_dirs) {
	PRINT_DEBUG("tmpfs: %s", tmpfs_dir);
	if (mount("tmpfs", tmpfs_dir + 1, "tmpfs",
	MS_NOSUID \| MS_NODEV \| MS_NOATIME, NULL) < 0) {
	DIE("mount(tmpfs, %s, tmpfs, MS_NOSUID \| MS_NODEV \| MS_NOATIME, NULL)",
	tmpfs_dir + 1);
	}
	}

	// Make sure that our working directory is a mount point. The easiest way to
	// do this is by bind-mounting it upon itself.
	PRINT_DEBUG("working dir: %s", opt.working_dir);
	CreateTarget(opt.working_dir + 1, true);
	if (mount(opt.working_dir, opt.working_dir + 1, NULL, MS_BIND, NULL) < 0) {
	DIE("mount(%s, %s, NULL, MS_BIND, NULL)", opt.working_dir,
	opt.working_dir + 1);
	}

	for (const char *bind_mount : opt.bind_mounts) {
	PRINT_DEBUG("bind mount: %s", bind_mount);
	CreateTarget(bind_mount + 1, IsDirectory(bind_mount));
	if (mount(bind_mount, bind_mount + 1, NULL, MS_BIND, NULL) < 0) {
	DIE("mount(%s, %s, NULL, MS_BIND, NULL)", bind_mount, bind_mount + 1);
	}
	}

	for (const char *writable_file : opt.writable_files) {
	PRINT_DEBUG("writable: %s", writable_file);
	if (mount(writable_file, writable_file + 1, NULL, MS_BIND, NULL) < 0) {
	DIE("mount(%s, %s, NULL, MS_BIND, NULL)", writable_file,
	writable_file + 1);
	}
	}

	SetupHelperFiles();

	for (const char *inaccessible_file : opt.inaccessible_files) {
	struct stat sb;
	if (stat(inaccessible_file, &sb) < 0) {
	DIE("stat(%s)", inaccessible_file);
	}

	if (S_ISDIR(sb.st_mode)) {
	PRINT_DEBUG("inaccessible dir: %s", inaccessible_file);
	if (mount(global_inaccessible_directory, inaccessible_file + 1, NULL,
	MS_BIND, NULL) < 0) {
	DIE("mount(%s, %s, NULL, MS_BIND, NULL)", global_inaccessible_directory,
	inaccessible_file + 1);
	}
	} else {
	PRINT_DEBUG("inaccessible file: %s", inaccessible_file);
	if (mount(global_inaccessible_file, inaccessible_file + 1, NULL, MS_BIND,
	NULL) < 0) {
	DIE("mount(%s, %s, NULL, MS_BIND, NULL", global_inaccessible_file,
	inaccessible_file + 1);
	}
	}
	}
	}

	// We later remount everything read-only, except the paths for which this method
	// returns true.
	static bool ShouldBeWritable(char *mnt_dir) {
	mnt_dir += strlen(opt.sandbox_root_dir);

	if (strcmp(mnt_dir, opt.working_dir) == 0) {
	return true;
	}

	for (const char *writable_file : opt.writable_files) {
	if (strcmp(mnt_dir, writable_file) == 0) {
	return true;
	}
	}

	for (const char *tmpfs_dir : opt.tmpfs_dirs) {
	if (strcmp(mnt_dir, tmpfs_dir) == 0) {
	return true;
	}
	}

	return false;
	}

	static bool IsUnderTmpDir(const char *mnt_dir) {
	for (const char *tmpfs_dir : opt.tmpfs_dirs) {
	if (strstr(mnt_dir, tmpfs_dir) == mnt_dir) {
	return true;
	}
	}
	return false;
	}

	// Makes the whole filesystem read-only, except for the paths for which
	// ShouldBeWritable returns true.
	static void MakeFilesystemMostlyReadOnly() {
	FILE *mounts = setmntent("/proc/self/mounts", "r");
	if (mounts == NULL) {
	DIE("setmntent");
	}

	struct mntent *ent;
	while ((ent = getmntent(mounts)) != NULL) {
	// Skip mounts that do not belong to our sandbox.
	if (strstr(ent->mnt_dir, opt.sandbox_root_dir) != ent->mnt_dir) {
	continue;
	}
	// Skip mounts that are under tmpfs directories because we've already
	// replaced such directories with new tmpfs instances.
	// mount() would fail with ENOENT if we tried to remount such mount points.
	if (IsUnderTmpDir(ent->mnt_dir + strlen(opt.sandbox_root_dir))) {
	continue;
	}

	int mountFlags = MS_BIND \| MS_REMOUNT;

	// MS_REMOUNT does not allow us to change certain flags. This means, we have
	// to first read them out and then pass them in back again. There seems to
	// be no better way than this (an API for just getting the mount flags of a
	// mount entry as a bitmask would be great).
	if (hasmntopt(ent, "nodev") != NULL) {
	mountFlags \|= MS_NODEV;
	}
	if (hasmntopt(ent, "noexec") != NULL) {
	mountFlags \|= MS_NOEXEC;
	}
	if (hasmntopt(ent, "nosuid") != NULL) {
	mountFlags \|= MS_NOSUID;
	}
	if (hasmntopt(ent, "noatime") != NULL) {
	mountFlags \|= MS_NOATIME;
	}
	if (hasmntopt(ent, "nodiratime") != NULL) {
	mountFlags \|= MS_NODIRATIME;
	}
	if (hasmntopt(ent, "relatime") != NULL) {
	mountFlags \|= MS_RELATIME;
	}

	if (!ShouldBeWritable(ent->mnt_dir)) {
	mountFlags \|= MS_RDONLY;
	}

	PRINT_DEBUG("remount %s: %s", (mountFlags & MS_RDONLY) ? "ro" : "rw",
	ent->mnt_dir);
	if (mount(NULL, ent->mnt_dir, NULL, mountFlags, NULL) < 0) {
	// If we get EACCES, this might be a mount-point for which we don't have
	// read access. Not much we can do about this, but it also won't do any
	// harm, so let's go on. The same goes for EINVAL, which is fired in case
	// a later mount overlaps an earlier mount, e.g. consider the case of
	// /proc, /proc/sys/fs/binfmt_misc and /proc, with the latter /proc being
	// the one that an outer sandbox has mounted on top of its parent /proc.
	// In that case, we're not allowed to remount /proc/sys/fs/binfmt_misc,
	// because it is hidden. If we get ESTALE, the mount is a broken NFS
	// mount. In the ideal case, the user would either fix or remove that
	// mount, but in cases where that's not possible, we should just ignore
	// it.
	if (errno != EACCES && errno != EINVAL && errno != ESTALE) {
	DIE("remount(NULL, %s, NULL, %d, NULL)", ent->mnt_dir, mountFlags);
	}
	}
	}

	endmntent(mounts);
	}

	static void MountProc() {
	// Mount a new proc on top of the old one, because the old one still refers to
	// our parent PID namespace.
	if (mount("proc", "proc", "proc", MS_NODEV \| MS_NOEXEC \| MS_NOSUID, NULL) <
	0) {
	DIE("mount");
	}
	}

	static void SetupNetworking() {
	// When running in a separate network namespace, enable the loopback interface
	// because some application may want to use it.
	if (opt.create_netns) {
	int fd;
	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
	DIE("socket");
	}

	struct ifreq ifr;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "lo", IF_NAMESIZE);

	// Verify that name is valid.
	if (if_nametoindex(ifr.ifr_name) == 0) {
	DIE("if_nametoindex");
	}

	// Enable the interface.
	ifr.ifr_flags \|= IFF_UP;
	if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0) {
	DIE("ioctl");
	}

	if (close(fd) < 0) {
	DIE("close");
	}
	}
	}

	static void EnterSandbox() {
	// Move the real root to old_root, then detach it.
	char old_root[] = "tmp/old-root-XXXXXX";
	if (mkdtemp(old_root) == NULL) {
	DIE("mkdtemp(%s)", old_root);
	}

	// pivot_root has no wrapper in libc, so we need syscall()
	if (syscall(SYS_pivot_root, ".", old_root) < 0) {
	DIE("pivot_root(., %s)", old_root);
	}

	if (chroot(".") < 0) {
	DIE("chroot(.)");
	}

	if (umount2(old_root, MNT_DETACH) < 0) {
	DIE("umount2(%s, MNT_DETACH)", old_root);
	}

	if (rmdir(old_root) < 0) {
	DIE("rmdir(%s)", old_root);
	}

	if (chdir(opt.working_dir) < 0) {
	DIE("chdir(%s)", opt.working_dir);
	}
	}

	static void InstallSignalHandler(int signum, void (*handler)(int)) {
	struct sigaction sa;
	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = handler;
	if (handler == SIG_IGN \|\| handler == SIG_DFL) {
	// No point in blocking signals when using the default handler or ignoring
	// the signal.
	if (sigemptyset(&sa.sa_mask) < 0) {
	DIE("sigemptyset");
	}
	} else {
	// When using a custom handler, block all signals from firing while the
	// handler is running.
	if (sigfillset(&sa.sa_mask) < 0) {
	DIE("sigfillset");
	}
	}
	// sigaction may fail for certain reserved signals. Ignore failure in this
	// case, but report it in debug mode, just in case.
	if (sigaction(signum, &sa, NULL) < 0) {
	PRINT_DEBUG("sigaction(%d, &sa, NULL) failed", signum);
	}
	}

	static void IgnoreSignal(int signum) { InstallSignalHandler(signum, SIG_IGN); }

	// Reset the signal mask and restore the default handler for all signals.
	static void RestoreSignalHandlersAndMask() {
	// Use an empty signal mask for the process (= unblock all signals).
	sigset_t empty_set;
	if (sigemptyset(&empty_set) < 0) {
	DIE("sigemptyset");
	}
	if (sigprocmask(SIG_SETMASK, &empty_set, nullptr) < 0) {
	DIE("sigprocmask(SIG_SETMASK, <empty set>, nullptr)");
	}

	// Set the default signal handler for all signals.
	struct sigaction sa;
	memset(&sa, 0, sizeof(sa));
	if (sigemptyset(&sa.sa_mask) < 0) {
	DIE("sigemptyset");
	}
	sa.sa_handler = SIG_DFL;
	for (int i = 1; i < NSIG; ++i) {
	// Ignore possible errors, because we might not be allowed to set the
	// handler for certain signals, but we still want to try.
	sigaction(i, &sa, nullptr);
	}
	}

	static void ForwardSignal(int signum) {
	PRINT_DEBUG("ForwardSignal(%d)", signum);
	kill(-global_child_pid, signum);
	}

	static void SetupSignalHandlers() {
	RestoreSignalHandlersAndMask();

	for (int signum = 1; signum < NSIG; signum++) {
	switch (signum) {
	// Some signals should indeed kill us and not be forwarded to the child,
	// thus we can use the default handler.
	case SIGABRT:
	case SIGBUS:
	case SIGFPE:
	case SIGILL:
	case SIGSEGV:
	case SIGSYS:
	case SIGTRAP:
	break;
	// It's fine to use the default handler for SIGCHLD, because we use
	// waitpid() in the main loop to wait for children to die anyway.
	case SIGCHLD:
	break;
	// One does not simply install a signal handler for these two signals
	case SIGKILL:
	case SIGSTOP:
	break;
	// Ignore SIGTTIN and SIGTTOU, as we hand off the terminal to the child in
	// SpawnChild().
	case SIGTTIN:
	case SIGTTOU:
	IgnoreSignal(signum);
	break;
	// All other signals should be forwarded to the child.
	default:
	InstallSignalHandler(signum, ForwardSignal);
	break;
	}
	}
	}

	static void SpawnChild() {
	global_child_pid = fork();

	if (global_child_pid < 0) {
	DIE("fork()");
	} else if (global_child_pid == 0) {
	// Put the child into its own process group.
	if (setpgid(0, 0) < 0) {
	DIE("setpgid");
	}

	// Try to assign our terminal to the child process.
	if (tcsetpgrp(STDIN_FILENO, getpgrp()) < 0 && errno != ENOTTY) {
	DIE("tcsetpgrp")
	}

	// Unblock all signals, restore default handlers.
	RestoreSignalHandlersAndMask();

	// Force umask to include read and execute for everyone, to make output
	// permissions predictable.
	umask(022);

	// argv[] passed to execve() must be a null-terminated array.
	opt.args.push_back(nullptr);

	if (execvp(opt.args[0], opt.args.data()) < 0) {
	DIE("execvp(%s, %p)", opt.args[0], opt.args.data());
	}
	}
	}

	static void WaitForChild() {
	while (1) {
	// Check for zombies to be reaped and exit, if our own child exited.
	int status;
	pid_t killed_pid = waitpid(-1, &status, 0);
	PRINT_DEBUG("waitpid returned %d", killed_pid);

	if (killed_pid < 0) {
	// Our PID1 process got a signal that interrupted the waitpid() call and
	// that was either ignored or forwared to the child. This is expected &
	// fine, just continue waiting.
	if (errno == EINTR) {
	continue;
	}
	DIE("waitpid")
	} else {
	if (killed_pid == global_child_pid) {
	// If the child process we spawned earlier terminated, we'll also
	// terminate. We can simply _exit() here, because the Linux kernel will
	// kindly SIGKILL all remaining processes in our PID namespace once we
	// exit.
	if (WIFSIGNALED(status)) {
	PRINT_DEBUG("child died due to signal %d", WTERMSIG(status));
	_exit(128 + WTERMSIG(status));
	} else {
	PRINT_DEBUG("child exited with code %d", WEXITSTATUS(status));
	_exit(WEXITSTATUS(status));
	}
	}
	}
	}
	}

	int Pid1Main(void *sync_pipe_param) {
	if (getpid() != 1) {
	DIE("Using PID namespaces, but we are not PID 1");
	}

	SetupSelfDestruction(reinterpret_cast<int *>(sync_pipe_param));
	SetupMountNamespace();
	SetupUserNamespace();
	SetupUtsNamespace();
	MountFilesystems();
	MakeFilesystemMostlyReadOnly();
	MountProc();
	SetupNetworking();
	EnterSandbox();
	SetupSignalHandlers();
	SpawnChild();
	WaitForChild();
	_exit(EXIT_FAILURE);
	}