// src/main/tools/linux-sandbox.cc - bazel - Git at Google

 // Copyright 2016 The Bazel Authors. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 /**
  * linux-sandbox runs commands in a restricted environment where they are
  * subject to a few rules:
  *
  *  - The entire filesystem is made read-only.
  *  - The working directory (-W) will be made read-write, though.
  *  - Individual files or directories can be made writable (but not deletable)
  *    (-w).
  *  - If the process takes longer than the timeout (-T), it will be killed with
  *    SIGTERM. If it does not exit within the grace period (-t), it and all of
  *    its children will be killed with SIGKILL.
  *  - tmpfs can be mounted on top of existing directories (-e).
  *  - If option -R is passed, the process will run as user 'root'.
  *  - If option -U is passed, the process will run as user 'nobody'.
  *  - Otherwise, the process runs using the current uid / gid.
  *  - If linux-sandbox itself gets killed, the process and all of its children
  *    will be killed.
  *  - If linux-sandbox's parent dies, it will kill itself, the process and all
  *    the children.
  *  - Network access is allowed, but can be disabled via -N.
  *  - The hostname and domainname will be set to "sandbox".
  *  - The process runs in its own PID namespace, so other processes on the
  *    system are invisible.
  */

 #include "src/main/tools/linux-sandbox.h"

 #include <ctype.h>
 #include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <math.h>
 #include <sched.h>
 #include <signal.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/prctl.h>
 #include <sys/resource.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>

 #include <atomic>
 #include <string>
 #include <vector>

 #include "src/main/tools/linux-sandbox-options.h"
 #include "src/main/tools/linux-sandbox-pid1.h"
 #include "src/main/tools/logging.h"
 #include "src/main/tools/process-tools.h"

 // The uid and gid returned by getuid()/getgid(); main() records them here
 // before it spawns the child.
 uid_t global_outer_uid;
 gid_t global_outer_gid;

 // The PID of our child process, for use in signal handlers.
 static std::atomic<pid_t> global_child_pid{0};
 // Our parent's pid at the outset, to check if the original parent has exited.
 pid_t initial_ppid;

 // Must we politely ask the child to exit before we send it a SIGKILL (once we
 // want it to exit)? Holds only zero or one.
 static std::atomic<int> global_need_polite_sigterm{false};

 // Both atomics are accessed from OnTimeoutOrTerm, which is installed as a
 // signal handler. When compiling as C++17 or later, check at compile time
 // that they are always lock-free.
 #if __cplusplus >= 201703L
 static_assert(global_child_pid.is_always_lock_free);
 static_assert(global_need_polite_sigterm.is_always_lock_free);
 #endif

 // Closes every file descriptor this process has open other than stdin, stdout,
 // stderr and the debug output, so that nothing accidentally left open by our
 // parent is inherited by the child. The open descriptors are discovered by
 // listing /proc/self/fd.
 static void CloseFds() {
   DIR *const fd_dir = opendir("/proc/self/fd");
   if (fd_dir == nullptr) {
     DIE("opendir");
   }

   for (;;) {
     // readdir returns nullptr both at the end of the directory and on error;
     // errno is cleared first so the two cases can be told apart.
     errno = 0;
     const struct dirent *const entry = readdir(fd_dir);
     if (entry == nullptr) {
       if (errno != 0) {
         DIE("readdir");
       }
       break;
     }

     // Only names that start with a digit are treated as descriptor numbers.
     if (!isdigit(entry->d_name[0])) {
       continue;
     }

     errno = 0;
     const int fd = strtol(entry->d_name, nullptr, 10);

     // Skip unparseable entries.
     if (errno != 0) {
       continue;
     }
     // Keep stdin, stdout and stderr.
     if (fd <= STDERR_FILENO) {
       continue;
     }
     // Keep the debug output, if one is open.
     if (global_debug != nullptr && fd == fileno(global_debug)) {
       continue;
     }
     // Do not accidentally close our own directory handle.
     if (fd == dirfd(fd_dir)) {
       continue;
     }

     if (close(fd) < 0) {
       DIE("close");
     }
   }

   if (closedir(fd_dir) < 0) {
     DIE("closedir");
   }
 }

 // Writes `pid` to the cgroup.procs file inside each directory listed in
 // opt.cgroups_dirs. Does nothing when that list is empty.
 static void MaybeAddChildProcessToCgroup(const pid_t pid) {
   for (const std::string &dir : opt.cgroups_dirs) {
     PRINT_DEBUG("Adding process %d to cgroups dir %s", pid, dir.c_str());
     const std::string procs_file = dir + "/cgroup.procs";
     WriteFile(procs_file, "%d", pid);
   }
 }

 // Signal handler that main installs for SIGALRM, SIGTERM and (optionally)
 // SIGINT. The signal number is ignored.
 //
 // If global_need_polite_sigterm is still set, the child is sent SIGTERM, the
 // flag is cleared, and alarm(2) is armed so that we run again after
 // opt.kill_delay_secs. On every other invocation the child is sent SIGKILL.
 static void OnTimeoutOrTerm(int) {
   // kill(2) may overwrite errno. This handler can run at any point in the
   // main flow, including between wait4(2) failing and the errno check that
   // follows it, so preserve the interrupted code's errno.
   const int saved_errno = errno;

   // Find the PID of the child, which main set up before installing us as a
   // signal handler.
   const pid_t child_pid = global_child_pid.load(std::memory_order_relaxed);

   // Figure out whether we should send a SIGTERM here. If so, we won't want to
   // next time we're called.
   const bool need_polite_sigterm =
       global_need_polite_sigterm.fetch_and(0, std::memory_order_relaxed);

   if (need_polite_sigterm) {
     // Make a polite request, then arrange to be called again after a delay,
     // at which point we'll send SIGKILL.
     //
     // Note that main sets us up as the signal handler for SIGALRM, and
     // arranges for this code path to be taken only if kill_delay_secs > 0.
     kill(child_pid, SIGTERM);
     alarm(opt.kill_delay_secs);
   } else {
     // We're not supposed to ask politely: simply forcibly kill the child.
     kill(child_pid, SIGKILL);
   }

   errno = saved_errno;
 }

 static pid_t SpawnPid1() {
   const int kStackSize = 1024 * 1024;
   std::vector<char> child_stack(kStackSize);

   PRINT_DEBUG("calling pipe(2)...");

   int pipe_from_child[2], pipe_to_child[2];
   if (pipe(pipe_from_child) < 0) {
     DIE("pipe");
   }
   if (pipe(pipe_to_child) < 0) {
       DIE("pipe");
     }

   int clone_flags =
       CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWPID | SIGCHLD;
   PRINT_DEBUG("Netns is %d", opt.create_netns);
   if (opt.create_netns != NO_NETNS) {
     clone_flags |= CLONE_NEWNET;
   }
   if (opt.fake_hostname) {
     clone_flags |= CLONE_NEWUTS;
   }

   // We use clone instead of unshare, because unshare sometimes fails with
   // EINVAL due to a race condition in the Linux kernel (see
   // https://lkml.org/lkml/2015/7/28/833).
   PRINT_DEBUG("calling clone(2)...");

   Pid1Args pid1Args;
   pid1Args.pipe_to_parent = pipe_from_child;
   pid1Args.pipe_from_parent = pipe_to_child;
   const pid_t child_pid = clone(Pid1Main, child_stack.data() + kStackSize,
                                 clone_flags, &pid1Args);

   if (child_pid < 0) {
     DIE("clone");
   }

   MaybeAddChildProcessToCgroup(child_pid);
   // Signal the child that it can now proceed to spawn pid2.
   SignalPipe(pipe_to_child);

   PRINT_DEBUG("linux-sandbox-pid1 has PID %d", child_pid);

   // Wait for a signal from the child linux-sandbox-pid1 process; this proves to
   // the child process that we still existed after it ran
   // prctl(PR_SET_PDEATHSIG, SIGKILL), thus preventing a race condition where
   // the parent is killed before that call was made.
   WaitPipe(pipe_from_child);

   PRINT_DEBUG("done manipulating pipes");

   return child_pid;
 }

 // Blocks until the child `child_pid` terminates and turns its fate into our
 // own exit code: 128 + N if it was killed by signal N, otherwise its exit
 // status. When opt.stats_path is set, the resource usage reported by wait4(2)
 // is written there first.
 //
 // The wait is abandoned if a failed wait4(2) is followed by getppid() no
 // longer matching initial_ppid. In that case a zeroed status and usage are
 // reported, which yields exit code 0.
 static int WaitForPid1(const pid_t child_pid) {
   // Zero-initialize: the loop below can be left without wait4(2) ever having
   // succeeded (see the getppid() check), and these must not be read while
   // still indeterminate.
   int child_status = 0;
   struct rusage child_rusage = {};

   // Wait for the child to exit, obtaining usage information. Restart in the
   // case of a signal interrupting us.
   while (true) {
     const int ret = wait4(child_pid, &child_status, 0, &child_rusage);
     if (ret > 0) {
       break;
     }

     // We've been handed off to a reaper process and should die.
     if (getppid() != initial_ppid) {
       break;
     }

     if (errno == EINTR) {
       continue;
     }

     DIE("wait4");
   }

   // If we're supposed to write stats to a file, do so now.
   if (!opt.stats_path.empty()) {
     WriteStatsToFile(&child_rusage, opt.stats_path);
   }

   // We want to exit in the same manner as the child.
   if (WIFSIGNALED(child_status)) {
     const int signal = WTERMSIG(child_status);
     PRINT_DEBUG("child exited due to receiving signal: %s", strsignal(signal));
     return 128 + signal;
   }

   const int exit_code = WEXITSTATUS(child_status);
   PRINT_DEBUG("child exited normally with code %d", exit_code);
   return exit_code;
 }

 // Entry point. Parses the command-line options, spawns linux-sandbox-pid1 via
 // SpawnPid1, sets up timeout and termination signal handling, then waits for
 // the child and returns the exit code computed by WaitForPid1.
 int main(int argc, char *argv[]) {
   // Ask the kernel to kill us with SIGKILL if our parent dies.
   if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
     DIE("prctl");
   }

   // Parse our command-line options.
   ParseOptions(argc, argv);

   // Open the file PRINT_DEBUG writes to.
   // Must happen early enough so we don't lose any debugging output.
   if (!opt.debug_path.empty()) {
     global_debug = fopen(opt.debug_path.c_str(), "w");
     if (!global_debug) {
       DIE("fopen(%s)", opt.debug_path.c_str());
     }
   }

   // Start with default signal actions and a clear signal mask.
   ClearSignalMask();

   // Ignore SIGTTIN and SIGTTOU, as we hand off the terminal to the child in
   // SpawnChild.
   // NOTE(review): SpawnChild is not defined in this file; presumably it lives
   // in the pid1 code -- confirm.
   IgnoreSignal(SIGTTIN);
   IgnoreSignal(SIGTTOU);

   // Remember the parent pid so we can exit if the parent has exited.
   // Doing this before prctl(PR_SET_PDEATHSIG, 0) ensures no race condition.
   initial_ppid = getppid();

   // When opt.persistent_process is set, cancel the parent-death signal
   // requested at the top of main. WaitForPid1 still compares getppid()
   // against initial_ppid.
   if (opt.persistent_process) {
     if (prctl(PR_SET_PDEATHSIG, 0) < 0) {
       DIE("prctl");
     }
   }

   // Redirect output as requested.
   Redirect(opt.stdout_path, STDOUT_FILENO);
   Redirect(opt.stderr_path, STDERR_FILENO);

   // Set up two globals used by the child process.
   global_outer_uid = getuid();
   global_outer_gid = getgid();

   // Ensure we don't pass on any FDs from our parent to our child other than
   // stdin, stdout, stderr and global_debug.
   CloseFds();

   // Spawn the child that will fork the sandboxed program with fresh
   // namespaces etc.
   const pid_t child_pid = SpawnPid1();

   // Let the signal handlers installed below know the PID of the child.
   global_child_pid.store(child_pid, std::memory_order_relaxed);

   // If a kill delay has been configured, let the signal handlers installed
   // below know that it needs to be respected.
   if (opt.kill_delay_secs > 0) {
     global_need_polite_sigterm.store(1, std::memory_order_relaxed);
   }

   // OnTimeoutOrTerm, which is used for other signals below, assumes that it
   // handles SIGALRM. We also explicitly invoke it after the timeout using
   // alarm(2).
   InstallSignalHandler(SIGALRM, OnTimeoutOrTerm);

   // If requested, arrange for the child to be killed (optionally after being
   // asked politely to terminate) once the timeout expires.
   //
   // Note that it's important to set this up before support for SIGTERM and
   // SIGINT. Otherwise one of those signals could arrive before we get here,
   // and then we would reset its opt.kill_delay_secs interval timer.
   if (opt.timeout_secs > 0) {
     alarm(opt.timeout_secs);
   }

   // Also ask/tell the child to quit on SIGTERM, and optionally for SIGINT
   // too.
   InstallSignalHandler(SIGTERM, OnTimeoutOrTerm);
   if (opt.sigint_sends_sigterm) {
     InstallSignalHandler(SIGINT, OnTimeoutOrTerm);
   }

   // Wait for the child to exit, returning an appropriate status.
   return WaitForPid1(child_pid);
 }
	// Copyright 2016 The Bazel Authors. All rights reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	/**
	* linux-sandbox runs commands in a restricted environment where they are
	* subject to a few rules:
	*
	* - The entire filesystem is made read-only.
	* - The working directory (-W) will be made read-write, though.
	* - Individual files or directories can be made writable (but not deletable)
	* (-w).
	* - If the process takes longer than the timeout (-T), it will be killed with
	* SIGTERM. If it does not exit within the grace period (-t), it and all of
	* its children will be killed with SIGKILL.
	* - tmpfs can be mounted on top of existing directories (-e).
	* - If option -R is passed, the process will run as user 'root'.
	* - If option -U is passed, the process will run as user 'nobody'.
	* - Otherwise, the process runs using the current uid / gid.
	* - If linux-sandbox itself gets killed, the process and all of its children
	* will be killed.
	* - If linux-sandbox's parent dies, it will kill itself, the process and all
	* the children.
	* - Network access is allowed, but can be disabled via -N.
	* - The hostname and domainname will be set to "sandbox".
	* - The process runs in its own PID namespace, so other processes on the
	* system are invisible.
	*/

	#include "src/main/tools/linux-sandbox.h"

	#include <ctype.h>
	#include <dirent.h>
	#include <errno.h>
	#include <fcntl.h>
	#include <math.h>
	#include <sched.h>
	#include <signal.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/prctl.h>
	#include <sys/resource.h>
	#include <sys/stat.h>
	#include <sys/time.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#include <atomic>
	#include <string>
	#include <vector>

	#include "src/main/tools/linux-sandbox-options.h"
	#include "src/main/tools/linux-sandbox-pid1.h"
	#include "src/main/tools/logging.h"
	#include "src/main/tools/process-tools.h"

	// The uid and gid returned by getuid()/getgid(); main() records them here
	// before it spawns the child.
	uid_t global_outer_uid;
	gid_t global_outer_gid;

	// The PID of our child process, for use in signal handlers.
	static std::atomic<pid_t> global_child_pid{0};
	// Our parent's pid at the outset, to check if the original parent has exited.
	pid_t initial_ppid;

	// Must we politely ask the child to exit before we send it a SIGKILL (once we
	// want it to exit)? Holds only zero or one.
	static std::atomic<int> global_need_polite_sigterm{false};

	// Both atomics are accessed from OnTimeoutOrTerm, which is installed as a
	// signal handler. When compiling as C++17 or later, check at compile time
	// that they are always lock-free.
	#if __cplusplus >= 201703L
	static_assert(global_child_pid.is_always_lock_free);
	static_assert(global_need_polite_sigterm.is_always_lock_free);
	#endif

	// Make sure the child process does not inherit any accidentally left open
	// file handles from our parent: closes every descriptor listed in
	// /proc/self/fd except stdin, stdout, stderr, the debug output and the
	// directory handle used for the listing itself.
	static void CloseFds() {
	  DIR *fds = opendir("/proc/self/fd");
	  if (fds == nullptr) {
	    DIE("opendir");
	  }

	  while (true) {
	    // readdir returns nullptr both at the end of the directory and on
	    // error; errno is cleared first so the two cases can be told apart.
	    errno = 0;
	    struct dirent *dent = readdir(fds);

	    if (dent == nullptr) {
	      if (errno != 0) {
	        DIE("readdir");
	      }
	      break;
	    }

	    if (isdigit(dent->d_name[0])) {
	      errno = 0;
	      int fd = strtol(dent->d_name, nullptr, 10);

	      // (1) Skip unparseable entries.
	      // (2) Close everything except stdin, stdout, stderr and debug output.
	      // (3) Do not accidentally close our directory handle.
	      if (errno == 0 && fd > STDERR_FILENO &&
	          (global_debug == nullptr || fd != fileno(global_debug)) &&
	          fd != dirfd(fds)) {
	        if (close(fd) < 0) {
	          DIE("close");
	        }
	      }
	    }
	  }

	  if (closedir(fds) < 0) {
	    DIE("closedir");
	  }
	}

	// Writes `pid` to the cgroup.procs file inside each directory listed in
	// opt.cgroups_dirs. Does nothing when that list is empty.
	static void MaybeAddChildProcessToCgroup(const pid_t pid) {
	  for (const std::string &dir : opt.cgroups_dirs) {
	    PRINT_DEBUG("Adding process %d to cgroups dir %s", pid, dir.c_str());
	    const std::string procs_file = dir + "/cgroup.procs";
	    WriteFile(procs_file, "%d", pid);
	  }
	}

	// Signal handler that main installs for SIGALRM, SIGTERM and (optionally)
	// SIGINT. The signal number is ignored.
	//
	// If global_need_polite_sigterm is still set, the child is sent SIGTERM, the
	// flag is cleared, and alarm(2) is armed so that we run again after
	// opt.kill_delay_secs. On every other invocation the child is sent SIGKILL.
	static void OnTimeoutOrTerm(int) {
	  // kill(2) may overwrite errno. This handler can run at any point in the
	  // main flow, including between wait4(2) failing and the errno check that
	  // follows it, so preserve the interrupted code's errno.
	  const int saved_errno = errno;

	  // Find the PID of the child, which main set up before installing us as a
	  // signal handler.
	  const pid_t child_pid = global_child_pid.load(std::memory_order_relaxed);

	  // Figure out whether we should send a SIGTERM here. If so, we won't want
	  // to next time we're called.
	  const bool need_polite_sigterm =
	      global_need_polite_sigterm.fetch_and(0, std::memory_order_relaxed);

	  if (need_polite_sigterm) {
	    // Make a polite request, then arrange to be called again after a delay,
	    // at which point we'll send SIGKILL.
	    //
	    // Note that main sets us up as the signal handler for SIGALRM, and
	    // arranges for this code path to be taken only if kill_delay_secs > 0.
	    kill(child_pid, SIGTERM);
	    alarm(opt.kill_delay_secs);
	  } else {
	    // We're not supposed to ask politely: simply forcibly kill the child.
	    kill(child_pid, SIGKILL);
	  }

	  errno = saved_errno;
	}

	// Starts Pid1Main in a cloned process with new user, mount, IPC and PID
	// namespaces (plus network and UTS namespaces when the options ask for
	// them), then completes a two-pipe startup handshake with it. Returns the
	// PID that clone(2) reported. Calls DIE if pipe(2) or clone(2) fails.
	static pid_t SpawnPid1() {
	  const int kStackSize = 1024 * 1024;
	  std::vector<char> child_stack(kStackSize);

	  PRINT_DEBUG("calling pipe(2)...");

	  int pipe_from_child[2], pipe_to_child[2];
	  if (pipe(pipe_from_child) < 0) {
	    DIE("pipe");
	  }
	  if (pipe(pipe_to_child) < 0) {
	    DIE("pipe");
	  }

	  int clone_flags =
	      CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWPID | SIGCHLD;
	  PRINT_DEBUG("Netns is %d", opt.create_netns);
	  if (opt.create_netns != NO_NETNS) {
	    clone_flags |= CLONE_NEWNET;
	  }
	  if (opt.fake_hostname) {
	    clone_flags |= CLONE_NEWUTS;
	  }

	  // We use clone instead of unshare, because unshare sometimes fails with
	  // EINVAL due to a race condition in the Linux kernel (see
	  // https://lkml.org/lkml/2015/7/28/833).
	  PRINT_DEBUG("calling clone(2)...");

	  Pid1Args pid1Args;
	  pid1Args.pipe_to_parent = pipe_from_child;
	  pid1Args.pipe_from_parent = pipe_to_child;
	  // The end of the buffer is what gets passed as the child's stack pointer.
	  const pid_t child_pid = clone(Pid1Main, child_stack.data() + kStackSize,
	                                clone_flags, &pid1Args);

	  if (child_pid < 0) {
	    DIE("clone");
	  }

	  MaybeAddChildProcessToCgroup(child_pid);
	  // Signal the child that it can now proceed to spawn pid2.
	  SignalPipe(pipe_to_child);

	  PRINT_DEBUG("linux-sandbox-pid1 has PID %d", child_pid);

	  // Wait for a signal from the child linux-sandbox-pid1 process; this proves
	  // to the child process that we still existed after it ran
	  // prctl(PR_SET_PDEATHSIG, SIGKILL), thus preventing a race condition where
	  // the parent is killed before that call was made.
	  WaitPipe(pipe_from_child);

	  PRINT_DEBUG("done manipulating pipes");

	  return child_pid;
	}

	// Blocks until the child `child_pid` terminates and turns its fate into our
	// own exit code: 128 + N if it was killed by signal N, otherwise its exit
	// status. When opt.stats_path is set, the resource usage reported by
	// wait4(2) is written there first.
	//
	// The wait is abandoned if a failed wait4(2) is followed by getppid() no
	// longer matching initial_ppid. In that case a zeroed status and usage are
	// reported, which yields exit code 0.
	static int WaitForPid1(const pid_t child_pid) {
	  // Zero-initialize: the loop below can be left without wait4(2) ever having
	  // succeeded (see the getppid() check), and these must not be read while
	  // still indeterminate.
	  int child_status = 0;
	  struct rusage child_rusage = {};

	  // Wait for the child to exit, obtaining usage information. Restart in the
	  // case of a signal interrupting us.
	  while (true) {
	    const int ret = wait4(child_pid, &child_status, 0, &child_rusage);
	    if (ret > 0) {
	      break;
	    }

	    // We've been handed off to a reaper process and should die.
	    if (getppid() != initial_ppid) {
	      break;
	    }

	    if (errno == EINTR) {
	      continue;
	    }

	    DIE("wait4");
	  }

	  // If we're supposed to write stats to a file, do so now.
	  if (!opt.stats_path.empty()) {
	    WriteStatsToFile(&child_rusage, opt.stats_path);
	  }

	  // We want to exit in the same manner as the child.
	  if (WIFSIGNALED(child_status)) {
	    const int signal = WTERMSIG(child_status);
	    PRINT_DEBUG("child exited due to receiving signal: %s",
	                strsignal(signal));
	    return 128 + signal;
	  }

	  const int exit_code = WEXITSTATUS(child_status);
	  PRINT_DEBUG("child exited normally with code %d", exit_code);
	  return exit_code;
	}

	// Entry point. Parses the command-line options, spawns linux-sandbox-pid1 via
	// SpawnPid1, sets up timeout and termination signal handling, then waits for
	// the child and returns the exit code computed by WaitForPid1.
	int main(int argc, char *argv[]) {
	// Ask the kernel to kill us with SIGKILL if our parent dies.
	if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
	DIE("prctl");
	}

	// Parse our command-line options.
	ParseOptions(argc, argv);

	// Open the file PRINT_DEBUG writes to.
	// Must happen early enough so we don't lose any debugging output.
	if (!opt.debug_path.empty()) {
	global_debug = fopen(opt.debug_path.c_str(), "w");
	if (!global_debug) {
	DIE("fopen(%s)", opt.debug_path.c_str());
	}
	}

	// Start with default signal actions and a clear signal mask.
	ClearSignalMask();

	// Ignore SIGTTIN and SIGTTOU, as we hand off the terminal to the child in
	// SpawnChild.
	// NOTE(review): SpawnChild is not defined in this file; presumably it lives
	// in the pid1 code -- confirm.
	IgnoreSignal(SIGTTIN);
	IgnoreSignal(SIGTTOU);

	// Remember the parent pid so we can exit if the parent has exited.
	// Doing this before prctl(PR_SET_PDEATHSIG, 0) ensures no race condition.
	initial_ppid = getppid();

	// When opt.persistent_process is set, cancel the parent-death signal
	// requested at the top of main. WaitForPid1 still compares getppid()
	// against initial_ppid.
	if (opt.persistent_process) {
	if (prctl(PR_SET_PDEATHSIG, 0) < 0) {
	DIE("prctl");
	}
	}

	// Redirect output as requested.
	Redirect(opt.stdout_path, STDOUT_FILENO);
	Redirect(opt.stderr_path, STDERR_FILENO);

	// Set up two globals used by the child process.
	global_outer_uid = getuid();
	global_outer_gid = getgid();

	// Ensure we don't pass on any FDs from our parent to our child other than
	// stdin, stdout, stderr and global_debug.
	CloseFds();

	// Spawn the child that will fork the sandboxed program with fresh
	// namespaces etc.
	const pid_t child_pid = SpawnPid1();

	// Let the signal handlers installed below know the PID of the child.
	global_child_pid.store(child_pid, std::memory_order_relaxed);

	// If a kill delay has been configured, let the signal handlers installed
	// below know that it needs to be respected.
	if (opt.kill_delay_secs > 0) {
	global_need_polite_sigterm.store(1, std::memory_order_relaxed);
	}

	// OnTimeoutOrTerm, which is used for other signals below, assumes that it
	// handles SIGALRM. We also explicitly invoke it after the timeout using
	// alarm(2).
	InstallSignalHandler(SIGALRM, OnTimeoutOrTerm);

	// If requested, arrange for the child to be killed (optionally after being
	// asked politely to terminate) once the timeout expires.
	//
	// Note that it's important to set this up before support for SIGTERM and
	// SIGINT. Otherwise one of those signals could arrive before we get here,
	// and then we would reset its opt.kill_delay_secs interval timer.
	if (opt.timeout_secs > 0) {
	alarm(opt.timeout_secs);
	}

	// Also ask/tell the child to quit on SIGTERM, and optionally for SIGINT
	// too.
	InstallSignalHandler(SIGTERM, OnTimeoutOrTerm);
	if (opt.sigint_sends_sigterm) {
	InstallSignalHandler(SIGINT, OnTimeoutOrTerm);
	}

	// Wait for the child to exit, returning an appropriate status.
	return WaitForPid1(child_pid);
	}