Systemd/src/nspawn/nspawn-stub-pid1.c
Lennart Poettering 335d2eadca nspawn: don't become TTY controller just to undo it later again
Instead of first becoming a controlling process of the payload pty
as side effect of opening it (without O_NOCTTY), and then possibly
dropping it again, let's do it cleanly an reverse the logic: let's open
the pty without becoming its controller first. Only after everything
went the way we wanted it to go become the controller explicitly.

This has the benefit that the PID 1 stub process we run (as effect of
--as-pid2) doesn't have to lose the tty explicitly, but can just
continue running with things. And we explicitly make the tty controlling
right before invoking actual payload.

In order to make sure everything works as expected validate that the
stub PID 1 in the container really has no conrolling tty by issuing the
TIOCNOTTY tty and expecting ENOTTY, and log about it.

This shouldn't change behaviour much, it just makes thins a bit cleaner,
in particular as we'll not trigger SIGHUP on ourselves (since we are
controller and session leader) due to TIOCNOTTY which we then have to
explicitly ignore.
2020-09-17 16:39:23 +02:00

201 lines
7.7 KiB
C

/* SPDX-License-Identifier: LGPL-2.1+ */
#include <sys/ioctl.h>
#include <sys/reboot.h>
#include <sys/wait.h>
#include <sys/prctl.h>
#include <unistd.h>
#include "def.h"
#include "exit-status.h"
#include "fd-util.h"
#include "log.h"
#include "nspawn-stub-pid1.h"
#include "process-util.h"
#include "signal-util.h"
#include "time-util.h"
static int reset_environ(const char *new_environment, size_t length) {
unsigned long start, end;
start = (unsigned long) new_environment;
end = start + length;
if (prctl(PR_SET_MM, PR_SET_MM_ENV_START, start, 0, 0) < 0)
return -errno;
if (prctl(PR_SET_MM, PR_SET_MM_ENV_END, end, 0, 0) < 0)
return -errno;
return 0;
}
int stub_pid1(sd_id128_t uuid) {
enum {
STATE_RUNNING,
STATE_REBOOT,
STATE_POWEROFF,
} state = STATE_RUNNING;
sigset_t fullmask, oldmask, waitmask;
usec_t quit_usec = USEC_INFINITY;
pid_t pid;
int r;
/* The new environment we set up, on the stack. */
char new_environment[] =
"container=systemd-nspawn\0"
"container_uuid=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";
/* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful
* for allowing arbitrary processes run in a container, and still have all zombies reaped. */
assert_se(sigfillset(&fullmask) >= 0);
assert_se(sigprocmask(SIG_BLOCK, &fullmask, &oldmask) >= 0);
pid = fork();
if (pid < 0)
return log_error_errno(errno, "Failed to fork child pid: %m");
if (pid == 0) {
/* Return in the child */
assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0);
if (setsid() < 0)
return log_error_errno(errno, "Failed to become session leader in payload process: %m");
return 0;
}
reset_all_signal_handlers();
log_close();
(void) close_all_fds(NULL, 0);
log_open();
if (ioctl(STDIN_FILENO, TIOCNOTTY) < 0) {
if (errno != ENOTTY)
log_warning_errno(errno, "Unexpected error from TIOCNOTTY ioctl in init stub process, ignoring: %m");
} else
log_warning("Expected TIOCNOTTY to fail, but it succeeded in init stub process, ignoring.");
/* Flush out /proc/self/environ, so that we don't leak the environment from the host into the container. Also,
* set $container= and $container_uuid= so that clients in the container that query it from /proc/1/environ
* find them set. */
sd_id128_to_string(uuid, new_environment + sizeof(new_environment) - SD_ID128_STRING_MAX);
reset_environ(new_environment, sizeof(new_environment));
(void) rename_process("(sd-stubinit)");
assert_se(sigemptyset(&waitmask) >= 0);
assert_se(sigset_add_many(&waitmask,
SIGCHLD, /* posix: process died */
SIGINT, /* sysv: ctrl-alt-del */
SIGRTMIN+3, /* systemd: halt */
SIGRTMIN+4, /* systemd: poweroff */
SIGRTMIN+5, /* systemd: reboot */
SIGRTMIN+6, /* systemd: kexec */
SIGRTMIN+13, /* systemd: halt */
SIGRTMIN+14, /* systemd: poweroff */
SIGRTMIN+15, /* systemd: reboot */
SIGRTMIN+16, /* systemd: kexec */
-1) >= 0);
/* Note that we ignore SIGTERM (sysv's reexec), SIGHUP (reload), and all other signals here, since we don't
* support reexec/reloading in this stub process. */
for (;;) {
siginfo_t si;
usec_t current_usec;
si.si_pid = 0;
r = waitid(P_ALL, 0, &si, WEXITED|WNOHANG);
if (r < 0) {
r = log_error_errno(errno, "Failed to reap children: %m");
goto finish;
}
current_usec = now(CLOCK_MONOTONIC);
if (si.si_pid == pid || current_usec >= quit_usec) {
/* The child we started ourselves died or we reached a timeout. */
if (state == STATE_REBOOT) { /* dispatch a queued reboot */
(void) reboot(RB_AUTOBOOT);
r = log_error_errno(errno, "Failed to reboot: %m");
goto finish;
} else if (state == STATE_POWEROFF)
(void) reboot(RB_POWER_OFF); /* if this fails, fall back to normal exit. */
if (si.si_pid == pid && si.si_code == CLD_EXITED)
r = si.si_status; /* pass on exit code */
else
r = EXIT_EXCEPTION; /* signal, coredump, timeout, … */
goto finish;
}
if (si.si_pid != 0)
/* We reaped something. Retry until there's nothing more to reap. */
continue;
if (quit_usec == USEC_INFINITY)
r = sigwaitinfo(&waitmask, &si);
else {
struct timespec ts;
r = sigtimedwait(&waitmask, &si, timespec_store(&ts, quit_usec - current_usec));
}
if (r < 0) {
if (errno == EINTR) /* strace -p attach can result in EINTR, let's handle this nicely. */
continue;
if (errno == EAGAIN) /* timeout reached */
continue;
r = log_error_errno(errno, "Failed to wait for signal: %m");
goto finish;
}
if (si.si_signo == SIGCHLD)
continue; /* Let's reap this */
if (state != STATE_RUNNING)
continue;
/* Would love to use a switch() statement here, but SIGRTMIN is actually a function call, not a
* constant… */
if (si.si_signo == SIGRTMIN+3 ||
si.si_signo == SIGRTMIN+4 ||
si.si_signo == SIGRTMIN+13 ||
si.si_signo == SIGRTMIN+14)
state = STATE_POWEROFF;
else if (si.si_signo == SIGINT ||
si.si_signo == SIGRTMIN+5 ||
si.si_signo == SIGRTMIN+6 ||
si.si_signo == SIGRTMIN+15 ||
si.si_signo == SIGRTMIN+16)
state = STATE_REBOOT;
else
assert_not_reached("Got unexpected signal");
r = kill_and_sigcont(pid, SIGTERM);
/* Let's send a SIGHUP after the SIGTERM, as shells tend to ignore SIGTERM but do react to SIGHUP. We
* do it strictly in this order, so that the SIGTERM is dispatched first, and SIGHUP second for those
* processes which handle both. That's because services tend to bind configuration reload or something
* else to SIGHUP. */
if (r != -ESRCH)
(void) kill(pid, SIGHUP);
quit_usec = now(CLOCK_MONOTONIC) + DEFAULT_TIMEOUT_USEC;
}
finish:
_exit(r < 0 ? EXIT_FAILURE : r);
}