nspawn: optionally run a stub init process as PID 1

This adds a new switch --as-pid2, which allows running commands as PID 2, while a stub init process is run as PID 1.
This is useful in order to run arbitrary commands in a container, as PID1's semantics are different from all other
processes regarding reaping of unknown children or signal handling.
This commit is contained in:
Lennart Poettering 2016-02-03 20:32:06 +01:00
parent 021dd87bc0
commit 7732f92bad
9 changed files with 426 additions and 43 deletions

View file

@ -2944,6 +2944,8 @@ systemd_nspawn_SOURCES = \
src/nspawn/nspawn-register.h \
src/nspawn/nspawn-setuid.c \
src/nspawn/nspawn-setuid.h \
src/nspawn/nspawn-stub-pid1.c \
src/nspawn/nspawn-stub-pid1.h \
src/core/mount-setup.c \
src/core/mount-setup.h \
src/core/loopback-setup.c \

View file

@ -248,16 +248,69 @@
<option>--ephemeral</option>.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>-a</option></term>
<term><option>--as-pid2</option></term>
<listitem><para>Invoke the shell or specified program as process ID (PID) 2 instead of PID 1 (init). By
default, if neither this option nor <option>--boot</option> is used, the selected binary is run as process with
PID 1, a mode only suitable for programs that are aware of the special semantics that the process with PID 1
has on UNIX. For example, it needs to reap all processes reparented to it, and should implement
<command>sysvinit</command> compatible signal handling (specifically: it needs to reboot on SIGINT, reexecute
on SIGTERM, reload configuration on SIGHUP, and so on). With <option>--as-pid2</option> a minimal stub init
process is run as PID 1 and the selected binary is executed as PID 2 (and hence does not need to implement any
special semantics). The stub init process will reap processes as necessary and react appropriately to
signals. It is recommended to use this mode to invoke arbitrary commands in containers, unless they have been
modified to run correctly as PID 1. Or in other words: this switch should be used for pretty much all commands,
except when the command refers to an init or shell implementation, as these are generally capable of running
correctly as PID 1. This option may not be combined with <option>--boot</option> or
<option>--share-system</option>.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-b</option></term>
<term><option>--boot</option></term>
<listitem><para>Automatically search for an init binary and
invoke it instead of a shell or a user supplied program. If
this option is used, arguments specified on the command line
are used as arguments for the init binary. This option may not
be combined with <option>--share-system</option>.
</para></listitem>
<listitem><para>Automatically search for an init binary and invoke it as PID 1, instead of a shell or a user
supplied program. If this option is used, arguments specified on the command line are used as arguments for the
init binary. This option may not be combined with <option>--as-pid2</option> or
<option>--share-system</option>.</para>
<para>The following table explains the different modes of invocation and relationship to
<option>--as-pid2</option> (see above):</para>
<table>
<title>Invocation Mode</title>
<tgroup cols='2' align='left' colsep='1' rowsep='1'>
<colspec colname="switch" />
<colspec colname="explanation" />
<thead>
<row>
<entry>Switch</entry>
<entry>Explanation</entry>
</row>
</thead>
<tbody>
<row>
<entry>Neither <option>--as-pid2</option> nor <option>--boot</option> specified</entry>
<entry>The passed parameters are interpreted as command line, which is executed as PID 1 in the container.</entry>
</row>
<row>
<entry><option>--as-pid2</option> specified</entry>
<entry>The passed parameters are interpreted as command line, which is executed as PID 2 in the container. A stub init process is run as PID 1.</entry>
</row>
<row>
<entry><option>--boot</option> specified</entry>
<entry>An init binary is automatically searched for and run as PID 1 in the container. The passed parameters are used as invocation parameters for this process.</entry>
</row>
</tbody>
</tgroup>
</table>
</listitem>
</varlistentry>
<varlistentry>

View file

@ -141,15 +141,21 @@
<varlistentry>
<term><varname>Boot=</varname></term>
<listitem><para>Takes a boolean argument, which defaults to off. If
enabled, <command>systemd-nspawn</command> will automatically
search for an <filename>init</filename> executable and invoke
it. In this case, the specified parameters using
<varname>Parameters=</varname> are passed as additional
arguments to the <filename>init</filename> process. This
setting corresponds to the <option>--boot</option> switch on
the <command>systemd-nspawn</command> command
line. </para></listitem>
<listitem><para>Takes a boolean argument, which defaults to off. If enabled, <command>systemd-nspawn</command>
will automatically search for an <filename>init</filename> executable and invoke it. In this case, the
specified parameters using <varname>Parameters=</varname> are passed as additional arguments to the
<filename>init</filename> process. This setting corresponds to the <option>--boot</option> switch on the
<command>systemd-nspawn</command> command line. This option may not be combined with
<varname>ProcessTwo=yes</varname>.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>ProcessTwo=</varname></term>
<listitem><para>Takes a boolean argument, which defaults to off. If enabled, the specified program is run as
PID 2. A stub init process is run as PID 1. This setting corresponds to the <option>--as-pid2</option> switch
on the <command>systemd-nspawn</command> command line. This option may not be combined with
<varname>Boot=yes</varname>.</para></listitem>
</varlistentry>
<varlistentry>

View file

@ -15,7 +15,8 @@ struct ConfigPerfItem;
%struct-type
%includes
%%
Exec.Boot, config_parse_tristate, 0, offsetof(Settings, boot)
Exec.Boot, config_parse_boot, 0, 0
Exec.ProcessTwo, config_parse_pid2, 0, 0,
Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters)
Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment)
Exec.User, config_parse_string, 0, offsetof(Settings, user)

View file

@ -24,6 +24,7 @@
#include "conf-parser.h"
#include "nspawn-network.h"
#include "nspawn-settings.h"
#include "parse-util.h"
#include "process-util.h"
#include "strv.h"
#include "util.h"
@ -39,7 +40,7 @@ int settings_load(FILE *f, const char *path, Settings **ret) {
if (!s)
return -ENOMEM;
s->boot = -1;
s->start_mode = _START_MODE_INVALID;
s->personality = PERSONALITY_INVALID;
s->read_only = -1;
@ -303,3 +304,93 @@ int config_parse_veth_extra(
return 0;
}
/* Config parser callback for the Boot= setting.
 *
 * rvalue is parsed as a boolean. If true, the start mode becomes START_BOOT, unless START_PID2 was selected
 * before, which is a conflict. If false, the start mode becomes START_PID1, but only if no start mode was
 * selected yet; a previously selected START_BOOT is a conflict.
 *
 * Unparsable values and conflicts are logged and ignored. The function always returns 0. */
int config_parse_boot(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        Settings *settings = data;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = parse_boolean(rvalue);
        if (r < 0) {
                log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse Boot= parameter %s, ignoring: %m", rvalue);
                return 0;
        }

        if (r > 0) {
                if (settings->start_mode == START_PID2)
                        goto conflict;

                settings->start_mode = START_BOOT;
        } else {
                if (settings->start_mode == START_BOOT)
                        goto conflict;

                /* Boot=no only fills in the default, it never overrides an earlier selection. */
                if (settings->start_mode < 0)
                        settings->start_mode = START_PID1;
        }

        return 0;

conflict:
        /* At this point r holds the parsed boolean (0 or 1), not an error code, hence pass 0 as error. */
        log_syntax(unit, LOG_ERR, filename, line, 0, "Conflicting Boot= or ProcessTwo= setting found. Ignoring.");
        return 0;
}
/* Config parser callback for the ProcessTwo= setting.
 *
 * rvalue is parsed as a boolean. If true, the start mode becomes START_PID2, unless START_BOOT was selected
 * before, which is a conflict. If false, the start mode becomes START_PID1, but only if no start mode was
 * selected yet; a previously selected START_PID2 is a conflict.
 *
 * Unparsable values and conflicts are logged and ignored. The function always returns 0. */
int config_parse_pid2(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        Settings *settings = data;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = parse_boolean(rvalue);
        if (r < 0) {
                log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse ProcessTwo= parameter %s, ignoring: %m", rvalue);
                return 0;
        }

        if (r > 0) {
                if (settings->start_mode == START_BOOT)
                        goto conflict;

                settings->start_mode = START_PID2;
        } else {
                if (settings->start_mode == START_PID2)
                        goto conflict;

                /* ProcessTwo=no only fills in the default, it never overrides an earlier selection. */
                if (settings->start_mode < 0)
                        settings->start_mode = START_PID1;
        }

        return 0;

conflict:
        /* At this point r holds the parsed boolean (0 or 1), not an error code, hence pass 0 as error. */
        log_syntax(unit, LOG_ERR, filename, line, 0, "Conflicting Boot= or ProcessTwo= setting found. Ignoring.");
        return 0;
}

View file

@ -27,26 +27,34 @@
#include "nspawn-expose-ports.h"
#include "nspawn-mount.h"
/* How the payload is started inside the container. The "invalid" sentinel is negative, so "mode < 0" can be
 * used to test whether a mode has been selected at all. */
typedef enum StartMode {
        _START_MODE_INVALID = -1,
        START_PID1,      /* The parameters are the command line, run as process 1 */
        START_PID2,      /* A stub init runs as PID 1, the parameters are the command line of process 2 */
        START_BOOT,      /* An init system is searched for, the parameters are passed on to it */
        _START_MODE_MAX
} StartMode;
/* Bit flags, one per group of settings. nspawn.c ORs a flag into arg_settings_mask when the matching option is
 * given on the command line, and load_settings() only copies a value over from the settings file if its flag
 * is not set in that mask.
 *
 * NOTE(review): this span appears to carry both the pre-change list (starting with SETTING_BOOT) and the
 * post-change list (starting with SETTING_START_MODE) of the diff; SETTING_START_MODE takes over bit 0 from
 * SETTING_BOOT. */
typedef enum SettingsMask {
SETTING_BOOT = 1 << 0,
SETTING_ENVIRONMENT = 1 << 1,
SETTING_USER = 1 << 2,
SETTING_CAPABILITY = 1 << 3,
SETTING_KILL_SIGNAL = 1 << 4,
SETTING_PERSONALITY = 1 << 5,
SETTING_MACHINE_ID = 1 << 6,
SETTING_NETWORK = 1 << 7,
SETTING_EXPOSE_PORTS = 1 << 8,
SETTING_READ_ONLY = 1 << 9,
SETTING_VOLATILE_MODE = 1 << 10,
SETTING_CUSTOM_MOUNTS = 1 << 11,
SETTING_START_MODE = 1 << 0,
SETTING_ENVIRONMENT = 1 << 1,
SETTING_USER = 1 << 2,
SETTING_CAPABILITY = 1 << 3,
SETTING_KILL_SIGNAL = 1 << 4,
SETTING_PERSONALITY = 1 << 5,
SETTING_MACHINE_ID = 1 << 6,
SETTING_NETWORK = 1 << 7,
SETTING_EXPOSE_PORTS = 1 << 8,
SETTING_READ_ONLY = 1 << 9,
SETTING_VOLATILE_MODE = 1 << 10,
SETTING_CUSTOM_MOUNTS = 1 << 11,
SETTING_WORKING_DIRECTORY = 1 << 12,
/* All 13 flag bits (0..12) set */
_SETTINGS_MASK_ALL = (1 << 13) -1
_SETTINGS_MASK_ALL = (1 << 13) -1
} SettingsMask;
typedef struct Settings {
/* [Run] */
int boot;
StartMode start_mode;
char **parameters;
char **environment;
char *user;
@ -91,3 +99,5 @@ int config_parse_volatile_mode(const char *unit, const char *filename, unsigned
int config_parse_bind(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_tmpfs(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_veth_extra(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);

View file

@ -0,0 +1,170 @@
/***
This file is part of systemd.
Copyright 2016 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <sys/reboot.h>
#include <sys/unistd.h>
#include <sys/wait.h>
#include "fd-util.h"
#include "log.h"
#include "nspawn-stub-pid1.h"
#include "process-util.h"
#include "signal-util.h"
#include "time-util.h"
#include "def.h"
int stub_pid1(void) {
enum {
STATE_RUNNING,
STATE_REBOOT,
STATE_POWEROFF,
} state = STATE_RUNNING;
sigset_t fullmask, oldmask, waitmask;
usec_t quit_usec = USEC_INFINITY;
pid_t pid;
int r;
/* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful
* for allowing arbitrary processes run in a container, and still have all zombies reaped. */
assert_se(sigfillset(&fullmask) >= 0);
assert_se(sigprocmask(SIG_BLOCK, &fullmask, &oldmask) >= 0);
pid = fork();
if (pid < 0)
return log_error_errno(errno, "Failed to fork child pid: %m");
if (pid == 0) {
/* Return in the child */
assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0);
setsid();
return 0;
}
reset_all_signal_handlers();
log_close();
close_all_fds(NULL, 0);
log_open();
rename_process("STUBINIT");
assert_se(sigemptyset(&waitmask) >= 0);
assert_se(sigset_add_many(&waitmask,
SIGCHLD, /* posix: process died */
SIGINT, /* sysv: ctrl-alt-del */
SIGRTMIN+3, /* systemd: halt */
SIGRTMIN+4, /* systemd: poweroff */
SIGRTMIN+5, /* systemd: reboot */
SIGRTMIN+6, /* systemd: kexec */
SIGRTMIN+13, /* systemd: halt */
SIGRTMIN+14, /* systemd: poweroff */
SIGRTMIN+15, /* systemd: reboot */
SIGRTMIN+16, /* systemd: kexec */
-1) >= 0);
/* Note that we ignore SIGTERM (sysv's reexec), SIGHUP (reload), and all other signals here, since we don't
* support reexec/reloading in this stub process. */
for (;;) {
siginfo_t si;
usec_t current_usec;
si.si_pid = 0;
r = waitid(P_ALL, 0, &si, WEXITED|WNOHANG);
if (r < 0) {
r = log_error_errno(errno, "Failed to reap children: %m");
goto finish;
}
current_usec = now(CLOCK_MONOTONIC);
if (si.si_pid == pid || current_usec >= quit_usec) {
/* The child we started ourselves died or we reached a timeout. */
if (state == STATE_REBOOT) { /* dispatch a queued reboot */
(void) reboot(RB_AUTOBOOT);
r = log_error_errno(errno, "Failed to reboot: %m");
goto finish;
} else if (state == STATE_POWEROFF)
(void) reboot(RB_POWER_OFF); /* if this fails, fall back to normal exit. */
if (si.si_pid == pid && si.si_code == CLD_EXITED)
r = si.si_status; /* pass on exit code */
else
r = 255; /* signal, coredump, timeout, … */
goto finish;
}
if (si.si_pid != 0)
/* We reaped something. Retry until there's nothing more to reap. */
continue;
if (quit_usec == USEC_INFINITY)
r = sigwaitinfo(&waitmask, &si);
else {
struct timespec ts;
r = sigtimedwait(&waitmask, &si, timespec_store(&ts, quit_usec - current_usec));
}
if (r < 0) {
if (errno == EINTR) /* strace -p attach can result in EINTR, let's handle this nicely. */
continue;
if (errno == EAGAIN) /* timeout reached */
continue;
r = log_error_errno(errno, "Failed to wait for signal: %m");
goto finish;
}
if (si.si_signo == SIGCHLD)
continue; /* Let's reap this */
if (state != STATE_RUNNING)
continue;
/* Would love to use a switch() statement here, but SIGRTMIN is actually a function call, not a
* constant */
if (si.si_signo == SIGRTMIN+3 ||
si.si_signo == SIGRTMIN+4 ||
si.si_signo == SIGRTMIN+13 ||
si.si_signo == SIGRTMIN+14)
state = STATE_POWEROFF;
else if (si.si_signo == SIGINT ||
si.si_signo == SIGRTMIN+5 ||
si.si_signo == SIGRTMIN+6 ||
si.si_signo == SIGRTMIN+15 ||
si.si_signo == SIGRTMIN+16)
state = STATE_REBOOT;
else
assert_not_reached("Got unexpected signal");
/* (void) kill_and_sigcont(pid, SIGTERM); */
quit_usec = now(CLOCK_MONOTONIC) + DEFAULT_TIMEOUT_USEC;
}
finish:
_exit(r < 0 ? EXIT_FAILURE : r);
}

View file

@ -0,0 +1,22 @@
#pragma once
/***
This file is part of systemd.
Copyright 2016 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
/* Forks off a child and turns the calling process into a minimal stub init. Returns 0 in the forked child,
 * which is expected to go on and execute the payload, and a negative errno-style value if fork() fails. In the
 * parent this call does not return: it reaps children and handles shutdown signals until it calls _exit(). */
int stub_pid1(void);

View file

@ -79,6 +79,7 @@
#include "nspawn-register.h"
#include "nspawn-settings.h"
#include "nspawn-setuid.h"
#include "nspawn-stub-pid1.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
@ -123,7 +124,7 @@ static const char *arg_selinux_apifs_context = NULL;
static const char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static bool arg_boot = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
@ -193,6 +194,7 @@ static void help(void) {
" -x --ephemeral Run container with snapshot of root directory, and\n"
" remove it after exit\n"
" -i --image=PATH File system device or disk image for the container\n"
" -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
" -b --boot Boot up full system (i.e. invoke init)\n"
" --chdir=PATH Set working directory in the container\n"
" -u --user=USER Run the command under specified user or uid\n"
@ -358,6 +360,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "ephemeral", no_argument, NULL, 'x' },
{ "user", required_argument, NULL, 'u' },
{ "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
{ "as-pid2", no_argument, NULL, 'a' },
{ "boot", no_argument, NULL, 'b' },
{ "uuid", required_argument, NULL, ARG_UUID },
{ "read-only", no_argument, NULL, ARG_READ_ONLY },
@ -404,7 +407,7 @@ static int parse_argv(int argc, char *argv[]) {
assert(argc >= 0);
assert(argv);
while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
switch (c) {
@ -495,8 +498,23 @@ static int parse_argv(int argc, char *argv[]) {
break;
case 'b':
arg_boot = true;
arg_settings_mask |= SETTING_BOOT;
if (arg_start_mode == START_PID2) {
log_error("--boot and --as-pid2 may not be combined.");
return -EINVAL;
}
arg_start_mode = START_BOOT;
arg_settings_mask |= SETTING_START_MODE;
break;
case 'a':
if (arg_start_mode == START_BOOT) {
log_error("--boot and --as-pid2 may not be combined.");
return -EINVAL;
}
arg_start_mode = START_PID2;
arg_settings_mask |= SETTING_START_MODE;
break;
case ARG_UUID:
@ -876,7 +894,7 @@ static int parse_argv(int argc, char *argv[]) {
if (arg_share_system)
arg_register = false;
if (arg_boot && arg_share_system) {
if (arg_start_mode != START_PID1 && arg_share_system) {
log_error("--boot and --share-system may not be combined.");
return -EINVAL;
}
@ -924,7 +942,7 @@ static int parse_argv(int argc, char *argv[]) {
if (!arg_parameters)
return log_oom();
arg_settings_mask |= SETTING_BOOT;
arg_settings_mask |= SETTING_START_MODE;
}
/* Load all settings from .nspawn files */
@ -960,7 +978,7 @@ static int verify_arguments(void) {
return -EINVAL;
}
if (arg_boot && arg_kill_signal <= 0)
if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
arg_kill_signal = SIGRTMIN+3;
return 0;
@ -2584,6 +2602,12 @@ static int inner_child(
if (chdir(arg_chdir) < 0)
return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
if (arg_start_mode == START_PID2) {
r = stub_pid1();
if (r < 0)
return r;
}
/* Now, explicitly close the log, so that we
* then can close all remaining fds. Closing
* the log explicitly first has the benefit
@ -2595,7 +2619,7 @@ static int inner_child(
log_close();
(void) fdset_close_others(fds);
if (arg_boot) {
if (arg_start_mode == START_BOOT) {
char **a;
size_t m;
@ -2917,9 +2941,9 @@ static int load_settings(void) {
/* Copy over bits from the settings, unless they have been
* explicitly masked by command line switches. */
if ((arg_settings_mask & SETTING_BOOT) == 0 &&
settings->boot >= 0) {
arg_boot = settings->boot;
if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
settings->start_mode >= 0) {
arg_start_mode = settings->start_mode;
strv_free(arg_parameters);
arg_parameters = settings->parameters;
@ -3074,6 +3098,10 @@ int main(int argc, char *argv[]) {
log_parse_environment();
log_open();
/* Make sure rename_process() in the stub init process can work */
saved_argv = argv;
saved_argc = argc;
r = parse_argv(argc, argv);
if (r <= 0)
goto finish;
@ -3180,7 +3208,7 @@ int main(int argc, char *argv[]) {
}
}
if (arg_boot) {
if (arg_start_mode == START_BOOT) {
if (path_is_os_tree(arg_directory) <= 0) {
log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
r = -EINVAL;