nspawn: register a scope for the unit if --register=no is specified (#6166)

Previously, only when --register=yes was set (the default) the invoked
container would get its own scope, created by machined on behalf of
nspawn. With this change if --register=no is set nspawn will still get
its own scope (which is a good thing, so that --slice= and --property=
take effect), but this is not done through machined but by registering a
scope unit directly in PID 1.

Summary:

--register=yes             → allocate a new scope through machined (the default)
--register=yes --keep-unit → use the unit we are already running in and register with machined
--register=no              → allocate a new scope directly, but no machined
--register=no --keep-unit  → do not allocate nor register anything

Fixes: #5823
This commit is contained in:
Lennart Poettering 2017-06-28 19:22:46 +02:00 committed by Zbigniew Jędrzejewski-Szmek
parent 694859b5e7
commit cd2dfc6fae
4 changed files with 202 additions and 85 deletions

View file

@ -398,24 +398,19 @@
<varlistentry>
<term><option>--slice=</option></term>
<listitem><para>Make the container part of the specified
slice, instead of the default
<filename>machine.slice</filename>. This is only applies if
the machine is run in its own scope unit, i.e. if
<option>--keep-unit</option> is not used.</para>
<listitem><para>Make the container part of the specified slice, instead of the default
<filename>machine.slice</filename>. This applies only if the machine is run in its own scope unit, i.e. if
<option>--keep-unit</option> isn't used.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--property=</option></term>
<listitem><para>Set a unit property on the scope unit to
register for the machine. This only applies if the machine is
run in its own scope unit, i.e. if
<option>--keep-unit</option> is not used. Takes unit property
assignments in the same format as <command>systemctl
set-property</command>. This is useful to set memory limits
and similar for machines.</para>
<listitem><para>Set a unit property on the scope unit to register for the machine. This applies only if the
machine is run in its own scope unit, i.e. if <option>--keep-unit</option> isn't used. Takes unit property
assignments in the same format as <command>systemctl set-property</command>. This is useful to set memory
limits and similar for containers.</para>
</listitem>
</varlistentry>
@ -888,18 +883,16 @@
<varlistentry>
<term><option>--register=</option></term>
<listitem><para>Controls whether the container is registered
with
<citerefentry><refentrytitle>systemd-machined</refentrytitle><manvolnum>8</manvolnum></citerefentry>.
Takes a boolean argument, which defaults to <literal>yes</literal>.
This option should be enabled when the container runs a full
Operating System (more specifically: an init system), and is
useful to ensure that the container is accessible via
<citerefentry><refentrytitle>machinectl</refentrytitle><manvolnum>1</manvolnum></citerefentry>
and shown by tools such as
<citerefentry project='man-pages'><refentrytitle>ps</refentrytitle><manvolnum>1</manvolnum></citerefentry>.
If the container does not run an init system, it is
recommended to set this option to <literal>no</literal>.</para></listitem>
<listitem><para>Controls whether the container is registered with
<citerefentry><refentrytitle>systemd-machined</refentrytitle><manvolnum>8</manvolnum></citerefentry>. Takes a
boolean argument, which defaults to <literal>yes</literal>. This option should be enabled when the container
runs a full Operating System (more specifically: a system and service manager as PID 1), and is useful to
ensure that the container is accessible via
<citerefentry><refentrytitle>machinectl</refentrytitle><manvolnum>1</manvolnum></citerefentry> and shown by
tools such as <citerefentry
project='man-pages'><refentrytitle>ps</refentrytitle><manvolnum>1</manvolnum></citerefentry>. If the container
does not run a service manager, it is recommended to set this option to
<literal>no</literal>.</para></listitem>
</varlistentry>
<varlistentry>
@ -916,7 +909,9 @@
service unit, and the service unit's sole purpose is to run a
single <command>systemd-nspawn</command> container. This
option is not available if run from a user
session.</para></listitem>
session.</para>
<para>Note that passing <option>--keep-unit</option> disables the effect of <option>--slice=</option> and
<option>--property=</option>.</para></listitem>
</varlistentry>
<varlistentry>

View file

@ -27,6 +27,77 @@
#include "strv.h"
#include "util.h"
/* Appends the unit properties nspawn wants for the container's unit to the bus message m, each as one
 * "(sv)" entry: DevicePolicy=closed, the two built-in DeviceAllow= entries, one DeviceAllow= entry for
 * every bind mount whose source is a device node, and, if kill_signal is non-zero, KillSignal= together
 * with KillMode=mixed.
 *
 * The properties argument is accepted but not referenced anywhere in this function.
 *
 * Returns 0 on success. Every failure path returns the (negative) result of one of the logging helpers
 * used below. */
static int append_machine_properties(
                sd_bus_message *m,
                CustomMount *mounts,
                unsigned n_mounts,
                int kill_signal,
                char **properties) {

        unsigned i;
        int r;

        assert(m);

        r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "closed");
        if (r < 0)
                return bus_log_create_error(r);

        /* Whenever this list is touched, systemd-nspawn@.service has to be updated as well, so that the
         * device policy is the same with and without the --keep-unit switch. */
        r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 2,
                                  /* The container may access and create the API device nodes, so
                                   * that PrivateDevices= works fine inside of it. */
                                  "/dev/net/tun", "rwm",
                                  /* The container may access ptys, but is never permitted to create
                                   * these device nodes itself. */
                                  "char-pts", "rw");
        if (r < 0)
                return bus_log_create_error(r);

        for (i = 0; i < n_mounts; i++) {
                CustomMount *mount = &mounts[i];
                const char *access;

                /* Only bind mounts are of interest here */
                if (mount->type != CUSTOM_MOUNT_BIND)
                        continue;

                r = is_device_node(mount->source);
                if (r < 0) {
                        if (r != -ENOENT)
                                return log_error_errno(r, "Failed to stat %s: %m", mount->source);

                        /* The bind source might only show up while the image is being put together,
                         * hence stay quiet about it. */
                        log_debug_errno(r, "Bind mount source %s not found, ignoring: %m", mount->source);
                        continue;
                }
                if (r == 0)
                        continue; /* Not a device node, nothing to allow */

                access = mount->read_only ? "r" : "rw";

                r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1, mount->source, access);
                if (r < 0)
                        return log_error_errno(r, "Failed to append message arguments: %m");
        }

        /* Without an explicit kill signal there is nothing left to add */
        if (kill_signal == 0)
                return 0;

        r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", kill_signal);
        if (r < 0)
                return bus_log_create_error(r);

        r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
        if (r < 0)
                return bus_log_create_error(r);

        return 0;
}
int register_machine(
const char *machine_name,
pid_t pid,
@ -68,7 +139,6 @@ int register_machine(
local_ifindex > 0 ? 1 : 0, local_ifindex);
} else {
_cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
unsigned j;
r = sd_bus_message_new_method_call(
bus,
@ -103,63 +173,14 @@ int register_machine(
return bus_log_create_error(r);
}
r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "closed");
r = append_machine_properties(
m,
mounts,
n_mounts,
kill_signal,
properties);
if (r < 0)
return bus_log_create_error(r);
/* If you make changes here, also make sure to update
* systemd-nspawn@.service, to keep the device
* policies in sync regardless if we are run with or
* without the --keep-unit switch. */
r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 2,
/* Allow the container to
* access and create the API
* device nodes, so that
* PrivateDevices= in the
* container can work
* fine */
"/dev/net/tun", "rwm",
/* Allow the container
* access to ptys. However,
* do not permit the
* container to ever create
* these device nodes. */
"char-pts", "rw");
if (r < 0)
return bus_log_create_error(r);
for (j = 0; j < n_mounts; j++) {
CustomMount *cm = mounts + j;
if (cm->type != CUSTOM_MOUNT_BIND)
continue;
r = is_device_node(cm->source);
if (r == -ENOENT) {
/* The bind source might only appear as the image is put together, hence don't complain */
log_debug_errno(r, "Bind mount source %s not found, ignoring: %m", cm->source);
continue;
}
if (r < 0)
return log_error_errno(r, "Failed to stat %s: %m", cm->source);
if (r) {
r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
cm->source, cm->read_only ? "r" : "rw");
if (r < 0)
return log_error_errno(r, "Failed to append message arguments: %m");
}
}
if (kill_signal != 0) {
r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", kill_signal);
if (r < 0)
return bus_log_create_error(r);
r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
if (r < 0)
return bus_log_create_error(r);
}
return r;
r = bus_append_unit_property_assignment_many(m, properties);
if (r < 0)
@ -229,3 +250,90 @@ int terminate_machine(pid_t pid) {
return 0;
}
/* Allocates a scope unit for the container by calling StartTransientUnit on org.freedesktop.systemd1
 * over the system bus.
 *
 * The unit name is derived from machine_name with a ".scope" suffix and started in "fail" mode. The
 * unit gets pid as its only PID, a "Container <machine_name>" description, Delegate= turned on, the
 * requested slice (or "machine.slice" if slice is empty), the properties added by
 * append_machine_properties() and finally the assignments listed in properties. No auxiliary units are
 * passed.
 *
 * Returns 0 on success, a negative value on failure.
 *
 * NOTE(review): no reply message is requested from the method call, so this apparently returns without
 * waiting for the start job of the scope to finish. Confirm that callers are fine with that. */
int allocate_scope(
                const char *machine_name,
                pid_t pid,
                const char *slice,
                CustomMount *mounts,
                unsigned n_mounts,
                int kill_signal,
                char **properties) {

        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
        _cleanup_free_ char *unit_name = NULL;
        const char *unit_description, *target_slice;
        int r;

        r = sd_bus_default_system(&bus);
        if (r < 0)
                return log_error_errno(r, "Failed to open system bus: %m");

        r = unit_name_mangle_with_suffix(machine_name, UNIT_NAME_NOGLOB, ".scope", &unit_name);
        if (r < 0)
                return log_error_errno(r, "Failed to mangle scope name: %m");

        r = sd_bus_message_new_method_call(
                        bus,
                        &m,
                        "org.freedesktop.systemd1",
                        "/org/freedesktop/systemd1",
                        "org.freedesktop.systemd1.Manager",
                        "StartTransientUnit");
        if (r < 0)
                return bus_log_create_error(r);

        /* Unit name and job mode */
        r = sd_bus_message_append(m, "ss", unit_name, "fail");
        if (r < 0)
                return bus_log_create_error(r);

        /* The property array starts here */
        r = sd_bus_message_open_container(m, 'a', "(sv)");
        if (r < 0)
                return bus_log_create_error(r);

        unit_description = strjoina("Container ", machine_name);
        target_slice = isempty(slice) ? "machine.slice" : slice;

        r = sd_bus_message_append(m, "(sv)(sv)(sv)(sv)",
                                  "PIDs", "au", 1, pid,
                                  "Description", "s", unit_description,
                                  "Delegate", "b", 1,
                                  "Slice", "s", target_slice);
        if (r < 0)
                return bus_log_create_error(r);

        r = append_machine_properties(m, mounts, n_mounts, kill_signal, properties);
        if (r < 0)
                return r;

        /* Assignments requested by the caller come last */
        r = bus_append_unit_property_assignment_many(m, properties);
        if (r < 0)
                return r;

        r = sd_bus_message_close_container(m);
        if (r < 0)
                return bus_log_create_error(r);

        /* An empty array: there are no auxiliary units */
        r = sd_bus_message_append(m, "a(sa(sv))", 0);
        if (r < 0)
                return bus_log_create_error(r);

        r = sd_bus_call(bus, m, 0, &error, NULL);
        if (r >= 0)
                return 0;

        log_error("Failed to allocate scope: %s", bus_error_message(&error, r));
        return r;
}

View file

@ -27,3 +27,5 @@
int register_machine(const char *machine_name, pid_t pid, const char *directory, sd_id128_t uuid, int local_ifindex, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, bool keep_unit, const char *service);
int terminate_machine(pid_t pid);
/* Starts a transient "<machine_name>.scope" unit containing pid via StartTransientUnit on
 * org.freedesktop.systemd1, placed in slice ("machine.slice" if slice is empty). Returns 0 on success,
 * a negative value on failure. */
int allocate_scope(const char *machine_name, pid_t pid, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties);

View file

@ -1083,8 +1083,8 @@ static int parse_argv(int argc, char *argv[]) {
if (arg_userns_mode == USER_NAMESPACE_PICK)
arg_userns_chown = true;
if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
log_error("--keep-unit may not be used when invoked from a user session.");
if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) {
log_error("--keep-unit --register=yes may not be used when invoked from a user session.");
return -EINVAL;
}
@ -3387,7 +3387,19 @@ static int run(int master,
arg_container_service_name);
if (r < 0)
return r;
}
} else if (!arg_keep_unit) {
r = allocate_scope(
arg_machine,
*pid,
arg_slice,
arg_custom_mounts, arg_n_custom_mounts,
arg_kill_signal,
arg_property);
if (r < 0)
return r;
} else if (arg_slice || arg_property)
log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
if (r < 0)