Merge pull request #7381 from poettering/cgroup-unified-delegate-rework

Fix delegation in the unified hierarchy + more cgroup work
This commit is contained in:
Zbigniew Jędrzejewski-Szmek 2017-11-22 07:42:08 +01:00 committed by GitHub
commit ffb70e4424
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 232 additions and 145 deletions

9
TODO
View File

@ -32,6 +32,9 @@ Features:
* add async version of sd_bus_add_match and make use of that
* support projid-based quota in machinectl for containers, and then drop
implicit btrfs loopback magic in machined
* let's log the "tainted" string at boot
* Add NetworkNamespacePath= to specify a path to a network namespace
@ -80,7 +83,11 @@ Features:
* maybe hook up xfs/ext4 quotactl() with services? i.e. automatically manage
the quota of the user indicated in User= via unit file settings, like the
other resource management concepts. Would mix nicely with DynamicUser=1
other resource management concepts. Would mix nicely with DynamicUser=1. Or
alternatively, do this with projids, so that we can also cover services
running as root. Quota should probably cover all the special dirs such as
StateDirectory=, LogsDirectory=, CacheDirectory=, as well as RootDirectory= if it
is set, plus the whole disk space of any image configured with RootImage=.
* Introduce "exit" as an EmergencyAction value, and allow configuring a
per-unit success/failure exit code. This would be useful for

View File

@ -720,9 +720,10 @@
of control group controller names. If true, delegation is turned on, and all supported controllers are
enabled for the unit, making them available to the unit's processes for management. If false, delegation is
turned off entirely (and no additional controllers are enabled). If set to a list of controllers, delegation
is turned on, and the specified controllers are enabled for the unit. Note that assigning the empty string
will enable delegation, but reset the list of controllers, all assignments prior to this will have no effect.
Defaults to false.</para>
is turned on, and the specified controllers are enabled for the unit. Note that controllers other than
the ones specified might be made available as well, depending on configuration of the containing slice unit
or other units contained in it. Note that assigning the empty string will enable delegation, but reset the
list of controllers; all assignments prior to this will have no effect. Defaults to false.</para>
<para>Note that controller delegation to less privileged code is only safe on the unified control group
hierarchy. Accordingly, access to the specified controllers will not be granted to unprivileged services on

View File

@ -1104,6 +1104,11 @@ int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
if (!p)
return -ENOMEM;
/* Truncate suffix indicating the process is a zombie */
e = endswith(p, " (deleted)");
if (e)
*e = 0;
*path = p;
return 0;
}
@ -2369,21 +2374,29 @@ int cg_mask_supported(CGroupMask *ret) {
return 0;
}
int cg_kernel_controllers(Set *controllers) {
int cg_kernel_controllers(Set **ret) {
_cleanup_set_free_free_ Set *controllers = NULL;
_cleanup_fclose_ FILE *f = NULL;
int r;
assert(controllers);
assert(ret);
/* Determines the full list of kernel-known controllers. Might
* include controllers we don't actually support, arbitrary
* named hierarchies and controllers that aren't currently
* accessible (because not mounted). */
controllers = set_new(&string_hash_ops);
if (!controllers)
return -ENOMEM;
f = fopen("/proc/cgroups", "re");
if (!f) {
if (errno == ENOENT)
if (errno == ENOENT) {
*ret = NULL;
return 0;
}
return -errno;
}
@ -2421,6 +2434,9 @@ int cg_kernel_controllers(Set *controllers) {
return r;
}
*ret = controllers;
controllers = NULL;
return 0;
}
@ -2530,6 +2546,7 @@ int cg_unified_flush(void) {
}
int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) {
_cleanup_fclose_ FILE *f = NULL;
_cleanup_free_ char *fs = NULL;
CGroupController c;
int r;
@ -2563,7 +2580,15 @@ int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) {
s[0] = mask & bit ? '+' : '-';
strcpy(s + 1, n);
r = write_string_file(fs, s, 0);
if (!f) {
f = fopen(fs, "we");
if (!f) {
log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
break;
}
}
r = write_string_stream(f, s, 0);
if (r < 0)
log_debug_errno(r, "Failed to enable controller %s for %s (%s): %m", n, p, fs);
}

View File

@ -32,6 +32,10 @@
#include "macro.h"
#include "set.h"
#define SYSTEMD_CGROUP_CONTROLLER_LEGACY "name=systemd"
#define SYSTEMD_CGROUP_CONTROLLER_HYBRID "name=unified"
#define SYSTEMD_CGROUP_CONTROLLER "_systemd"
/* An enum of well known cgroup controllers */
typedef enum CGroupController {
CGROUP_CONTROLLER_CPU,
@ -239,7 +243,7 @@ int cg_mask_supported(CGroupMask *ret);
int cg_mask_from_string(const char *s, CGroupMask *ret);
int cg_mask_to_string(CGroupMask mask, char **ret);
int cg_kernel_controllers(Set *controllers);
int cg_kernel_controllers(Set **controllers);
bool cg_ns_supported(void);

View File

@ -37,10 +37,6 @@
/* The default value for the net.unix.max_dgram_qlen sysctl */
#define DEFAULT_UNIX_MAX_DGRAM_QLEN 512UL
#define SYSTEMD_CGROUP_CONTROLLER_LEGACY "name=systemd"
#define SYSTEMD_CGROUP_CONTROLLER_HYBRID "name=unified"
#define SYSTEMD_CGROUP_CONTROLLER "_systemd"
#define SIGNALS_CRASH_HANDLER SIGSEGV,SIGILL,SIGFPE,SIGBUS,SIGQUIT,SIGABRT
#define SIGNALS_IGNORE SIGPIPE

View File

@ -1073,7 +1073,7 @@ CGroupMask unit_get_own_mask(Unit *u) {
if (!c)
return 0;
return cgroup_context_get_mask(c);
return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
}
CGroupMask unit_get_delegate_mask(Unit *u) {
@ -1113,7 +1113,7 @@ CGroupMask unit_get_members_mask(Unit *u) {
if (u->cgroup_members_mask_valid)
return u->cgroup_members_mask;
u->cgroup_members_mask = unit_get_delegate_mask(u);
u->cgroup_members_mask = 0;
if (u->type == UNIT_SLICE) {
void *v;
@ -1146,7 +1146,7 @@ CGroupMask unit_get_siblings_mask(Unit *u) {
if (UNIT_ISSET(u->slice))
return unit_get_members_mask(UNIT_DEREF(u->slice));
return unit_get_subtree_mask(u);
return unit_get_subtree_mask(u); /* we are the top-level slice */
}
CGroupMask unit_get_subtree_mask(Unit *u) {

View File

@ -237,11 +237,7 @@ int mount_cgroup_controllers(char ***join_controllers) {
/* Mount all available cgroup controllers that are built into the kernel. */
controllers = set_new(&string_hash_ops);
if (!controllers)
return log_oom();
r = cg_kernel_controllers(controllers);
r = cg_kernel_controllers(&controllers);
if (r < 0)
return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");

View File

@ -750,33 +750,21 @@ static void mount_dump(Unit *u, FILE *f, const char *prefix) {
}
static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) {
pid_t pid;
int r;
ExecParameters exec_params = {
.flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
.stdin_fd = -1,
.stdout_fd = -1,
.stderr_fd = -1,
};
pid_t pid;
int r;
assert(m);
assert(c);
assert(_pid);
(void) unit_realize_cgroup(UNIT(m));
if (m->reset_accounting) {
(void) unit_reset_cpu_accounting(UNIT(m));
(void) unit_reset_ip_accounting(UNIT(m));
m->reset_accounting = false;
}
unit_export_state_files(UNIT(m));
r = unit_setup_exec_runtime(UNIT(m));
if (r < 0)
return r;
r = unit_setup_dynamic_creds(UNIT(m));
r = unit_prepare_exec(UNIT(m));
if (r < 0)
return r;
@ -1091,7 +1079,8 @@ static int mount_start(Unit *u) {
m->result = MOUNT_SUCCESS;
m->reload_result = MOUNT_SUCCESS;
m->reset_accounting = true;
u->reset_accounting = true;
mount_enter_mounting(m);
return 1;

View File

@ -68,8 +68,6 @@ struct Mount {
bool just_mounted:1;
bool just_changed:1;
bool reset_accounting:1;
bool sloppy_options;
bool lazy_unmount;

View File

@ -1223,24 +1223,26 @@ static int service_spawn(
ExecFlags flags,
pid_t *_pid) {
_cleanup_strv_free_ char **final_env = NULL, **our_env = NULL, **fd_names = NULL;
_cleanup_free_ int *fds = NULL;
unsigned n_storage_fds = 0, n_socket_fds = 0, n_env = 0;
pid_t pid;
ExecParameters exec_params = {
.flags = flags,
.stdin_fd = -1,
.stdout_fd = -1,
.stderr_fd = -1,
};
_cleanup_strv_free_ char **final_env = NULL, **our_env = NULL, **fd_names = NULL;
unsigned n_storage_fds = 0, n_socket_fds = 0, n_env = 0;
_cleanup_free_ int *fds = NULL;
pid_t pid;
int r;
assert(s);
assert(c);
assert(_pid);
r = unit_prepare_exec(UNIT(s));
if (r < 0)
return r;
if (flags & EXEC_IS_CONTROL) {
/* If this is a control process, mask the permissions/chroot application if this is requested. */
if (s->permissions_start_only)
@ -1249,23 +1251,6 @@ static int service_spawn(
exec_params.flags &= ~EXEC_APPLY_CHROOT;
}
(void) unit_realize_cgroup(UNIT(s));
if (s->reset_accounting) {
(void) unit_reset_cpu_accounting(UNIT(s));
(void) unit_reset_ip_accounting(UNIT(s));
s->reset_accounting = false;
}
unit_export_state_files(UNIT(s));
r = unit_setup_exec_runtime(UNIT(s));
if (r < 0)
return r;
r = unit_setup_dynamic_creds(UNIT(s));
if (r < 0)
return r;
if ((flags & EXEC_PASS_FDS) ||
s->exec_context.std_input == EXEC_INPUT_SOCKET ||
s->exec_context.std_output == EXEC_OUTPUT_SOCKET ||
@ -2185,7 +2170,8 @@ static int service_start(Unit *u) {
s->main_pid_known = false;
s->main_pid_alien = false;
s->forbid_restart = false;
s->reset_accounting = true;
u->reset_accounting = true;
s->status_text = mfree(s->status_text);
s->status_errno = 0;

View File

@ -166,8 +166,6 @@ struct Service {
bool forbid_restart:1;
bool start_timeout_defined:1;
bool reset_accounting:1;
char *bus_name;
char *bus_name_owner; /* unique name of the current owner */

View File

@ -1865,33 +1865,21 @@ static int socket_coldplug(Unit *u) {
}
static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) {
pid_t pid;
int r;
ExecParameters exec_params = {
.flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
.stdin_fd = -1,
.stdout_fd = -1,
.stderr_fd = -1,
};
pid_t pid;
int r;
assert(s);
assert(c);
assert(_pid);
(void) unit_realize_cgroup(UNIT(s));
if (s->reset_accounting) {
(void) unit_reset_cpu_accounting(UNIT(s));
(void) unit_reset_ip_accounting(UNIT(s));
s->reset_accounting = false;
}
unit_export_state_files(UNIT(s));
r = unit_setup_exec_runtime(UNIT(s));
if (r < 0)
return r;
r = unit_setup_dynamic_creds(UNIT(s));
r = unit_prepare_exec(UNIT(s));
if (r < 0)
return r;
@ -2471,7 +2459,8 @@ static int socket_start(Unit *u) {
return r;
s->result = SOCKET_SUCCESS;
s->reset_accounting = true;
u->reset_accounting = true;
socket_enter_start_pre(s);
return 1;

View File

@ -162,8 +162,6 @@ struct Socket {
char *user, *group;
bool reset_accounting:1;
char *fdname;
RateLimit trigger_limit;

View File

@ -603,35 +603,23 @@ static void swap_dump(Unit *u, FILE *f, const char *prefix) {
}
static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) {
pid_t pid;
int r;
ExecParameters exec_params = {
.flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN,
.stdin_fd = -1,
.stdout_fd = -1,
.stderr_fd = -1,
};
pid_t pid;
int r;
assert(s);
assert(c);
assert(_pid);
(void) unit_realize_cgroup(UNIT(s));
if (s->reset_accounting) {
(void) unit_reset_cpu_accounting(UNIT(s));
(void) unit_reset_ip_accounting(UNIT(s));
s->reset_accounting = false;
}
unit_export_state_files(UNIT(s));
r = unit_setup_exec_runtime(UNIT(s));
r = unit_prepare_exec(UNIT(s));
if (r < 0)
goto fail;
r = unit_setup_dynamic_creds(UNIT(s));
if (r < 0)
goto fail;
return r;
r = swap_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_usec));
if (r < 0)
@ -868,7 +856,8 @@ static int swap_start(Unit *u) {
return r;
s->result = SWAP_SUCCESS;
s->reset_accounting = true;
u->reset_accounting = true;
swap_enter_activating(s);
return 1;

View File

@ -71,8 +71,6 @@ struct Swap {
bool is_active:1;
bool just_activated:1;
bool reset_accounting:1;
SwapResult result;
usec_t timeout_usec;

View File

@ -5155,6 +5155,34 @@ void unit_unlink_state_files(Unit *u) {
}
}
int unit_prepare_exec(Unit *u) {
        int k;

        assert(u);

        /* Common preparation steps to run before a process is forked off for this unit.
         *
         * Returns 0 on success. If either of the two setup calls at the end fails, its negative
         * return value is passed through unchanged. */

        /* The result of realizing the cgroup is deliberately ignored. */
        (void) unit_realize_cgroup(u);

        /* The accounting reset is a one-shot request: honour it, then clear the flag. */
        if (u->reset_accounting) {
                (void) unit_reset_cpu_accounting(u);
                (void) unit_reset_ip_accounting(u);
                u->reset_accounting = false;
        }

        unit_export_state_files(u);

        /* The dynamic credentials are only set up if the exec runtime setup did not fail. */
        k = unit_setup_exec_runtime(u);
        if (k >= 0)
                k = unit_setup_dynamic_creds(u);

        return k < 0 ? k : 0;
}
static const char* const collect_mode_table[_COLLECT_MODE_MAX] = {
[COLLECT_INACTIVE] = "inactive",
[COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed",

View File

@ -345,6 +345,9 @@ struct Unit {
UnitCGroupBPFState cgroup_bpf_state:2;
/* Reset cgroup accounting next time we fork something off */
bool reset_accounting:1;
bool start_limit_hit:1;
/* Did we already invoke unit_coldplug() for this unit? */
@ -763,6 +766,8 @@ void unit_remove_dependencies(Unit *u, UnitDependencyMask mask);
void unit_export_state_files(Unit *u);
void unit_unlink_state_files(Unit *u);
int unit_prepare_exec(Unit *u);
/* Macros which append UNIT= or USER_UNIT= to the message */
#define log_unit_full(unit, level, error, ...) \

View File

@ -867,19 +867,30 @@ int mount_custom(
/* Retrieve existing subsystems. This function is called in a new cgroup
* namespace.
*/
static int get_controllers(Set *subsystems) {
static int get_process_controllers(Set **ret) {
_cleanup_set_free_free_ Set *controllers = NULL;
_cleanup_fclose_ FILE *f = NULL;
char line[LINE_MAX];
int r;
assert(subsystems);
assert(ret);
controllers = set_new(&string_hash_ops);
if (!controllers)
return -ENOMEM;
f = fopen("/proc/self/cgroup", "re");
if (!f)
return errno == ENOENT ? -ESRCH : -errno;
FOREACH_LINE(line, f, return -errno) {
int r;
char *e, *l, *p;
for (;;) {
_cleanup_free_ char *line = NULL;
char *e, *l;
r = read_line(f, LONG_LINE_MAX, &line);
if (r < 0)
return r;
if (r == 0)
break;
l = strchr(line, ':');
if (!l)
@ -895,15 +906,14 @@ static int get_controllers(Set *subsystems) {
if (STR_IN_SET(l, "", "name=systemd", "name=unified"))
continue;
p = strdup(l);
if (!p)
return -ENOMEM;
r = set_consume(subsystems, p);
r = set_put_strdup(controllers, l);
if (r < 0)
return r;
}
*ret = controllers;
controllers = NULL;
return 0;
}
@ -999,11 +1009,7 @@ static int mount_legacy_cgns_supported(
if (r > 0)
goto skip_controllers;
controllers = set_new(&string_hash_ops);
if (!controllers)
return log_oom();
r = get_controllers(controllers);
r = get_process_controllers(&controllers);
if (r < 0)
return log_error_errno(r, "Failed to determine cgroup controllers: %m");
@ -1032,13 +1038,13 @@ static int mount_legacy_cgns_supported(
if (r == 0)
break;
target = prefix_root("/sys/fs/cgroup", tok);
if (!target)
return log_oom();
if (streq(controller, tok))
break;
target = prefix_root("/sys/fs/cgroup/", tok);
if (!target)
return log_oom();
r = symlink_idempotent(controller, target);
if (r == -EINVAL)
return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
@ -1105,11 +1111,7 @@ static int mount_legacy_cgns_unsupported(
if (r > 0)
goto skip_controllers;
controllers = set_new(&string_hash_ops);
if (!controllers)
return log_oom();
r = cg_kernel_controllers(controllers);
r = cg_kernel_controllers(&controllers);
if (r < 0)
return log_error_errno(r, "Failed to determine cgroup controllers: %m");
@ -1213,23 +1215,25 @@ int mount_cgroups(
if (unified_requested >= CGROUP_UNIFIED_ALL)
return mount_unified_cgroups(dest);
else if (use_cgns)
if (use_cgns)
return mount_legacy_cgns_supported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
}
static int mount_systemd_cgroup_writable_one(const char *systemd_own, const char *systemd_root)
{
static int mount_systemd_cgroup_writable_one(const char *root, const char *own) {
int r;
assert(root);
assert(own);
/* Make our own cgroup a (writable) bind mount */
r = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL);
r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL);
if (r < 0)
return r;
/* And then remount the systemd cgroup root read-only */
return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
return mount_verbose(LOG_ERR, NULL, root, NULL,
MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
}
@ -1238,6 +1242,7 @@ int mount_systemd_cgroup_writable(
CGroupUnified unified_requested) {
_cleanup_free_ char *own_cgroup_path = NULL;
const char *root, *own;
int r;
assert(dest);
@ -1250,19 +1255,27 @@ int mount_systemd_cgroup_writable(
if (path_equal(own_cgroup_path, "/"))
return 0;
if (unified_requested >= CGROUP_UNIFIED_ALL)
return mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup", own_cgroup_path),
prefix_roota(dest, "/sys/fs/cgroup"));
if (unified_requested >= CGROUP_UNIFIED_ALL) {
if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
r = mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup/unified", own_cgroup_path),
prefix_roota(dest, "/sys/fs/cgroup/unified"));
if (r < 0)
return r;
root = prefix_roota(dest, "/sys/fs/cgroup");
own = strjoina(root, own_cgroup_path);
} else {
if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
root = prefix_roota(dest, "/sys/fs/cgroup/unified");
own = strjoina(root, own_cgroup_path);
r = mount_systemd_cgroup_writable_one(root, own);
if (r < 0)
return r;
}
root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
own = strjoina(root, own_cgroup_path);
}
return mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path),
prefix_roota(dest, "/sys/fs/cgroup/systemd"));
return mount_systemd_cgroup_writable_one(root, own);
}
int setup_volatile_state(

View File

@ -0,0 +1,4 @@
# Pull in the shared test settings (presumably where BUILD_DIR gets determined -- confirm in ../Makefile.guess).
include ../Makefile.guess
# Every target listed here is forwarded to ./test.sh as the option "--<target name>".
all setup clean run:
@basedir=../.. TEST_BASE_DIR=../ BUILD_DIR=$(BUILD_DIR) ./test.sh --$@

43
test/TEST-19-DELEGATE/test.sh Executable file
View File

@ -0,0 +1,43 @@
#!/bin/bash
# -*- mode: shell-script; indent-tabs-mode: nil; sh-basic-offset: 4; -*-
# ex: ts=8 sw=4 sts=4 et filetype=sh
#
# Driver for the cgroup delegation test. It builds a test image that contains
# testsuite.service (which runs /testsuite.sh) and then hands control to
# do_test from the shared test-functions library.
set -e
TEST_DESCRIPTION="test cgroup delegation in the unified hierarchy"
# NOTE(review): presumably tells the shared framework to skip the nspawn run -- confirm in test-functions.
TEST_NO_NSPAWN=1

. "$TEST_BASE_DIR/test-functions"
# NOTE(review): these two are presumably consumed by test-functions when booting the image -- confirm there.
QEMU_TIMEOUT=180
UNIFIED_CGROUP_HIERARCHY=yes

# Create the image, mount its first partition and populate it with the basic
# environment plus the testsuite service and script. Returns 1 if anything in
# the population subshell fails.
test_setup() {
    create_empty_image
    mkdir -p "$TESTDIR/root"
    mount "${LOOPDEV}p1" "$TESTDIR/root"

    # Run in a subshell so LOG_LEVEL and the variables exported by udevadm do
    # not leak into the rest of the script.
    (
        LOG_LEVEL=5
        eval $(udevadm info --export --query=env --name=${LOOPDEV}p2)

        setup_basic_environment

        # setup the testsuite service
        cat >"$initdir/etc/systemd/system/testsuite.service" <<EOF
[Unit]
Description=Testsuite service
[Service]
ExecStart=/bin/bash -x /testsuite.sh
Type=oneshot
StandardOutput=tty
StandardError=tty
EOF
        cp testsuite.sh "$initdir/"

        setup_testsuite
    ) || return 1

    ddebug "umount $TESTDIR/root"
    umount "$TESTDIR/root"
}

do_test "$@"

View File

@ -0,0 +1,20 @@
#!/bin/bash
# -*- mode: shell-script; indent-tabs-mode: nil; sh-basic-offset: 4; -*-
# ex: ts=8 sw=4 sts=4 et filetype=sh
#
# Checks cgroup delegation by starting transient services with systemd-run and
# inspecting each service's own cgroup directory under /sys/fs/cgroup.
# Any failing command aborts the script (set -e) before /testok is written.
set -ex
set -o pipefail
# test0: with an empty Delegate= assignment and DynamicUser=1, the service must
# be able to write to its own cgroup directory, cgroup.procs and
# cgroup.subtree_control.
systemd-run --wait --unit=test0.service -p "DynamicUser=1" -p "Delegate=" \
test -w /sys/fs/cgroup/system.slice/test0.service/ -a \
-w /sys/fs/cgroup/system.slice/test0.service/cgroup.procs -a \
-w /sys/fs/cgroup/system.slice/test0.service/cgroup.subtree_control
# test1: with Delegate="memory pids", "memory" must be listed in the service's
# cgroup.controllers file.
systemd-run --wait --unit=test1.service -p "DynamicUser=1" -p "Delegate=memory pids" \
grep memory /sys/fs/cgroup/system.slice/test1.service/cgroup.controllers
# test2: same delegation setting, this time checking for "pids".
systemd-run --wait --unit=test2.service -p "DynamicUser=1" -p "Delegate=memory pids" \
grep pids /sys/fs/cgroup/system.slice/test2.service/cgroup.controllers
# Success marker (presumably picked up by the test harness -- confirm in test-functions).
echo OK > /testok
exit 0