core: make hybrid cgroup unified mode keep compat /sys/fs/cgroup/systemd hierarchy

Currently the hybrid mode mounts cgroup v2 on /sys/fs/cgroup/systemd instead of
the v1 name=systemd hierarchy.  While this works fine for systemd itself, it
breaks tools which expect the cgroup v1 hierarchy on /sys/fs/cgroup/systemd.

This patch updates the hybrid mode so that it mounts v2 hierarchy on
/sys/fs/cgroup/unified and keeps v1 "name=systemd" hierarchy on
/sys/fs/cgroup/systemd for compatibility.  systemd itself doesn't depend on the
"name=systemd" hierarchy at all.  All operations take place on the v2 hierarchy
as before but the v1 hierarchy is kept in sync so that any tools which expect
it to be there can keep doing so.  This allows systemd to take advantage of
cgroup v2 process management without requiring other tools to be aware of the
hybrid mode.

The hybrid mode is implemented by mapping the special systemd controller to
/sys/fs/cgroup/unified and making the basic cgroup utility operations -
cg_attach(), cg_create(), cg_rmdir() and cg_trim() - also operate on the
/sys/fs/cgroup/systemd hierarchy whenever the cgroup2 hierarchy is updated.

While a bit messy, this will allow dropping the complications of using cgroup v1
for process management a lot sooner than otherwise possible, which should make
it a net gain in terms of maintainability.

v2: Fixed !cgns breakage reported by @evverx and renamed the unified mount
    point to /sys/fs/cgroup/unified as suggested by @brauner.

v3: chown the compat hierarchy too on delegation.  Suggested by @evverx.

v4: [zj]
- drop the change to default, full "legacy" is still the default.
This commit is contained in:
Tejun Heo 2016-11-21 14:45:53 -05:00 committed by Zbigniew Jędrzejewski-Szmek
parent 2dcb526d7a
commit 2977724b09
6 changed files with 131 additions and 56 deletions

View File

@ -208,6 +208,12 @@ int cg_rmdir(const char *controller, const char *path) {
if (r < 0 && errno != ENOENT)
return -errno;
if (streq(controller, SYSTEMD_CGROUP_CONTROLLER) && cg_hybrid_unified()) {
r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
if (r < 0)
log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
}
return 0;
}
@ -542,8 +548,12 @@ static const char *controller_to_dirname(const char *controller) {
* just cuts off the name= prefix used for named
* hierarchies, if it is specified. */
if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
if (cg_hybrid_unified())
controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
else
controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
}
e = startswith(controller, "name=");
if (e)
@ -703,7 +713,7 @@ static int trim_cb(const char *path, const struct stat *sb, int typeflag, struct
int cg_trim(const char *controller, const char *path, bool delete_root) {
_cleanup_free_ char *fs = NULL;
int r = 0;
int r = 0, q;
assert(path);
@ -726,6 +736,12 @@ int cg_trim(const char *controller, const char *path, bool delete_root) {
return -errno;
}
if (streq(controller, SYSTEMD_CGROUP_CONTROLLER) && cg_hybrid_unified()) {
q = cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);
if (q < 0)
log_warning_errno(q, "Failed to trim compat systemd cgroup %s: %m", path);
}
return r;
}
@ -749,6 +765,12 @@ int cg_create(const char *controller, const char *path) {
return -errno;
}
if (streq(controller, SYSTEMD_CGROUP_CONTROLLER) && cg_hybrid_unified()) {
r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
if (r < 0)
log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
}
return 1;
}
@ -786,7 +808,17 @@ int cg_attach(const char *controller, const char *path, pid_t pid) {
xsprintf(c, PID_FMT "\n", pid);
return write_string_file(fs, c, 0);
r = write_string_file(fs, c, 0);
if (r < 0)
return r;
if (streq(controller, SYSTEMD_CGROUP_CONTROLLER) && cg_hybrid_unified()) {
r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
if (r < 0)
log_warning_errno(r, "Failed to attach %d to compat systemd cgroup %s: %m", pid, path);
}
return 0;
}
int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
@ -835,7 +867,17 @@ int cg_set_group_access(
if (r < 0)
return r;
return chmod_and_chown(fs, mode, uid, gid);
r = chmod_and_chown(fs, mode, uid, gid);
if (r < 0)
return r;
if (streq(controller, SYSTEMD_CGROUP_CONTROLLER) && cg_hybrid_unified()) {
r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, mode, uid, gid);
if (r < 0)
log_warning_errno(r, "Failed to set group access on compat systemd cgroup %s: %m", path);
}
return 0;
}
int cg_set_task_access(
@ -864,13 +906,18 @@ int cg_set_task_access(
if (r < 0)
return r;
if (cg_unified(controller))
return 0;
if (!cg_unified(controller)) {
/* Compatibility, Always keep values for "tasks" in sync with
* "cgroup.procs" */
if (cg_get_path(controller, path, "tasks", &procs) >= 0)
(void) chmod_and_chown(procs, mode, uid, gid);
}
/* Compatibility, Always keep values for "tasks" in sync with
* "cgroup.procs" */
if (cg_get_path(controller, path, "tasks", &procs) >= 0)
(void) chmod_and_chown(procs, mode, uid, gid);
if (streq(controller, SYSTEMD_CGROUP_CONTROLLER) && cg_hybrid_unified()) {
r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, mode, uid, gid);
if (r < 0)
log_warning_errno(r, "Failed to set task access on compat systemd cgroup %s: %m", path);
}
return 0;
}
@ -2254,11 +2301,16 @@ static int cg_update_unified(void) {
if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC))
unified_cache = CGROUP_UNIFIED_ALL;
else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0)
return -errno;
unified_cache = F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC) ?
CGROUP_UNIFIED_SYSTEMD : CGROUP_UNIFIED_NONE;
if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC))
unified_cache = CGROUP_UNIFIED_SYSTEMD;
else {
if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0)
return -errno;
if (!F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC))
return -ENOMEDIUM;
unified_cache = CGROUP_UNIFIED_NONE;
}
} else
return -ENOMEDIUM;
@ -2280,6 +2332,13 @@ bool cg_all_unified(void) {
return cg_unified(NULL);
}
/* Reports whether the cached cgroup setup is the hybrid one, i.e. whether
 * unified_cache equals CGROUP_UNIFIED_SYSTEMD after cg_update_unified() ran.
 *
 * Returns true in hybrid mode, false otherwise. A failing cg_update_unified()
 * is treated as a programming error and trips the assertion. */
bool cg_hybrid_unified(void) {
        int r;

        /* Keep the call outside of assert(): the argument of assert() is not
         * evaluated when assertions are compiled out (NDEBUG), which would
         * silently skip the refresh of unified_cache. */
        r = cg_update_unified();
        assert(r >= 0);

        return unified_cache == CGROUP_UNIFIED_SYSTEMD;
}
int cg_unified_flush(void) {
unified_cache = CGROUP_UNIFIED_UNKNOWN;
@ -2383,10 +2442,6 @@ bool cg_is_unified_systemd_controller_wanted(void) {
return (wanted = r > 0 ? !b : false);
}
/* True only when a legacy setup is wanted and the unified systemd controller
 * is not wanted. */
bool cg_is_legacy_systemd_controller_wanted(void) {
        if (!cg_is_legacy_wanted())
                return false;

        return !cg_is_unified_systemd_controller_wanted();
}
int cg_weight_parse(const char *s, uint64_t *ret) {
uint64_t u;
int r;

View File

@ -241,13 +241,13 @@ int cg_kernel_controllers(Set *controllers);
bool cg_ns_supported(void);
bool cg_all_unified(void);
bool cg_hybrid_unified(void);
bool cg_unified(const char *controller);
int cg_unified_flush(void);
bool cg_is_unified_wanted(void);
bool cg_is_legacy_wanted(void);
bool cg_is_unified_systemd_controller_wanted(void);
bool cg_is_legacy_systemd_controller_wanted(void);
const char* cgroup_controller_to_string(CGroupController c) _const_;
CGroupController cgroup_controller_from_string(const char *s) _pure_;

View File

@ -37,6 +37,7 @@
#define DEFAULT_UNIX_MAX_DGRAM_QLEN 512UL
/* Named cgroup v1 hierarchy ("name=systemd"); the directory name derived from
 * it is "systemd", i.e. /sys/fs/cgroup/systemd. */
#define SYSTEMD_CGROUP_CONTROLLER_LEGACY "name=systemd"
/* Name used for the cgroup v2 hierarchy in hybrid mode; the directory name
 * derived from it is "unified", i.e. /sys/fs/cgroup/unified. */
#define SYSTEMD_CGROUP_CONTROLLER_HYBRID "name=unified"
/* Special pseudo-controller name for systemd's own hierarchy; it is resolved
 * to either the hybrid or the legacy name above depending on the detected
 * cgroup setup. */
#define SYSTEMD_CGROUP_CONTROLLER "_systemd"
#define SIGNALS_CRASH_HANDLER SIGSEGV,SIGILL,SIGFPE,SIGBUS,SIGQUIT,SIGABRT

View File

@ -99,12 +99,12 @@ static const MountPoint mount_table[] = {
cg_is_unified_wanted, MNT_FATAL|MNT_IN_CONTAINER },
{ "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_unified_systemd_controller_wanted, MNT_IN_CONTAINER },
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_legacy_systemd_controller_wanted, MNT_IN_CONTAINER },
cg_is_legacy_wanted, MNT_IN_CONTAINER },
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_legacy_systemd_controller_wanted, MNT_FATAL|MNT_IN_CONTAINER },
cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
{ "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_NONE },
#ifdef ENABLE_EFI

View File

@ -890,7 +890,7 @@ static int get_controllers(Set *subsystems) {
*e = 0;
if (STR_IN_SET(l, "", "name=systemd"))
if (STR_IN_SET(l, "", "name=systemd", "name=unified"))
continue;
p = strdup(l);
@ -909,7 +909,6 @@ static int mount_legacy_cgroup_hierarchy(
const char *dest,
const char *controller,
const char *hierarchy,
CGroupUnified unified_requested,
bool read_only) {
const char *to, *fstype, *opts;
@ -927,14 +926,12 @@ static int mount_legacy_cgroup_hierarchy(
/* The superblock mount options of the mount point need to be
* identical to the hosts', and hence writable... */
if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
fstype = "cgroup2";
opts = NULL;
} else {
fstype = "cgroup";
opts = "none,name=systemd,xattr";
}
if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_HYBRID)) {
fstype = "cgroup2";
opts = NULL;
} else if (streq(controller, SYSTEMD_CGROUP_CONTROLLER_LEGACY)) {
fstype = "cgroup";
opts = "none,name=systemd,xattr";
} else {
fstype = "cgroup";
opts = controller;
@ -1012,7 +1009,7 @@ static int mount_legacy_cgns_supported(
if (!controller)
break;
r = mount_legacy_cgroup_hierarchy("", controller, controller, unified_requested, !userns);
r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
if (r < 0)
return r;
@ -1046,7 +1043,13 @@ static int mount_legacy_cgns_supported(
}
skip_controllers:
r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER, "systemd", unified_requested, false);
if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
if (r < 0)
return r;
}
r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
if (r < 0)
return r;
@ -1117,7 +1120,7 @@ static int mount_legacy_cgns_unsupported(
if (r == -EINVAL) {
/* Not a symbolic link, but directly a single cgroup hierarchy */
r = mount_legacy_cgroup_hierarchy(dest, controller, controller, unified_requested, true);
r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
if (r < 0)
return r;
@ -1137,7 +1140,7 @@ static int mount_legacy_cgns_unsupported(
continue;
}
r = mount_legacy_cgroup_hierarchy(dest, combined, combined, unified_requested, true);
r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
if (r < 0)
return r;
@ -1150,7 +1153,13 @@ static int mount_legacy_cgns_unsupported(
}
skip_controllers:
r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER, "systemd", unified_requested, false);
if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_HYBRID, "unified", false);
if (r < 0)
return r;
}
r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER_LEGACY, "systemd", false);
if (r < 0)
return r;
@ -1202,12 +1211,25 @@ int mount_cgroups(
return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
}
/* Bind-mounts systemd_own onto itself so that it stays writable, then
 * remounts systemd_root read-only. Returns the first negative result from
 * mount_verbose(), otherwise the result of the read-only remount. */
static int mount_systemd_cgroup_writable_one(const char *systemd_own, const char *systemd_root) {
        int rc;

        /* Step 1: our own cgroup becomes a (writable) bind mount of itself. */
        rc = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL);

        /* Step 2: only if that worked, flip the hierarchy root to read-only. */
        if (rc >= 0)
                rc = mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
                                   MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);

        return rc;
}
int mount_systemd_cgroup_writable(
const char *dest,
CGroupUnified unified_requested) {
_cleanup_free_ char *own_cgroup_path = NULL;
const char *systemd_root, *systemd_own;
int r;
assert(dest);
@ -1220,22 +1242,19 @@ int mount_systemd_cgroup_writable(
if (path_equal(own_cgroup_path, "/"))
return 0;
if (unified_requested >= CGROUP_UNIFIED_ALL) {
systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
} else {
systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
if (unified_requested >= CGROUP_UNIFIED_ALL)
return mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup", own_cgroup_path),
prefix_roota(dest, "/sys/fs/cgroup"));
if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
r = mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup/unified", own_cgroup_path),
prefix_roota(dest, "/sys/fs/cgroup/unified"));
if (r < 0)
return r;
}
/* Make our own cgroup a (writable) bind mount */
r = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL);
if (r < 0)
return r;
/* And then remount the systemd cgroup root read-only */
return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
return mount_systemd_cgroup_writable_one(strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path),
prefix_roota(dest, "/sys/fs/cgroup/systemd"));
}
int setup_volatile_state(

View File

@ -344,8 +344,8 @@ static int detect_unified_cgroup_hierarchy(const char *directory) {
else
arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
} else if (cg_unified(SYSTEMD_CGROUP_CONTROLLER)) {
/* Mixed cgroup hierarchy support was added in 232 */
r = systemd_installation_has_version(directory, 232);
/* Mixed cgroup hierarchy support was added in 233 */
r = systemd_installation_has_version(directory, 233);
if (r < 0)
return log_error_errno(r, "Failed to determine systemd version in container: %m");
if (r > 0)