From de8a711a5849f9239c93aefa5554a62986dfce42 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 20 Nov 2018 19:45:02 +0100 Subject: [PATCH 01/29] cgroup: use structured initialization --- src/core/cgroup.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 0fb09935e1..815fca28cc 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -72,29 +72,30 @@ static void cgroup_compat_warn(void) { void cgroup_context_init(CGroupContext *c) { assert(c); - /* Initialize everything to the kernel defaults, assuming the - * structure is preinitialized to 0 */ + /* Initialize everything to the kernel defaults. */ - c->cpu_weight = CGROUP_WEIGHT_INVALID; - c->startup_cpu_weight = CGROUP_WEIGHT_INVALID; - c->cpu_quota_per_sec_usec = USEC_INFINITY; + *c = (CGroupContext) { + .cpu_weight = CGROUP_WEIGHT_INVALID, + .startup_cpu_weight = CGROUP_WEIGHT_INVALID, + .cpu_quota_per_sec_usec = USEC_INFINITY, - c->cpu_shares = CGROUP_CPU_SHARES_INVALID; - c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID; + .cpu_shares = CGROUP_CPU_SHARES_INVALID, + .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID, - c->memory_high = CGROUP_LIMIT_MAX; - c->memory_max = CGROUP_LIMIT_MAX; - c->memory_swap_max = CGROUP_LIMIT_MAX; + .memory_high = CGROUP_LIMIT_MAX, + .memory_max = CGROUP_LIMIT_MAX, + .memory_swap_max = CGROUP_LIMIT_MAX, - c->memory_limit = CGROUP_LIMIT_MAX; + .memory_limit = CGROUP_LIMIT_MAX, - c->io_weight = CGROUP_WEIGHT_INVALID; - c->startup_io_weight = CGROUP_WEIGHT_INVALID; + .io_weight = CGROUP_WEIGHT_INVALID, + .startup_io_weight = CGROUP_WEIGHT_INVALID, - c->blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID; - c->startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID; + .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID, + .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID, - c->tasks_max = (uint64_t) -1; + .tasks_max = CGROUP_LIMIT_MAX, + }; } void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) { From a0c339ed4b16cc00b3176ae63cd6b863763a4a79 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 20 Nov 2018 19:45:09 +0100 Subject: [PATCH 02/29] cgroup: only install cgroup release agent when we own the root cgroup If we run in a container we shouldn't patch around this, and most likely we can't anyway, and there's not much point in complaining about this. Hence let's strictly say: the agent is private property of the host's system instance, nothing else. --- src/core/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 815fca28cc..4ce6c9f6f4 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -2372,7 +2372,7 @@ int manager_setup_cgroup(Manager *m) { (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify"); - } else if (MANAGER_IS_SYSTEM(m) && !MANAGER_IS_TEST_RUN(m)) { + } else if (MANAGER_IS_SYSTEM(m) && manager_owns_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) { /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable, * since it does not generate events when control groups with children run empty. */ From 73fe5314bf0cc220da3487d9be322e46e674e692 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 20 Nov 2018 19:48:43 +0100 Subject: [PATCH 03/29] cgroup: suffix settings with "=" in log messages where appropriate --- src/core/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 4ce6c9f6f4..01ce890247 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1013,7 +1013,7 @@ static void cgroup_context_apply( max = c->memory_limit; if (max != CGROUP_LIMIT_MAX) - log_cgroup_compat(u, "Applying MemoryLimit %" PRIu64 " as MemoryMax", max); + log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max); } cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min); @@ -1027,7 +1027,7 @@ static void cgroup_context_apply( if (cgroup_context_has_unified_memory_config(c)) { val = c->memory_max; - log_cgroup_compat(u, "Applying MemoryMax %" PRIi64 " as MemoryLimit", val); + log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val); } else val = c->memory_limit; From 8c83840772e6239435e1897ebdf4bf9fc5fd68cc Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 20 Nov 2018 20:14:59 +0100 Subject: [PATCH 04/29] cgroup: add comment explaining why we ignore EINVAL at two places These are just copies from further down. --- src/core/cgroup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 01ce890247..da677b8f22 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -431,6 +431,8 @@ static int whitelist_device(BPFProgram *prog, const char *path, const char *node major(st.st_rdev), minor(st.st_rdev), acc); + /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL here. */ + r = cg_set_attribute("devices", path, "devices.allow", buf); if (r < 0) return log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, @@ -517,6 +519,9 @@ static int whitelist_major(BPFProgram *prog, const char *path, const char *name, maj, acc); + /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL + * here. */ + r = cg_set_attribute("devices", path, "devices.allow", buf); if (r < 0) log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, @@ -1052,8 +1057,7 @@ static void cgroup_context_apply( if (r < 0) log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m"); } else { - /* Changing the devices list of a populated cgroup - * might result in EINVAL, hence ignore EINVAL + /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore EINVAL * here. */ if (c->device_allow || c->device_policy != CGROUP_AUTO) From 2c74e12bb3b93c4bfb6dbad2bd9ec54626c5cb26 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 20 Nov 2018 20:16:03 +0100 Subject: [PATCH 05/29] cgroup: ignore EPERM for a couple of more attribute writes --- src/core/cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index da677b8f22..8f63f4ba5a 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -435,7 +435,7 @@ static int whitelist_device(BPFProgram *prog, const char *path, const char *node r = cg_set_attribute("devices", path, "devices.allow", buf); if (r < 0) - return log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, + return log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r, "Failed to set devices.allow on %s: %m", path); return 0; @@ -524,7 +524,7 @@ static int whitelist_major(BPFProgram *prog, const char *path, const char *name, r = cg_set_attribute("devices", path, "devices.allow", buf); if (r < 0) - log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, + log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r, "Failed to set devices.allow on %s: %m", path); } } @@ -1065,8 +1065,8 @@ static void cgroup_context_apply( else r = cg_set_attribute("devices", path, "devices.allow", "a"); if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to reset devices.list: %m"); + log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to reset devices.allow/devices.deny: %m"); } if (c->device_policy == CGROUP_CLOSED || From 39b9fefb2e9b281e6c7d7fe44927700db2601140 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 20 Nov 2018 20:18:07 +0100 Subject: [PATCH 06/29] cgroup: add a new macro for determining log level for cgroup attr write failures For now, let's use it only at one place, but a follow-up commit will make more use of it. --- src/core/cgroup.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 8f63f4ba5a..952c9a9f2d 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -26,6 +26,11 @@ #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC) +/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access + * problems we downgrade to LOG_DEBUG. This is supposed to be nice to container managers and kernels which want to mask + * out specific attributes from us. */ +#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING) + bool manager_owns_root_cgroup(Manager *m) { assert(m); @@ -1156,7 +1161,7 @@ static void cgroup_context_apply( r = 0; if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, + log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to write to tasks limit sysctls: %m"); } else { From 293d32df3939e5fec3df730738d9ef41bcc11952 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 20 Nov 2018 20:19:58 +0100 Subject: [PATCH 07/29] cgroup: add a common routine for writing to attributes, and logging about it We can use this at quite a few places, and this allows us to shorten our code quite a bit. --- src/core/cgroup.c | 99 ++++++++++++++--------------------------------- 1 file changed, 29 insertions(+), 70 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 952c9a9f2d..3bbfead728 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -57,6 +57,17 @@ bool unit_has_root_cgroup(Unit *u) { return unit_has_name(u, SPECIAL_ROOT_SLICE); } +static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) { + int r; + + r = cg_set_attribute(controller, u->cgroup_path, attribute, value); + if (r < 0) + log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m", + strna(attribute), isempty(u->cgroup_path) ? "/" : u->cgroup_path, (int) strcspn(value, NEWLINE), value); + + return r; +} + static void cgroup_compat_warn(void) { static bool cgroup_compat_warned = false; @@ -569,13 +580,9 @@ static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) { char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)]; - int r; xsprintf(buf, "%" PRIu64 "\n", weight); - r = cg_set_attribute("cpu", u->cgroup_path, "cpu.weight", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set cpu.weight: %m"); + (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf); if (quota != USEC_INFINITY) xsprintf(buf, USEC_FMT " " USEC_FMT "\n", @@ -583,37 +590,23 @@ static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t q else xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC); - r = cg_set_attribute("cpu", u->cgroup_path, "cpu.max", buf); - - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set cpu.max: %m"); + (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf); } static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) { char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1]; - int r; xsprintf(buf, "%" PRIu64 "\n", shares); - r = cg_set_attribute("cpu", u->cgroup_path, "cpu.shares", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set cpu.shares: %m"); + (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf); xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC); - r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_period_us", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set cpu.cfs_period_us: %m"); + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf); if (quota != USEC_INFINITY) { xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC); - r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", buf); + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf); } else - r = cg_set_attribute("cpu", u->cgroup_path, "cpu.cfs_quota_us", "-1"); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set cpu.cfs_quota_us: %m"); + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1"); } static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) { @@ -683,10 +676,7 @@ static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_ return; xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), io_weight); - r = cg_set_attribute("io", u->cgroup_path, "io.weight", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set io.weight: %m"); + (void) set_attribute_and_warn(u, "io", "io.weight", buf); } static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) { @@ -699,10 +689,7 @@ static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint return; xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), blkio_weight); - r = cg_set_attribute("blkio", u->cgroup_path, "blkio.weight_device", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set blkio.weight_device: %m"); + (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", buf); } static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) { @@ -719,10 +706,7 @@ static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t else xsprintf(buf, "%u:%u target=max\n", major(dev), minor(dev)); - r = cg_set_attribute("io", u->cgroup_path, "io.latency", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set io.latency on cgroup %s: %m", u->cgroup_path); + (void) set_attribute_and_warn(u, "io", "io.latency", buf); } static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) { @@ -745,10 +729,7 @@ static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t xsprintf(buf, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(dev), minor(dev), limit_bufs[CGROUP_IO_RBPS_MAX], limit_bufs[CGROUP_IO_WBPS_MAX], limit_bufs[CGROUP_IO_RIOPS_MAX], limit_bufs[CGROUP_IO_WIOPS_MAX]); - r = cg_set_attribute("io", u->cgroup_path, "io.max", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set io.max: %m"); + (void) set_attribute_and_warn(u, "io", "io.max", buf); } static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) { @@ -761,16 +742,10 @@ static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint6 return; sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps); - r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.read_bps_device", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set blkio.throttle.read_bps_device: %m"); + (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf); sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps); - r = cg_set_attribute("blkio", u->cgroup_path, "blkio.throttle.write_bps_device", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set blkio.throttle.write_bps_device: %m"); + (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf); } static bool cgroup_context_has_unified_memory_config(CGroupContext *c) { @@ -779,15 +754,11 @@ static bool cgroup_context_has_unified_memory_config(CGroupContext *c) { static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) { char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max"; - int r; if (v != CGROUP_LIMIT_MAX) xsprintf(buf, "%" PRIu64 "\n", v); - r = cg_set_attribute("memory", u->cgroup_path, file, buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set %s: %m", file); + (void) set_attribute_and_warn(u, "memory", file, buf); } static void cgroup_apply_firewall(Unit *u) { @@ -892,10 +863,7 @@ static void cgroup_context_apply( weight = CGROUP_WEIGHT_DEFAULT; xsprintf(buf, "default %" PRIu64 "\n", weight); - r = cg_set_attribute("io", path, "io.weight", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set io.weight: %m"); + (void) set_attribute_and_warn(u, "io", "io.weight", buf); if (has_io) { CGroupIODeviceWeight *w; @@ -971,10 +939,7 @@ static void cgroup_context_apply( weight = CGROUP_BLKIO_WEIGHT_DEFAULT; xsprintf(buf, "%" PRIu64 "\n", weight); - r = cg_set_attribute("blkio", path, "blkio.weight", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set blkio.weight: %m"); + (void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf); if (has_io) { CGroupIODeviceWeight *w; @@ -1046,10 +1011,7 @@ static void cgroup_context_apply( else xsprintf(buf, "%" PRIu64 "\n", val); - r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set memory.limit_in_bytes: %m"); + (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf); } } @@ -1169,12 +1131,9 @@ static void cgroup_context_apply( char buf[DECIMAL_STR_MAX(uint64_t) + 2]; sprintf(buf, "%" PRIu64 "\n", c->tasks_max); - r = cg_set_attribute("pids", path, "pids.max", buf); + (void) set_attribute_and_warn(u, "pids", "pids.max", buf); } else - r = cg_set_attribute("pids", path, "pids.max", "max"); - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set pids.max: %m"); + (void) set_attribute_and_warn(u, "pids", "pids.max", "max"); } } From 611c4f8afb2a941b8d77ff8fa9b661cce44f7329 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 20 Nov 2018 22:42:16 +0100 Subject: [PATCH 08/29] =?UTF-8?q?cgroup:=20rename=20{manager=5Fowns|unit?= =?UTF-8?q?=5Fhas}=5Froot=5Fcgroup()=20=E2=86=92=20..=5Fhost=5Froot=5Fcgro?= =?UTF-8?q?up()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's emphasize that this function checks for the host root cgroup, i.e. returns false for the root cgroup when we run in a container where CLONE_NEWCGROUP is used. There has been some confusion around this already, for example cgroup_context_apply() uses the function incorrectly (which we'll fix in a later commit). Just some refactoring, not change in behaviour. --- src/core/cgroup.c | 16 ++++++++-------- src/core/cgroup.h | 4 ++-- src/core/slice.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 3bbfead728..823ed6cd42 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -31,7 +31,7 @@ * out specific attributes from us. */ #define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING) -bool manager_owns_root_cgroup(Manager *m) { +bool manager_owns_host_root_cgroup(Manager *m) { assert(m); /* Returns true if we are managing the root cgroup. Note that it isn't sufficient to just check whether the @@ -45,13 +45,13 @@ bool manager_owns_root_cgroup(Manager *m) { return empty_or_root(m->cgroup_root); } -bool unit_has_root_cgroup(Unit *u) { +bool unit_has_host_root_cgroup(Unit *u) { assert(u); /* Returns whether this unit manages the root cgroup. This will return true if this unit is the root slice and * the manager manages the root cgroup. */ - if (!manager_owns_root_cgroup(u->manager)) + if (!manager_owns_host_root_cgroup(u->manager)) return false; return unit_has_name(u, SPECIAL_ROOT_SLICE); @@ -789,7 +789,7 @@ static void cgroup_context_apply( return; /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */ - is_root = unit_has_root_cgroup(u); + is_root = unit_has_host_root_cgroup(u); assert_se(c = unit_get_cgroup_context(u)); assert_se(path = u->cgroup_path); @@ -2340,7 +2340,7 @@ int manager_setup_cgroup(Manager *m) { (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify"); - } else if (MANAGER_IS_SYSTEM(m) && manager_owns_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) { + } else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) { /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable, * since it does not generate events when control groups with children run empty. */ @@ -2518,7 +2518,7 @@ int unit_get_memory_current(Unit *u, uint64_t *ret) { return -ENODATA; /* The root cgroup doesn't expose this information, let's get it from /proc instead */ - if (unit_has_root_cgroup(u)) + if (unit_has_host_root_cgroup(u)) return procfs_memory_get_current(ret); if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0) @@ -2553,7 +2553,7 @@ int unit_get_tasks_current(Unit *u, uint64_t *ret) { return -ENODATA; /* The root cgroup doesn't expose this information, let's get it from /proc instead */ - if (unit_has_root_cgroup(u)) + if (unit_has_host_root_cgroup(u)) return procfs_tasks_get_current(ret); if ((u->cgroup_realized_mask & CGROUP_MASK_PIDS) == 0) @@ -2580,7 +2580,7 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) { return -ENODATA; /* The root cgroup doesn't expose this information, let's get it from /proc instead */ - if (unit_has_root_cgroup(u)) + if (unit_has_host_root_cgroup(u)) return procfs_cpu_get_usage(ret); r = cg_all_unified(); diff --git a/src/core/cgroup.h b/src/core/cgroup.h index d7daed3fe0..43970f0827 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -204,8 +204,8 @@ int unit_reset_ip_accounting(Unit *u); cc ? cc->name : false; \ }) -bool manager_owns_root_cgroup(Manager *m); -bool unit_has_root_cgroup(Unit *u); +bool manager_owns_host_root_cgroup(Manager *m); +bool unit_has_host_root_cgroup(Unit *u); int manager_notify_cgroup_empty(Manager *m, const char *group); diff --git a/src/core/slice.c b/src/core/slice.c index 74d056f7bc..dc087680e1 100644 --- a/src/core/slice.c +++ b/src/core/slice.c @@ -330,7 +330,7 @@ static void slice_enumerate_perpetual(Manager *m) { assert(m); r = slice_make_perpetual(m, SPECIAL_ROOT_SLICE, &u); - if (r >= 0 && manager_owns_root_cgroup(m)) { + if (r >= 0 && manager_owns_host_root_cgroup(m)) { Slice *s = SLICE(u); /* If we are managing the root cgroup then this means our root slice covers the whole system, which From 28cfdc5aebc044c82b6716fa1fedf847c060d043 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 20 Nov 2018 22:46:03 +0100 Subject: [PATCH 09/29] cgroup: tighten manager_owns_host_root_cgroup() a bit This tightening is not strictly necessary (as the m->cgroup_root check further down does the same), but let's make this explicit. --- src/core/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 823ed6cd42..bc676fc698 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -39,6 +39,9 @@ bool manager_owns_host_root_cgroup(Manager *m) { * appears to be no nice way to detect whether we are in a CLONE_NEWCGROUP namespace we instead just check if * we run in any kind of container virtualization. */ + if (MANAGER_IS_USER(m)) + return false; + if (detect_container() > 0) return false; From 589a5f7a389e586bab065dbc1700c96dd5a9a967 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 20 Nov 2018 22:50:13 +0100 Subject: [PATCH 10/29] cgroup: append \n to static strings we write to cgroup attributes This is a bit cleaner since we when we format numeric limits we append it. And this way write_string_file() doesn't have to append it. --- src/core/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index bc676fc698..e7b1b2bac8 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -609,7 +609,7 @@ static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t qu xsprintf(buf, USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC); (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", buf); } else - (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1"); + (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n"); } static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) { @@ -756,7 +756,7 @@ static bool cgroup_context_has_unified_memory_config(CGroupContext *c) { } static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) { - char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max"; + char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max\n"; if (v != CGROUP_LIMIT_MAX) xsprintf(buf, "%" PRIu64 "\n", v); @@ -1136,7 +1136,7 @@ static void cgroup_context_apply( sprintf(buf, "%" PRIu64 "\n", c->tasks_max); (void) set_attribute_and_warn(u, "pids", "pids.max", buf); } else - (void) set_attribute_and_warn(u, "pids", "pids.max", "max"); + (void) set_attribute_and_warn(u, "pids", "pids.max", "max\n"); } } From 52fecf20b9269f4675226a8e600cac0bad0539ff Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 20 Nov 2018 23:05:12 +0100 Subject: [PATCH 11/29] cgroup: fine tune when to apply cgroup attributes to the root cgroup Let's tweak when precisely to apply cgroup attributes on the root cgroup. With this we now follow the following rules: 1. On cgroupsv2 we never apply any regular cgroups to the host root, since the attributes generally do not exist there. 2. On cgroupsv1 we do not apply any "weight" or "shares" style attributes to the host root cgroup, since they don't make much sense on the top level where there's only one group, hence no need to compare weights against each other. The other attributes are applied to the host root cgroup however. 3. In any case we don't apply attributes to the root of container environments (and --user roots), under the assumption that this is managed by the manager further up. (Note that on cgroupsv2 this is even enforced by the kernel) 4. BPF pseudo-attributes are applied in all cases (since we can have as many of them as we want) --- src/core/cgroup.c | 297 +++++++++++++++++++++++++++------------------- 1 file changed, 174 insertions(+), 123 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index e7b1b2bac8..6b648c163a 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -581,26 +581,33 @@ static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) return CGROUP_CPU_SHARES_DEFAULT; } -static void cgroup_apply_unified_cpu_config(Unit *u, uint64_t weight, uint64_t quota) { - char buf[MAX(DECIMAL_STR_MAX(uint64_t) + 1, (DECIMAL_STR_MAX(usec_t) + 1) * 2)]; +static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) { + char buf[DECIMAL_STR_MAX(uint64_t) + 2]; xsprintf(buf, "%" PRIu64 "\n", weight); (void) set_attribute_and_warn(u, "cpu", "cpu.weight", buf); +} + +static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota) { + char buf[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1]; if (quota != USEC_INFINITY) xsprintf(buf, USEC_FMT " " USEC_FMT "\n", quota * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC, CGROUP_CPU_QUOTA_PERIOD_USEC); else xsprintf(buf, "max " USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC); - (void) set_attribute_and_warn(u, "cpu", "cpu.max", buf); } -static void cgroup_apply_legacy_cpu_config(Unit *u, uint64_t shares, uint64_t quota) { - char buf[MAX(DECIMAL_STR_MAX(uint64_t), DECIMAL_STR_MAX(usec_t)) + 1]; +static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) { + char buf[DECIMAL_STR_MAX(uint64_t) + 2]; xsprintf(buf, "%" PRIu64 "\n", shares); (void) set_attribute_and_warn(u, "cpu", "cpu.shares", buf); +} + +static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota) { + char buf[DECIMAL_STR_MAX(usec_t) + 2]; xsprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC); (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", buf); @@ -782,7 +789,7 @@ static void cgroup_context_apply( const char *path; CGroupContext *c; - bool is_root; + bool is_host_root, is_local_root; int r; assert(u); @@ -791,118 +798,136 @@ static void cgroup_context_apply( if (apply_mask == 0) return; - /* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */ - is_root = unit_has_host_root_cgroup(u); + /* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other + * attributes should only be managed for cgroups further down the tree. */ + is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE); + is_host_root = unit_has_host_root_cgroup(u); assert_se(c = unit_get_cgroup_context(u)); assert_se(path = u->cgroup_path); - if (is_root) /* Make sure we don't try to display messages with an empty path. */ + if (is_local_root) /* Make sure we don't try to display messages with an empty path. */ path = "/"; /* We generally ignore errors caused by read-only mounted * cgroup trees (assuming we are running in a container then), * and missing cgroups, i.e. EROFS and ENOENT. */ - if ((apply_mask & CGROUP_MASK_CPU) && !is_root) { + if (apply_mask & CGROUP_MASK_CPU) { bool has_weight, has_shares; has_weight = cgroup_context_has_cpu_weight(c); has_shares = cgroup_context_has_cpu_shares(c); if (cg_all_unified() > 0) { - uint64_t weight; - if (has_weight) - weight = cgroup_context_cpu_weight(c, state); - else if (has_shares) { - uint64_t shares = cgroup_context_cpu_shares(c, state); + /* In fully unified mode these attributes don't exist on the host cgroup root, and inside of + * containers we want to leave control of these to the container manager (and if delegation is + * used we couldn't even write to them if we wanted to). */ + if (!is_local_root) { + uint64_t weight; - weight = cgroup_cpu_shares_to_weight(shares); + if (has_weight) + weight = cgroup_context_cpu_weight(c, state); + else if (has_shares) { + uint64_t shares; - log_cgroup_compat(u, "Applying [Startup]CPUShares %" PRIu64 " as [Startup]CPUWeight %" PRIu64 " on %s", - shares, weight, path); - } else - weight = CGROUP_WEIGHT_DEFAULT; + shares = cgroup_context_cpu_shares(c, state); + weight = cgroup_cpu_shares_to_weight(shares); + + log_cgroup_compat(u, "Applying [Startup]CPUShares %" PRIu64 " as [Startup]CPUWeight %" PRIu64 " on %s", + shares, weight, path); + } else + weight = CGROUP_WEIGHT_DEFAULT; + + cgroup_apply_unified_cpu_weight(u, weight); + cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec); + } - cgroup_apply_unified_cpu_config(u, weight, c->cpu_quota_per_sec_usec); } else { - uint64_t shares; + /* Setting the weight makes very little sense on the host root cgroup, as there are no other + * cgroups at this level. And for containers we want to leave management of this to the + * container manager */ + if (!is_local_root) { + uint64_t shares; - if (has_weight) { - uint64_t weight = cgroup_context_cpu_weight(c, state); + if (has_weight) { + uint64_t weight; - shares = cgroup_cpu_weight_to_shares(weight); + weight = cgroup_context_cpu_weight(c, state); + shares = cgroup_cpu_weight_to_shares(weight); - log_cgroup_compat(u, "Applying [Startup]CPUWeight %" PRIu64 " as [Startup]CPUShares %" PRIu64 " on %s", - weight, shares, path); - } else if (has_shares) - shares = cgroup_context_cpu_shares(c, state); - else - shares = CGROUP_CPU_SHARES_DEFAULT; + log_cgroup_compat(u, "Applying [Startup]CPUWeight %" PRIu64 " as [Startup]CPUShares %" PRIu64 " on %s", + weight, shares, path); + } else if (has_shares) + shares = cgroup_context_cpu_shares(c, state); + else + shares = CGROUP_CPU_SHARES_DEFAULT; - cgroup_apply_legacy_cpu_config(u, shares, c->cpu_quota_per_sec_usec); + cgroup_apply_legacy_cpu_shares(u, shares); + } + + /* The "cpu" quota attribute is available on the host root, hence manage it there. But in + * containers let's leave this to the container manager. */ + if (is_host_root || !is_local_root) + cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec); } } - if (apply_mask & CGROUP_MASK_IO) { - bool has_io = cgroup_context_has_io_config(c); - bool has_blockio = cgroup_context_has_blockio_config(c); + /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroupsv2 + * controller), and in case of containers we want to leave control of these attributes to the container manager + * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */ + if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) { + char buf[8+DECIMAL_STR_MAX(uint64_t)+1]; + bool has_io, has_blockio; + uint64_t weight; - if (!is_root) { - char buf[8+DECIMAL_STR_MAX(uint64_t)+1]; - uint64_t weight; + has_io = cgroup_context_has_io_config(c); + has_blockio = cgroup_context_has_blockio_config(c); - if (has_io) - weight = cgroup_context_io_weight(c, state); - else if (has_blockio) { - uint64_t blkio_weight = cgroup_context_blkio_weight(c, state); + if (has_io) + weight = cgroup_context_io_weight(c, state); + else if (has_blockio) { + uint64_t blkio_weight; - weight = cgroup_weight_blkio_to_io(blkio_weight); + blkio_weight = cgroup_context_blkio_weight(c, state); + weight = cgroup_weight_blkio_to_io(blkio_weight); - log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64, - blkio_weight, weight); - } else - weight = CGROUP_WEIGHT_DEFAULT; + log_cgroup_compat(u, "Applying [Startup]BlockIOWeight %" PRIu64 " as [Startup]IOWeight %" PRIu64, + blkio_weight, weight); + } else + weight = CGROUP_WEIGHT_DEFAULT; - xsprintf(buf, "default %" PRIu64 "\n", weight); - (void) set_attribute_and_warn(u, "io", "io.weight", buf); - - if (has_io) { - CGroupIODeviceWeight *w; - - LIST_FOREACH(device_weights, w, c->io_device_weights) - cgroup_apply_io_device_weight(u, w->path, w->weight); - } else if (has_blockio) { - CGroupBlockIODeviceWeight *w; - - LIST_FOREACH(device_weights, w, c->blockio_device_weights) { - weight = cgroup_weight_blkio_to_io(w->weight); - - log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s", - w->weight, weight, w->path); - - cgroup_apply_io_device_weight(u, w->path, weight); - } - } - - if (has_io) { - CGroupIODeviceLatency *l; - - LIST_FOREACH(device_latencies, l, c->io_device_latencies) - cgroup_apply_io_device_latency(u, l->path, l->target_usec); - } - } + xsprintf(buf, "default %" PRIu64 "\n", weight); + (void) set_attribute_and_warn(u, "io", "io.weight", buf); if (has_io) { - CGroupIODeviceLimit *l; + CGroupIODeviceLatency *latency; + CGroupIODeviceLimit *limit; + CGroupIODeviceWeight *w; - LIST_FOREACH(device_limits, l, c->io_device_limits) - cgroup_apply_io_device_limit(u, l->path, l->limits); + LIST_FOREACH(device_weights, w, c->io_device_weights) + cgroup_apply_io_device_weight(u, w->path, w->weight); + + LIST_FOREACH(device_limits, limit, c->io_device_limits) + cgroup_apply_io_device_limit(u, limit->path, limit->limits); + + LIST_FOREACH(device_latencies, latency, c->io_device_latencies) + cgroup_apply_io_device_latency(u, latency->path, latency->target_usec); } else if (has_blockio) { + CGroupBlockIODeviceWeight *w; CGroupBlockIODeviceBandwidth *b; + LIST_FOREACH(device_weights, w, c->blockio_device_weights) { + weight = cgroup_weight_blkio_to_io(w->weight); + + log_cgroup_compat(u, "Applying BlockIODeviceWeight %" PRIu64 " as IODeviceWeight %" PRIu64 " for %s", + w->weight, weight, w->path); + + cgroup_apply_io_device_weight(u, w->path, weight); + } + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) { uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX]; CGroupIOLimitType type; @@ -922,16 +947,21 @@ static void cgroup_context_apply( } if (apply_mask & CGROUP_MASK_BLKIO) { - bool has_io = cgroup_context_has_io_config(c); - bool has_blockio = cgroup_context_has_blockio_config(c); + bool has_io, has_blockio; - if (!is_root) { + has_io = cgroup_context_has_io_config(c); + has_blockio = cgroup_context_has_blockio_config(c); + + /* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be + * left to our container manager, too. */ + if (!is_local_root) { char buf[DECIMAL_STR_MAX(uint64_t)+1]; uint64_t weight; if (has_io) { - uint64_t io_weight = cgroup_context_io_weight(c, state); + uint64_t io_weight; + io_weight = cgroup_context_io_weight(c, state); weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state)); log_cgroup_compat(u, "Applying [Startup]IOWeight %" PRIu64 " as [Startup]BlockIOWeight %" PRIu64, @@ -963,62 +993,80 @@ static void cgroup_context_apply( } } - if (has_io) { - CGroupIODeviceLimit *l; + /* The bandwith limits are something that make sense to be applied to the host's root but not container + * roots, as there we want the container manager to handle it */ + if (is_host_root || !is_local_root) { + if (has_io) { + CGroupIODeviceLimit *l; - LIST_FOREACH(device_limits, l, c->io_device_limits) { - log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s", - l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path); + LIST_FOREACH(device_limits, l, c->io_device_limits) { + log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth %" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax for %s", + l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path); - cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]); + cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]); + } + } else if (has_blockio) { + CGroupBlockIODeviceBandwidth *b; + + LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) + cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps); } - } else if (has_blockio) { - CGroupBlockIODeviceBandwidth *b; - - LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) - cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps); } } - if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) { + if (apply_mask & CGROUP_MASK_MEMORY) { + if (cg_all_unified() > 0) { - uint64_t max, swap_max = CGROUP_LIMIT_MAX; + /* In unified mode 'memory' attributes do not exist on the root cgroup. And if we run in a + * container we want to leave control to the container manager (and if proper delegation is + * used we couldn't even write to this if we wanted to. */ + if (!is_local_root) { + uint64_t max, swap_max = CGROUP_LIMIT_MAX; - if (cgroup_context_has_unified_memory_config(c)) { - max = c->memory_max; - swap_max = c->memory_swap_max; - } else { - max = c->memory_limit; + if (cgroup_context_has_unified_memory_config(c)) { + max = c->memory_max; + swap_max = c->memory_swap_max; + } else { + max = c->memory_limit; - if (max != CGROUP_LIMIT_MAX) - log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max); + if (max != CGROUP_LIMIT_MAX) + log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max); + } + + cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min); + cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low); + cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high); + cgroup_apply_unified_memory_limit(u, "memory.max", max); + cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max); } - - cgroup_apply_unified_memory_limit(u, "memory.min", c->memory_min); - cgroup_apply_unified_memory_limit(u, "memory.low", c->memory_low); - cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high); - cgroup_apply_unified_memory_limit(u, "memory.max", max); - cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max); } else { - char buf[DECIMAL_STR_MAX(uint64_t) + 1]; - uint64_t val; - if (cgroup_context_has_unified_memory_config(c)) { - val = c->memory_max; - log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val); - } else - val = c->memory_limit; + /* In legacy mode 'memory' exists on the host root, but in container mode we want to leave it + * to the container manager around us */ + if (is_host_root || !is_local_root) { + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + uint64_t val; - if (val == CGROUP_LIMIT_MAX) - strncpy(buf, "-1\n", sizeof(buf)); - else - xsprintf(buf, "%" PRIu64 "\n", val); + if (cgroup_context_has_unified_memory_config(c)) { + val = c->memory_max; + log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val); + } else + val = c->memory_limit; - (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf); + if (val == CGROUP_LIMIT_MAX) + strncpy(buf, "-1\n", sizeof(buf)); + else + xsprintf(buf, "%" PRIu64 "\n", val); + + (void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf); + } } } - if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) && !is_root) { + /* On cgroupsv2 we can apply BPF everywhre. On cgroupsv1 we apply it everywhere except for the root of + * containers, where we leave this to the manager */ + if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) && + (is_host_root || cg_all_unified() > 0 || !is_local_root)) { _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL; CGroupDeviceAllow *a; @@ -1103,7 +1151,7 @@ static void cgroup_context_apply( if (apply_mask & CGROUP_MASK_PIDS) { - if (is_root) { + if (is_host_root) { /* So, the "pids" controller does not expose anything on the root cgroup, in order not to * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a @@ -1128,8 +1176,11 @@ static void cgroup_context_apply( if (r < 0) log_unit_full(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to write to tasks limit sysctls: %m"); + } - } else { + /* The attribute itself is not available on the host root cgroup, and in the container case we want to + * leave it for the container manager. */ + if (!is_local_root) { if (c->tasks_max != CGROUP_LIMIT_MAX) { char buf[DECIMAL_STR_MAX(uint64_t) + 2]; From 53aea74a6006cf6d2a2e56cbe678c4d5d0603145 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 21 Nov 2018 17:42:40 +0100 Subject: [PATCH 12/29] cgroup: make some functions static --- src/core/cgroup.c | 4 ++-- src/core/cgroup.h | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 6b648c163a..4b9ddd176c 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1195,7 +1195,7 @@ static void cgroup_context_apply( cgroup_apply_firewall(u); } -CGroupMask cgroup_context_get_mask(CGroupContext *c) { +static CGroupMask cgroup_context_get_mask(CGroupContext *c) { CGroupMask mask = 0; /* Figure out which controllers we need, based on the cgroup context object */ @@ -1227,7 +1227,7 @@ CGroupMask cgroup_context_get_mask(CGroupContext *c) { return CGROUP_MASK_EXTEND_JOINED(mask); } -CGroupMask unit_get_bpf_mask(Unit *u) { +static CGroupMask unit_get_bpf_mask(Unit *u) { CGroupMask mask = 0; /* Figure out which controllers we need, based on the cgroup context, possibly taking into account children diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 43970f0827..5068e6971f 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -137,8 +137,6 @@ void cgroup_context_init(CGroupContext *c); void cgroup_context_done(CGroupContext *c); void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix); -CGroupMask cgroup_context_get_mask(CGroupContext *c); - void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a); void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w); void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l); @@ -158,7 +156,6 @@ CGroupMask unit_get_target_mask(Unit *u); CGroupMask unit_get_enable_mask(Unit *u); bool unit_get_needs_bpf_firewall(Unit *u); -CGroupMask unit_get_bpf_mask(Unit *u); void unit_update_cgroup_members_masks(Unit *u); From 164924458866948e74c39e6a2f98bbfd03533003 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 21 Nov 2018 17:44:14 +0100 Subject: [PATCH 13/29] cgroup: make unit_get_needs_bpf_firewall() static too --- src/core/cgroup.c | 56 +++++++++++++++++++++++------------------------ src/core/cgroup.h | 2 -- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 4b9ddd176c..1aab238866 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1195,6 +1195,34 @@ static void cgroup_context_apply( cgroup_apply_firewall(u); } +static bool unit_get_needs_bpf_firewall(Unit *u) { + CGroupContext *c; + Unit *p; + assert(u); + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + if (c->ip_accounting || + c->ip_address_allow || + c->ip_address_deny) + return true; + + /* If any parent slice has an IP access list defined, it applies too */ + for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) { + c = unit_get_cgroup_context(p); + if (!c) + return false; + + if (c->ip_address_allow || + c->ip_address_deny) + return true; + } + + return false; +} + static CGroupMask cgroup_context_get_mask(CGroupContext *c) { CGroupMask mask = 0; @@ -1356,34 +1384,6 @@ CGroupMask unit_get_enable_mask(Unit *u) { return mask; } -bool unit_get_needs_bpf_firewall(Unit *u) { - CGroupContext *c; - Unit *p; - assert(u); - - c = unit_get_cgroup_context(u); - if (!c) - return false; - - if (c->ip_accounting || - c->ip_address_allow || - c->ip_address_deny) - return true; - - /* If any parent slice has an IP access list defined, it applies too */ - for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) { - c = unit_get_cgroup_context(p); - if (!c) - return false; - - if (c->ip_address_allow || - c->ip_address_deny) - return true; - } - - return false; -} - /* Recurse from a unit up through its containing slices, propagating * mask bits upward. A unit is also member of itself. */ void unit_update_cgroup_members_masks(Unit *u) { diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 5068e6971f..2c7287841a 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -155,8 +155,6 @@ CGroupMask unit_get_subtree_mask(Unit *u); CGroupMask unit_get_target_mask(Unit *u); CGroupMask unit_get_enable_mask(Unit *u); -bool unit_get_needs_bpf_firewall(Unit *u); - void unit_update_cgroup_members_masks(Unit *u); const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask); From 0adf88b68ce71b49009d731ac6d96d9d59c4f2a9 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 21 Nov 2018 17:48:41 +0100 Subject: [PATCH 14/29] cgroup: dump delegation mask too --- src/core/unit.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/core/unit.c b/src/core/unit.c index f6c2e08b55..392cc2d7c5 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -1155,17 +1155,20 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) { (void) cg_mask_to_string(u->cgroup_realized_mask, &s); fprintf(f, "%s\tCGroup realized mask: %s\n", prefix, strnull(s)); } + if (u->cgroup_enabled_mask != 0) { _cleanup_free_ char *s = NULL; (void) cg_mask_to_string(u->cgroup_enabled_mask, &s); fprintf(f, "%s\tCGroup enabled mask: %s\n", prefix, strnull(s)); } + m = unit_get_own_mask(u); if (m != 0) { _cleanup_free_ char *s = NULL; (void) cg_mask_to_string(m, &s); fprintf(f, "%s\tCGroup own mask: %s\n", prefix, strnull(s)); } + m = unit_get_members_mask(u); if (m != 0) { _cleanup_free_ char *s = NULL; @@ -1173,6 +1176,13 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) { fprintf(f, "%s\tCGroup members mask: %s\n", prefix, strnull(s)); } + m = unit_get_delegate_mask(u); + if (m != 0) { + _cleanup_free_ char *s = NULL; + (void) cg_mask_to_string(m, &s); + fprintf(f, "%s\tCGroup delegate mask: %s\n", prefix, strnull(s)); + } + SET_FOREACH(t, u->names, i) fprintf(f, "%s\tName: %s\n", prefix, t); From 442ce7759c668bf9857eff13a90b0cfa6be8d426 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 21 Nov 2018 18:25:37 +0100 Subject: [PATCH 15/29] cgroup: units that aren't loaded properly should not result in cgroup controllers being pulled in This shouldn't make much difference in real life, but is a bit cleaner. --- src/core/cgroup.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 1aab238866..bc6df17c0b 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1270,7 +1270,11 @@ static CGroupMask unit_get_bpf_mask(Unit *u) { CGroupMask unit_get_own_mask(Unit *u) { CGroupContext *c; - /* Returns the mask of controllers the unit needs for itself */ + /* Returns the mask of controllers the unit needs for itself. If a unit is not properly loaded, return an empty + * mask, as we shouldn't reflect it in the cgroup hierarchy then. */ + + if (u->load_state != UNIT_LOADED) + return 0; c = unit_get_cgroup_context(u); if (!c) From 26a17ca280a8b3c28d5514dbe7e3635783a5515b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 21 Nov 2018 18:27:15 +0100 Subject: [PATCH 16/29] cgroup: add explanatory comment --- src/core/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index bc6df17c0b..7e5ee1b25b 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1312,7 +1312,7 @@ CGroupMask unit_get_members_mask(Unit *u) { /* Returns the mask of controllers all of the unit's children require, merged */ if (u->cgroup_members_mask_valid) - return u->cgroup_members_mask; + return u->cgroup_members_mask; /* Use cached value if possible */ u->cgroup_members_mask = 0; From 54b5ba1d1f4dcfe181bb3f1e22c7b8d229cae633 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Nov 2018 12:01:32 +0100 Subject: [PATCH 17/29] cgroup: propagate errors when we cannot open cgroup.subtree_control --- src/basic/cgroup-util.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index 94ae37dcb0..0cd77e7f29 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -2635,10 +2635,8 @@ int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) { if (!f) { f = fopen(fs, "we"); - if (!f) { - log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p); - break; - } + if (!f) + return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p); } r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER); From 94f344fb03101273471eeaa8c821a657255fd076 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Nov 2018 12:02:13 +0100 Subject: [PATCH 18/29] cgroup: tweak log message, so that it doesn't claim we always enable controllers when we actually disable them --- src/basic/cgroup-util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index 0cd77e7f29..2d30391489 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -2641,7 +2641,8 @@ int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) { r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER); if (r < 0) { - log_debug_errno(r, "Failed to enable controller %s for %s (%s): %m", n, p, fs); + log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m", + FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs); clearerr(f); } } From 27adcc973771a998433635672e2eee0a4489b8a4 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Nov 2018 21:45:33 +0100 Subject: [PATCH 19/29] cgroup: be more careful with which controllers we can enable/disable on a cgroup This changes cg_enable_everywhere() to return which controllers are enabled for the specified cgroup. This information is then used to correctly track the enablement mask currently in effect for a unit. Moreover, when we try to turn off a controller, and this works, then this is indicates that the parent unit might succesfully turn it off now, too as our unit might have kept it busy. So far, when realizing cgroups, i.e. when syncing up the kernel representation of relevant cgroups with our own idea we would strictly work from the root to the leaves. This is generally a good approach, as when controllers are enabled this has to happen in root-to-leaves order. However, when controllers are disabled this has to happen in the opposite order: in leaves-to-root order (this is because controllers can only be enabled in a child if it is already enabled in the parent, and if it shall be disabled in the parent then it has to be disabled in the child first, otherwise it is considered busy when it is attempted to remove it in the parent). To make things complicated when invalidating a unit's cgroup membershup systemd can actually turn off some controllers previously turned on at the very same time as it turns on other controllers previously turned off. In such a case we have to work up leaves-to-root *and* root-to-leaves right after each other. With this patch this is implemented: we still generally operate root-to-leaves, but as soon as we noticed we successfully turned off a controller previously turned on for a cgroup we'll re-enqueue the cgroup realization for all parents of a unit, thus implementing leaves-to-root where necessary. --- src/basic/cgroup-util.c | 58 ++++++++++++++++++++++++++++++++++++-- src/basic/cgroup-util.h | 2 +- src/core/cgroup.c | 31 ++++++++++++++++---- src/core/cgroup.h | 3 +- src/nspawn/nspawn-cgroup.c | 2 +- 5 files changed, 84 insertions(+), 12 deletions(-) diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index 2d30391489..d38452cfcb 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -2595,22 +2595,45 @@ int cg_unified_flush(void) { return cg_unified_update(); } -int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) { +int cg_enable_everywhere( + CGroupMask supported, + CGroupMask mask, + const char *p, + CGroupMask *ret_result_mask) { + _cleanup_fclose_ FILE *f = NULL; _cleanup_free_ char *fs = NULL; CGroupController c; + CGroupMask ret = 0; int r; assert(p); - if (supported == 0) + if (supported == 0) { + if (ret_result_mask) + *ret_result_mask = 0; return 0; + } r = cg_all_unified(); if (r < 0) return r; - if (r == 0) /* on the legacy hiearchy there's no joining of controllers defined */ + if (r == 0) { + /* On the legacy hiearchy there's no concept of "enabling" controllers in cgroups defined. Let's claim + * complete success right away. (If you wonder why we return the full mask here, rather than zero: the + * caller tends to use the returned mask later on to compare if all controllers where properly joined, + * and if not requeues realization. This use is the primary purpose of the return value, hence let's + * minimize surprises here and reduce triggers for re-realization by always saying we fully + * succeeded.) */ + if (ret_result_mask) + *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with + * CGROUP_MASK_V2: The 'supported' mask + * might contain pure-V1 or BPF + * controllers, and we never want to + * claim that we could enable those with + * cgroup.subtree_control */ return 0; + } r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs); if (r < 0) @@ -2644,10 +2667,39 @@ int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) { log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m", FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs); clearerr(f); + + /* If we can't turn off a controller, leave it on in the reported resulting mask. This + * happens for example when we attempt to turn off a controller up in the tree that is + * used down in the tree. */ + if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY + * only here, and not follow the same logic + * for other errors such as EINVAL or + * EOPNOTSUPP or anything else. That's + * because EBUSY indicates that the + * controllers is currently enabled and + * cannot be disabled because something down + * the hierarchy is still using it. Any other + * error most likely means something like "I + * never heard of this controller" or + * similar. In the former case it's hence + * safe to assume the controller is still on + * after the failed operation, while in the + * latter case it's safer to assume the + * controller is unknown and hence certainly + * not enabled. */ + ret |= bit; + } else { + /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */ + if (FLAGS_SET(mask, bit)) + ret |= bit; } } } + /* Let's return the precise set of controllers now enabled for the cgroup. */ + if (ret_result_mask) + *ret_result_mask = ret; + return 0; } diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h index 7919864df9..ea9a333290 100644 --- a/src/basic/cgroup-util.h +++ b/src/basic/cgroup-util.h @@ -245,7 +245,7 @@ int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_m int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t callback, void *userdata); int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t callback, void *userdata); int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root); -int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p); +int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p, CGroupMask *ret_result_mask); int cg_mask_supported(CGroupMask *ret); int cg_mask_from_string(const char *s, CGroupMask *ret); diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 7e5ee1b25b..ec58d7820b 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1593,8 +1593,8 @@ static int unit_create_cgroup( CGroupMask enable_mask) { CGroupContext *c; - int r; bool created; + int r; assert(u); @@ -1618,18 +1618,37 @@ static int unit_create_cgroup( /* Preserve enabled controllers in delegated units, adjust others. */ if (created || !unit_cgroup_delegate(u)) { + CGroupMask result_mask = 0; /* Enable all controllers we need */ - r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path); + r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask); if (r < 0) - log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", - u->cgroup_path); + log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", u->cgroup_path); + + /* If we just turned off a controller, this might release the controller for our parent too, let's + * enqueue the parent for re-realization in that case again. */ + if (UNIT_ISSET(u->slice)) { + CGroupMask turned_off; + + turned_off = (u->cgroup_realized ? u->cgroup_enabled_mask & ~result_mask : 0); + if (turned_off != 0) { + Unit *parent; + + /* Force the parent to propagate the enable mask to the kernel again, by invalidating + * the controller we just turned off. */ + + for (parent = UNIT_DEREF(u->slice); parent; parent = UNIT_DEREF(parent->slice)) + unit_invalidate_cgroup(parent, turned_off); + } + } + + /* Remember what's actually enabled now */ + u->cgroup_enabled_mask = result_mask; } /* Keep track that this is now realized */ u->cgroup_realized = true; u->cgroup_realized_mask = target_mask; - u->cgroup_enabled_mask = enable_mask; if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) { @@ -1815,7 +1834,7 @@ static bool unit_has_mask_realized( u->cgroup_invalidated_mask == 0; } -static void unit_add_to_cgroup_realize_queue(Unit *u) { +void unit_add_to_cgroup_realize_queue(Unit *u) { assert(u); if (u->in_cgroup_realize_queue) diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 2c7287841a..64dec408cc 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -151,12 +151,13 @@ CGroupMask unit_get_delegate_mask(Unit *u); CGroupMask unit_get_members_mask(Unit *u); CGroupMask unit_get_siblings_mask(Unit *u); CGroupMask unit_get_subtree_mask(Unit *u); - CGroupMask unit_get_target_mask(Unit *u); CGroupMask unit_get_enable_mask(Unit *u); void unit_update_cgroup_members_masks(Unit *u); +void unit_add_to_cgroup_realize_queue(Unit *u); + const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask); char *unit_default_cgroup_path(Unit *u); int unit_set_cgroup_path(Unit *u, const char *path); diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index 0197474dbc..53c42f0ee4 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -189,7 +189,7 @@ int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) } /* Try to enable as many controllers as possible for the new payload. */ - (void) cg_enable_everywhere(supported, supported, cgroup); + (void) cg_enable_everywhere(supported, supported, cgroup, NULL); return 0; } From e00068e71fcf33ccd9190cb18d1539cd921cf444 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Nov 2018 22:02:53 +0100 Subject: [PATCH 20/29] cgroup: in unit_invalidate_cgroup() actually modify invalidation mask Previously this would manipulate the realization mask for invalidating the realization. This is a bit ugly though as the realization mask's primary purpose to is to reflect in which hierarchies a cgroup currently exists, and it's probably a good idea to keep that in sync with realities. We nowadays have the an explicit fields for invalidating cgroup controller information, the "cgroup_invalidated_mask", let's use this one instead. The effect is pretty much the same, as the main consumer of these masks (unit_has_mask_realize()) checks both anyway. --- src/core/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index ec58d7820b..0e198ce025 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -2824,10 +2824,10 @@ void unit_invalidate_cgroup(Unit *u, CGroupMask m) { if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT)) m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT; - if ((u->cgroup_realized_mask & m) == 0) /* NOP? */ + if (FLAGS_SET(u->cgroup_invalidated_mask, m)) /* NOP? */ return; - u->cgroup_realized_mask &= ~m; + u->cgroup_invalidated_mask |= m; unit_add_to_cgroup_realize_queue(u); } From 27c4ed790a1e6be83862d3e562baadf2ba48f447 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Nov 2018 22:11:07 +0100 Subject: [PATCH 21/29] cgroup: simplify check whether it makes sense to realize a cgroup --- src/core/cgroup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 0e198ce025..3c5963dd09 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1592,14 +1592,12 @@ static int unit_create_cgroup( CGroupMask target_mask, CGroupMask enable_mask) { - CGroupContext *c; bool created; int r; assert(u); - c = unit_get_cgroup_context(u); - if (!c) + if (!UNIT_HAS_CGROUP_CONTEXT(u)) return 0; /* Figure out our cgroup path */ From 67558d15ece919977fae1b15af75fcc24e00e4a9 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Nov 2018 00:48:31 +0100 Subject: [PATCH 22/29] cgroup: extend cg_mask_supported() comment a bit --- src/basic/cgroup-util.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index d38452cfcb..3f94602c8f 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -2368,9 +2368,9 @@ int cg_mask_supported(CGroupMask *ret) { CGroupMask mask; int r; - /* Determines the mask of supported cgroup controllers. Only - * includes controllers we can make sense of and that are - * actually accessible. */ + /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that + * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz + * pseudo-controllers. */ r = cg_all_unified(); if (r < 0) From 5a62e5e2ac564dfa5bfc53ad378985ddc3a7a9d7 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Nov 2018 00:49:31 +0100 Subject: [PATCH 23/29] cgroup: document what the various masks variables are used for --- src/core/unit.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core/unit.h b/src/core/unit.h index 15cd02e23a..180f852a25 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -247,11 +247,11 @@ typedef struct Unit { /* Counterparts in the cgroup filesystem */ char *cgroup_path; - CGroupMask cgroup_realized_mask; - CGroupMask cgroup_enabled_mask; - CGroupMask cgroup_invalidated_mask; + CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroupsv1) */ + CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroupsv2) */ + CGroupMask cgroup_invalidated_mask; /* A mask specifiying controllers which shall be considered invalidated, and require re-realization */ CGroupMask cgroup_subtree_mask; - CGroupMask cgroup_members_mask; + CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */ int cgroup_inotify_wd; /* Device Controller BPF program */ From d5095dcd307fc67b7a641d3dd26fed2bb48f2bb8 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Nov 2018 01:02:17 +0100 Subject: [PATCH 24/29] cgroup: tighten call that detects whether we need to realize a unit's cgroup a bit, and comment why --- src/core/cgroup.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 3c5963dd09..1dc62ba45f 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1826,9 +1826,24 @@ static bool unit_has_mask_realized( assert(u); + /* Returns true if this unit is fully realized. We check four things: + * + * 1. Whether the cgroup was created at all + * 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroupsv1) + * 3. Whether the cgroup has all the right controllers enabled (in case of cgroupsv2) + * 4. Whether the invalidation mask is currently zero + * + * If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note + * that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroupv1 controllers), CGROUP_MASK_V2 (for + * real cgroupv2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask + * is only matters for cgroupsv1 controllers, and cgroup_enabled_mask only used for cgroupsv2, and if they + * differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks with controllers are + * enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they + * simply don't matter. */ + return u->cgroup_realized && - u->cgroup_realized_mask == target_mask && - u->cgroup_enabled_mask == enable_mask && + ((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 && + ((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 && u->cgroup_invalidated_mask == 0; } From 1fd3a10c38a0be4fc42ae94cf9f8401003187b80 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Nov 2018 01:03:18 +0100 Subject: [PATCH 25/29] cgroup: extend reasons when we realize the enable mask After creating a cgroup we need to initialize its "cgroup.subtree_control" file with the controllers its children want to use. Currently we do so whenever the mkdir() on the cgroup succeeded, i.e. when we know the cgroup is "fresh". Let's update the condition slightly that we also do so when internally we assume a cgroup doesn't exist yet, even if it already does (maybe left-over from a previous run). This shouldn't change anything IRL but make things a bit more robust. --- src/core/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 1dc62ba45f..988813c38b 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1615,7 +1615,7 @@ static int unit_create_cgroup( (void) unit_watch_cgroup(u); /* Preserve enabled controllers in delegated units, adjust others. */ - if (created || !unit_cgroup_delegate(u)) { + if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) { CGroupMask result_mask = 0; /* Enable all controllers we need */ From 8a0d538815b72563eb227fa21a67e78f5b290bfe Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Nov 2018 01:07:14 +0100 Subject: [PATCH 26/29] cgroup: extend comment on what unit_release_cgroup() is for --- src/core/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 988813c38b..3803231141 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -2005,7 +2005,8 @@ int unit_realize_cgroup(Unit *u) { void unit_release_cgroup(Unit *u) { assert(u); - /* Forgets all cgroup details for this cgroup */ + /* Forgets all cgroup details for this cgroup — but does *not* destroy the cgroup. This is hence OK to call + * when we close down everything for reexecution, where we really want to leave the cgroup in place. */ if (u->cgroup_path) { (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path); From 5af8805872809e6de4cc4d9495cb1a904772ab4e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Nov 2018 01:07:34 +0100 Subject: [PATCH 27/29] cgroup: drastically simplify caching of cgroups members mask Previously we tried to be smart: when a new unit appeared and it only added controllers to the cgroup mask we'd update the cached members mask in all parents by ORing in the controller flags in their cached values. Unfortunately this was quite broken, as we missed some conditions when this cache had to be reset (for example, when a unit got unloaded), moreover the optimization doesn't work when a controller is removed anyway (as in that case there's no other way for the parent to iterate though all children if any other, remaining child unit still needs it). Hence, let's simplify the logic substantially: instead of updating the cache on the right events (which we didn't get right), let's simply invalidate the cache, and generate it lazily when we encounter it later. This should actually result in better behaviour as we don't have to calculate the new members mask for a whole subtree whever we have the suspicion something changed, but can delay it to the point where we actually need the members mask. This allows us to simplify things quite a bit, which is good, since validating this cache for correctness is hard enough. Fixes: #9512 --- src/core/cgroup.c | 49 +++++------------------------------------ src/core/cgroup.h | 2 +- src/core/dbus-mount.c | 2 +- src/core/dbus-scope.c | 2 +- src/core/dbus-service.c | 2 +- src/core/dbus-slice.c | 2 +- src/core/dbus-socket.c | 2 +- src/core/dbus-swap.c | 2 +- src/core/unit.c | 3 ++- src/core/unit.h | 2 -- 10 files changed, 14 insertions(+), 54 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 3803231141..598b396186 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1388,53 +1388,14 @@ CGroupMask unit_get_enable_mask(Unit *u) { return mask; } -/* Recurse from a unit up through its containing slices, propagating - * mask bits upward. A unit is also member of itself. */ -void unit_update_cgroup_members_masks(Unit *u) { - CGroupMask m; - bool more; - +void unit_invalidate_cgroup_members_masks(Unit *u) { assert(u); - /* Calculate subtree mask */ - m = unit_get_subtree_mask(u); + /* Recurse invalidate the member masks cache all the way up the tree */ + u->cgroup_members_mask_valid = false; - /* See if anything changed from the previous invocation. If - * not, we're done. */ - if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask) - return; - - more = - u->cgroup_subtree_mask_valid && - ((m & ~u->cgroup_subtree_mask) != 0) && - ((~m & u->cgroup_subtree_mask) == 0); - - u->cgroup_subtree_mask = m; - u->cgroup_subtree_mask_valid = true; - - if (UNIT_ISSET(u->slice)) { - Unit *s = UNIT_DEREF(u->slice); - - if (more) - /* There's more set now than before. We - * propagate the new mask to the parent's mask - * (not caring if it actually was valid or - * not). */ - - s->cgroup_members_mask |= m; - - else - /* There's less set now than before (or we - * don't know), we need to recalculate - * everything, so let's invalidate the - * parent's members mask */ - - s->cgroup_members_mask_valid = false; - - /* And now make sure that this change also hits our - * grandparents */ - unit_update_cgroup_members_masks(s); - } + if (UNIT_ISSET(u->slice)) + unit_invalidate_cgroup_members_masks(UNIT_DEREF(u->slice)); } const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) { diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 64dec408cc..828b6f0795 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -154,7 +154,7 @@ CGroupMask unit_get_subtree_mask(Unit *u); CGroupMask unit_get_target_mask(Unit *u); CGroupMask unit_get_enable_mask(Unit *u); -void unit_update_cgroup_members_masks(Unit *u); +void unit_invalidate_cgroup_members_masks(Unit *u); void unit_add_to_cgroup_realize_queue(Unit *u); diff --git a/src/core/dbus-mount.c b/src/core/dbus-mount.c index 3f98d3ecf0..b6d61627eb 100644 --- a/src/core/dbus-mount.c +++ b/src/core/dbus-mount.c @@ -145,7 +145,7 @@ int bus_mount_set_property( int bus_mount_commit_properties(Unit *u) { assert(u); - unit_update_cgroup_members_masks(u); + unit_invalidate_cgroup_members_masks(u); unit_realize_cgroup(u); return 0; diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c index 5d9fe98857..bb807df2e9 100644 --- a/src/core/dbus-scope.c +++ b/src/core/dbus-scope.c @@ -186,7 +186,7 @@ int bus_scope_set_property( int bus_scope_commit_properties(Unit *u) { assert(u); - unit_update_cgroup_members_masks(u); + unit_invalidate_cgroup_members_masks(u); unit_realize_cgroup(u); return 0; diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c index fdf6120610..10f53ef401 100644 --- a/src/core/dbus-service.c +++ b/src/core/dbus-service.c @@ -424,7 +424,7 @@ int bus_service_set_property( int bus_service_commit_properties(Unit *u) { assert(u); - unit_update_cgroup_members_masks(u); + unit_invalidate_cgroup_members_masks(u); unit_realize_cgroup(u); return 0; diff --git a/src/core/dbus-slice.c b/src/core/dbus-slice.c index 722a5688a5..effd5fa5d7 100644 --- a/src/core/dbus-slice.c +++ b/src/core/dbus-slice.c @@ -28,7 +28,7 @@ int bus_slice_set_property( int bus_slice_commit_properties(Unit *u) { assert(u); - unit_update_cgroup_members_masks(u); + unit_invalidate_cgroup_members_masks(u); unit_realize_cgroup(u); return 0; diff --git a/src/core/dbus-socket.c b/src/core/dbus-socket.c index 4ea5b6c6e5..3819653908 100644 --- a/src/core/dbus-socket.c +++ b/src/core/dbus-socket.c @@ -461,7 +461,7 @@ int bus_socket_set_property( int bus_socket_commit_properties(Unit *u) { assert(u); - unit_update_cgroup_members_masks(u); + unit_invalidate_cgroup_members_masks(u); unit_realize_cgroup(u); return 0; diff --git a/src/core/dbus-swap.c b/src/core/dbus-swap.c index b272d10113..353fa20132 100644 --- a/src/core/dbus-swap.c +++ b/src/core/dbus-swap.c @@ -63,7 +63,7 @@ int bus_swap_set_property( int bus_swap_commit_properties(Unit *u) { assert(u); - unit_update_cgroup_members_masks(u); + unit_invalidate_cgroup_members_masks(u); unit_realize_cgroup(u); return 0; diff --git a/src/core/unit.c b/src/core/unit.c index 392cc2d7c5..a8c0f08e95 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -1547,7 +1547,8 @@ int unit_load(Unit *u) { if (u->job_running_timeout != USEC_INFINITY && u->job_running_timeout > u->job_timeout) log_unit_warning(u, "JobRunningTimeoutSec= is greater than JobTimeoutSec=, it has no effect."); - unit_update_cgroup_members_masks(u); + /* We finished loading, let's ensure our parents recalculate the members mask */ + unit_invalidate_cgroup_members_masks(u); } assert((u->load_state != UNIT_MERGED) == !u->merged_into); diff --git a/src/core/unit.h b/src/core/unit.h index 180f852a25..613d7b32e6 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -250,7 +250,6 @@ typedef struct Unit { CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroupsv1) */ CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroupsv2) */ CGroupMask cgroup_invalidated_mask; /* A mask specifiying controllers which shall be considered invalidated, and require re-realization */ - CGroupMask cgroup_subtree_mask; CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */ int cgroup_inotify_wd; @@ -330,7 +329,6 @@ typedef struct Unit { bool cgroup_realized:1; bool cgroup_members_mask_valid:1; - bool cgroup_subtree_mask_valid:1; /* Reset cgroup accounting next time we fork something off */ bool reset_accounting:1; From b8b6f321044ab085358de91a1f72a7d86593dfda Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Nov 2018 01:13:47 +0100 Subject: [PATCH 28/29] cgroup: when we unload a unit, also update all its parent's members mask This way we can corectly ensure that when a unit that requires some controller goes away, we propagate the removal of it all the way up, so that the controller is turned off in all the parents too. --- src/core/unit.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/core/unit.c b/src/core/unit.c index a8c0f08e95..df340548bb 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -570,6 +570,14 @@ void unit_free(Unit *u) { if (!u) return; + if (UNIT_ISSET(u->slice)) { + /* A unit is being dropped from the tree, make sure our parent slice recalculates the member mask */ + unit_invalidate_cgroup_members_masks(UNIT_DEREF(u->slice)); + + /* And make sure the parent is realized again, updating cgroup memberships */ + unit_add_to_cgroup_realize_queue(UNIT_DEREF(u->slice)); + } + u->transient_file = safe_fclose(u->transient_file); if (!MANAGER_IS_RELOADING(u->manager)) From 43738e001e64ba0ec6522e0c35b67a006abf10e9 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Nov 2018 01:15:19 +0100 Subject: [PATCH 29/29] test: extend testcase to ensure controller membership doesn't regress --- test/TEST-19-DELEGATE/testsuite.sh | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/test/TEST-19-DELEGATE/testsuite.sh b/test/TEST-19-DELEGATE/testsuite.sh index c738bea10e..abfcee3cd2 100755 --- a/test/TEST-19-DELEGATE/testsuite.sh +++ b/test/TEST-19-DELEGATE/testsuite.sh @@ -11,10 +11,27 @@ if grep -q cgroup2 /proc/filesystems ; then -w /sys/fs/cgroup/system.slice/test0.service/cgroup.subtree_control systemd-run --wait --unit=test1.service -p "DynamicUser=1" -p "Delegate=memory pids" \ - grep memory /sys/fs/cgroup/system.slice/test1.service/cgroup.controllers + grep -q memory /sys/fs/cgroup/system.slice/test1.service/cgroup.controllers systemd-run --wait --unit=test2.service -p "DynamicUser=1" -p "Delegate=memory pids" \ - grep pids /sys/fs/cgroup/system.slice/test2.service/cgroup.controllers + grep -q pids /sys/fs/cgroup/system.slice/test2.service/cgroup.controllers + + # "io" is not among the controllers enabled by default for all units, verify that + grep -qv io /sys/fs/cgroup/system.slice/cgroup.controllers + + # Run a service with "io" enabled, and verify it works + systemd-run --wait --unit=test3.service -p "IOAccounting=yes" -p "Slice=system-foo-bar-baz.slice" \ + grep -q io /sys/fs/cgroup/system.slice/system-foo.slice/system-foo-bar.slice/system-foo-bar-baz.slice/test3.service/cgroup.controllers + + # We want to check if "io" is removed again from the controllers + # list. However, PID 1 (rightfully) does this asynchronously. In order + # to force synchronization on this, let's start a short-lived service + # which requires PID 1 to refresh the cgroup tree, so that we can + # verify that this all works. + systemd-run --wait --unit=test4.service true + + # And now check again, "io" should have vanished + grep -qv io /sys/fs/cgroup/system.slice/cgroup.controllers else echo "Skipping TEST-19-DELEGATE, as the kernel doesn't actually support cgroupsv2" >&2 fi