diff --git a/src/basic/memory-util.h b/src/basic/memory-util.h
index 9cb8ac3c10..46a6907a0c 100644
--- a/src/basic/memory-util.h
+++ b/src/basic/memory-util.h
@@ -11,6 +11,9 @@
 
 size_t page_size(void) _pure_;
 #define PAGE_ALIGN(l) ALIGN_TO((l), page_size())
+/* Rounds l down to a multiple of the page size. The mask is computed as uint64_t so that the upper bits of
+ * 64bit values are preserved even where size_t is only 32bit wide. */
+#define PAGE_ALIGN_DOWN(l) ((l) & ~((uint64_t) page_size() - 1))
 
 /* Normal memcpy requires src to be nonnull. We do nothing if n is 0. */
 static inline void memcpy_safe(void *dst, const void *src, size_t n) {
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index edd10fc31d..c67ecc37c5 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -208,7 +208,166 @@ void cgroup_context_done(CGroupContext *c) {
         cpu_set_reset(&c->cpuset_mems);
 }
 
-void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
+static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
+        _cleanup_free_ char *raw_kval = NULL;
+        uint64_t kval;
+        int r;
+
+        /* Reads the memory controller attribute 'file' from the unit's cgroup and returns it in *ret. The
+         * literal string "max" is reported as CGROUP_LIMIT_MAX. Fails with -EOWNERDEAD if the cgroup of the
+         * unit has not been realized. */
+
+        assert(u);
+
+        if (!u->cgroup_realized)
+                return -EOWNERDEAD;
+
+        r = cg_get_attribute("memory", u->cgroup_path, file, &raw_kval);
+        if (r < 0)
+                return r;
+
+        if (streq(raw_kval, "max")) {
+                *ret = CGROUP_LIMIT_MAX;
+                return 0;
+        }
+
+        r = safe_atou64(raw_kval, &kval);
+        if (r < 0)
+                return r;
+
+        *ret = kval;
+
+        return 0;
+}
+
+static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
+        CGroupContext *c;
+        CGroupMask m;
+        const char *file;
+        uint64_t unit_value;
+        int r;
+
+        /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will
+         * return -ENODATA) on cgroup v1.
+         *
+         * Returns:
+         *
+         * <0: On error.
+         *  0: If the kernel memory setting doesn't match our configuration.
+         * >0: If the kernel memory setting matches our configuration.
+         *
+         * The following values are only guaranteed to be populated on return >=0:
+         *
+         * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
+         * - ret_kernel_value will contain the actual value presented by the kernel. */
+
+        assert(u);
+
+        r = cg_all_unified();
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m");
+
+        /* Unsupported on v1.
+         *
+         * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has
+         * silently masked the controller. */
+        if (r == 0)
+                return -ENODATA;
+
+        /* The root slice doesn't have any controller files, so we can't compare anything. */
+        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+                return -ENODATA;
+
+        /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled,
+         * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To
+         * avoid specious errors in these scenarios, check that we even expect the memory controller to be
+         * enabled at all. */
+        m = unit_get_target_mask(u);
+        if (!FLAGS_SET(m, CGROUP_MASK_MEMORY))
+                return -ENODATA;
+
+        c = unit_get_cgroup_context(u);
+        assert(c);
+
+        if (streq(property_name, "MemoryLow")) {
+                unit_value = unit_get_ancestor_memory_low(u);
+                file = "memory.low";
+        } else if (streq(property_name, "MemoryMin")) {
+                unit_value = unit_get_ancestor_memory_min(u);
+                file = "memory.min";
+        } else if (streq(property_name, "MemoryHigh")) {
+                unit_value = c->memory_high;
+                file = "memory.high";
+        } else if (streq(property_name, "MemoryMax")) {
+                unit_value = c->memory_max;
+                file = "memory.max";
+        } else if (streq(property_name, "MemorySwapMax")) {
+                unit_value = c->memory_swap_max;
+                file = "memory.swap.max";
+        } else
+                return -EINVAL;
+
+        r = unit_get_kernel_memory_limit(u, file, ret_kernel_value);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file);
+
+        /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page
+         * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from
+         * our internal page-counting. To support those future kernels, just check the value itself first
+         * without any page-alignment. */
+        if (*ret_kernel_value == unit_value) {
+                *ret_unit_value = unit_value;
+                return 1;
+        }
+
+        /* The current kernel behaviour, by comparison, is that even if you write a particular number of
+         * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel
+         * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is
+         * cricket. */
+        if (unit_value != CGROUP_LIMIT_MAX)
+                unit_value = PAGE_ALIGN_DOWN(unit_value);
+
+        *ret_unit_value = unit_value;
+
+        return *ret_kernel_value == *ret_unit_value;
+}
+
+#define FORMAT_CGROUP_DIFF_MAX 128
+
+static char *format_cgroup_memory_limit_comparison(char *buf, size_t l, Unit *u, const char *property_name) {
+        uint64_t kval, sval;
+        int r;
+
+        /* Formats a suffix for the dump output that reports when the kernel's value for the memory property
+         * differs from the configured one, or when the kernel's value could not be determined. The string is
+         * written to buf (at most l bytes, including the trailing NUL) and buf is returned. */
+
+        assert(u);
+        assert(buf);
+        assert(l > 0);
+
+        r = unit_compare_memory_limit(u, property_name, &sval, &kval);
+
+        /* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1).
+         * In the absence of reliably being able to detect whether memcg swap support is available or not,
+         * only complain if the error is not ENOENT. */
+        if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) ||
+            (r == -ENOENT && streq(property_name, "MemorySwapMax"))) {
+                buf[0] = 0;
+                return buf;
+        }
+
+        if (r < 0) {
+                snprintf(buf, l, " (error getting kernel value: %s)", strerror_safe(r));
+                return buf;
+        }
+
+        snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval);
+
+        return buf;
+}
+
+void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
         _cleanup_free_ char *disable_controllers_str = NULL;
         _cleanup_free_ char *cpuset_cpus = NULL;
         _cleanup_free_ char *cpuset_mems = NULL;
@@ -218,14 +377,24 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
         CGroupBlockIODeviceBandwidth *b;
         CGroupBlockIODeviceWeight *w;
         CGroupDeviceAllow *a;
+        CGroupContext *c;
         IPAddressAccessItem *iaai;
         char **path;
-        char u[FORMAT_TIMESPAN_MAX];
+        char q[FORMAT_TIMESPAN_MAX];
         char v[FORMAT_TIMESPAN_MAX];
 
-        assert(c);
+        char cda[FORMAT_CGROUP_DIFF_MAX];
+        char cdb[FORMAT_CGROUP_DIFF_MAX];
+        char cdc[FORMAT_CGROUP_DIFF_MAX];
+        char cdd[FORMAT_CGROUP_DIFF_MAX];
+        char cde[FORMAT_CGROUP_DIFF_MAX];
+
+        assert(u);
         assert(f);
 
+        c = unit_get_cgroup_context(u);
+        assert(c);
+
         prefix = strempty(prefix);
 
         (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
@@ -234,36 +403,36 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
         cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
 
         fprintf(f,
-                "%sCPUAccounting=%s\n"
-                "%sIOAccounting=%s\n"
-                "%sBlockIOAccounting=%s\n"
-                "%sMemoryAccounting=%s\n"
-                "%sTasksAccounting=%s\n"
-                "%sIPAccounting=%s\n"
-                "%sCPUWeight=%" PRIu64 "\n"
-                "%sStartupCPUWeight=%" PRIu64 "\n"
-                "%sCPUShares=%" PRIu64 "\n"
-                "%sStartupCPUShares=%" PRIu64 "\n"
-                "%sCPUQuotaPerSecSec=%s\n"
-                "%sCPUQuotaPeriodSec=%s\n"
-                "%sAllowedCPUs=%s\n"
-                "%sAllowedMemoryNodes=%s\n"
-                "%sIOWeight=%" PRIu64 "\n"
-                "%sStartupIOWeight=%" PRIu64 "\n"
-                "%sBlockIOWeight=%" PRIu64 "\n"
-                "%sStartupBlockIOWeight=%" PRIu64 "\n"
-                "%sDefaultMemoryMin=%" PRIu64 "\n"
-                "%sDefaultMemoryLow=%" PRIu64 "\n"
-                "%sMemoryMin=%" PRIu64 "\n"
-                "%sMemoryLow=%" PRIu64 "\n"
-                "%sMemoryHigh=%" PRIu64 "\n"
-                "%sMemoryMax=%" PRIu64 "\n"
-                "%sMemorySwapMax=%" PRIu64 "\n"
-                "%sMemoryLimit=%" PRIu64 "\n"
-                "%sTasksMax=%" PRIu64 "\n"
-                "%sDevicePolicy=%s\n"
-                "%sDisableControllers=%s\n"
-                "%sDelegate=%s\n",
+                "%sCPUAccounting: %s\n"
+                "%sIOAccounting: %s\n"
+                "%sBlockIOAccounting: %s\n"
+                "%sMemoryAccounting: %s\n"
+                "%sTasksAccounting: %s\n"
+                "%sIPAccounting: %s\n"
+                "%sCPUWeight: %" PRIu64 "\n"
+                "%sStartupCPUWeight: %" PRIu64 "\n"
+                "%sCPUShares: %" PRIu64 "\n"
+                "%sStartupCPUShares: %" PRIu64 "\n"
+                "%sCPUQuotaPerSecSec: %s\n"
+                "%sCPUQuotaPeriodSec: %s\n"
+                "%sAllowedCPUs: %s\n"
+                "%sAllowedMemoryNodes: %s\n"
+                "%sIOWeight: %" PRIu64 "\n"
+                "%sStartupIOWeight: %" PRIu64 "\n"
+                "%sBlockIOWeight: %" PRIu64 "\n"
+                "%sStartupBlockIOWeight: %" PRIu64 "\n"
+                "%sDefaultMemoryMin: %" PRIu64 "\n"
+                "%sDefaultMemoryLow: %" PRIu64 "\n"
+                "%sMemoryMin: %" PRIu64 "%s\n"
+                "%sMemoryLow: %" PRIu64 "%s\n"
+                "%sMemoryHigh: %" PRIu64 "%s\n"
+                "%sMemoryMax: %" PRIu64 "%s\n"
+                "%sMemorySwapMax: %" PRIu64 "%s\n"
+                "%sMemoryLimit: %" PRIu64 "\n"
+                "%sTasksMax: %" PRIu64 "\n"
+                "%sDevicePolicy: %s\n"
+                "%sDisableControllers: %s\n"
+                "%sDelegate: %s\n",
                 prefix, yes_no(c->cpu_accounting),
                 prefix, yes_no(c->io_accounting),
                 prefix, yes_no(c->blockio_accounting),
@@ -274,7 +443,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                 prefix, c->startup_cpu_weight,
                 prefix, c->cpu_shares,
                 prefix, c->startup_cpu_shares,
-                prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
+                prefix, format_timespan(q, sizeof(q), c->cpu_quota_per_sec_usec, 1),
                 prefix, format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1),
                 prefix, cpuset_cpus,
                 prefix, cpuset_mems,
@@ -284,11 +453,11 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                 prefix, c->startup_blockio_weight,
                 prefix, c->default_memory_min,
                 prefix, c->default_memory_low,
-                prefix, c->memory_min,
-                prefix, c->memory_low,
-                prefix, c->memory_high,
-                prefix, c->memory_max,
-                prefix, c->memory_swap_max,
+                prefix, c->memory_min, format_cgroup_memory_limit_comparison(cda, sizeof(cda), u, "MemoryMin"),
+                prefix, c->memory_low, format_cgroup_memory_limit_comparison(cdb, sizeof(cdb), u, "MemoryLow"),
+                prefix, c->memory_high, format_cgroup_memory_limit_comparison(cdc, sizeof(cdc), u, "MemoryHigh"),
+                prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryMax"),
+                prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "MemorySwapMax"),
                 prefix, c->memory_limit,
                 prefix, c->tasks_max,
                 prefix, cgroup_device_policy_to_string(c->device_policy),
@@ -300,31 +469,31 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 
                 (void) cg_mask_to_string(c->delegate_controllers, &t);
 
-                fprintf(f, "%sDelegateControllers=%s\n",
+                fprintf(f, "%sDelegateControllers: %s\n",
                         prefix,
                         strempty(t));
         }
 
         LIST_FOREACH(device_allow, a, c->device_allow)
                 fprintf(f,
-                        "%sDeviceAllow=%s %s%s%s\n",
+                        "%sDeviceAllow: %s %s%s%s\n",
                         prefix,
                         a->path,
                         a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
 
         LIST_FOREACH(device_weights, iw, c->io_device_weights)
                 fprintf(f,
-                        "%sIODeviceWeight=%s %" PRIu64 "\n",
+                        "%sIODeviceWeight: %s %" PRIu64 "\n",
                         prefix,
                         iw->path,
                         iw->weight);
 
         LIST_FOREACH(device_latencies, l, c->io_device_latencies)
                 fprintf(f,
-                        "%sIODeviceLatencyTargetSec=%s %s\n",
+                        "%sIODeviceLatencyTargetSec: %s %s\n",
                         prefix,
                         l->path,
-                        format_timespan(u, sizeof(u), l->target_usec, 1));
+                        format_timespan(q, sizeof(q), l->target_usec, 1));
 
         LIST_FOREACH(device_limits, il, c->io_device_limits) {
                 char buf[FORMAT_BYTES_MAX];
@@ -333,7 +502,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                 for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                         if (il->limits[type] != cgroup_io_limit_defaults[type])
                                 fprintf(f,
-                                        "%s%s=%s %s\n",
+                                        "%s%s: %s %s\n",
                                         prefix,
                                         cgroup_io_limit_type_to_string(type),
                                         il->path,
@@ -342,7 +511,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 
         LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                 fprintf(f,
-                        "%sBlockIODeviceWeight=%s %" PRIu64,
+                        "%sBlockIODeviceWeight: %s %" PRIu64 "\n",
                         prefix,
                         w->path,
                         w->weight);
@@ -352,13 +521,13 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
 
                 if (b->rbps != CGROUP_LIMIT_MAX)
                         fprintf(f,
-                                "%sBlockIOReadBandwidth=%s %s\n",
+                                "%sBlockIOReadBandwidth: %s %s\n",
                                 prefix,
                                 b->path,
                                 format_bytes(buf, sizeof(buf), b->rbps));
                 if (b->wbps != CGROUP_LIMIT_MAX)
                         fprintf(f,
-                                "%sBlockIOWriteBandwidth=%s %s\n",
+                                "%sBlockIOWriteBandwidth: %s %s\n",
                                 prefix,
                                 b->path,
                                 format_bytes(buf, sizeof(buf), b->wbps));
@@ -368,21 +537,21 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                 _cleanup_free_ char *k = NULL;
 
                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
-                fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
+                fprintf(f, "%sIPAddressAllow: %s/%u\n", prefix, strnull(k), iaai->prefixlen);
         }
 
         LIST_FOREACH(items, iaai, c->ip_address_deny) {
                 _cleanup_free_ char *k = NULL;
 
                 (void) in_addr_to_string(iaai->family, &iaai->address, &k);
-                fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen);
+                fprintf(f, "%sIPAddressDeny: %s/%u\n", prefix, strnull(k), iaai->prefixlen);
         }
 
         STRV_FOREACH(path, c->ip_filters_ingress)
-                fprintf(f, "%sIPIngressFilterPath=%s\n", prefix, *path);
+                fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path);
 
         STRV_FOREACH(path, c->ip_filters_egress)
-                fprintf(f, "%sIPEgressFilterPath=%s\n", prefix, *path);
+                fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path);
 }
 
 int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) {
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
index bca53fb980..a66c702125 100644
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@@ -166,7 +166,7 @@ usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution,
 
 void cgroup_context_init(CGroupContext *c);
 void cgroup_context_done(CGroupContext *c);
-void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix);
+void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
 
 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
 void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w);
diff --git a/src/core/mount.c b/src/core/mount.c
index 8a0c8be120..09d08f3990 100644
--- a/src/core/mount.c
+++ b/src/core/mount.c
@@ -769,7 +769,7 @@ static void mount_dump(Unit *u, FILE *f, const char *prefix) {
 
         exec_context_dump(&m->exec_context, f, prefix);
         kill_context_dump(&m->kill_context, f, prefix);
-        cgroup_context_dump(&m->cgroup_context, f, prefix);
+        cgroup_context_dump(UNIT(m), f, prefix);
 }
 
 static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) {
diff --git a/src/core/scope.c b/src/core/scope.c
index bb1e60dd11..79470a0a9b 100644
--- a/src/core/scope.c
+++ b/src/core/scope.c
@@ -234,7 +234,7 @@ static void scope_dump(Unit *u, FILE *f, const char *prefix) {
                 prefix, scope_state_to_string(s->state),
                 prefix, scope_result_to_string(s->result));
 
-        cgroup_context_dump(&s->cgroup_context, f, prefix);
+        cgroup_context_dump(UNIT(s), f, prefix);
         kill_context_dump(&s->kill_context, f, prefix);
 }
 
diff --git a/src/core/service.c b/src/core/service.c
index 1c677b4355..ada25e634a 100644
--- a/src/core/service.c
+++ b/src/core/service.c
@@ -905,7 +905,7 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) {
                         prefix, s->n_fd_store_max,
                         prefix, s->n_fd_store);
 
-        cgroup_context_dump(&s->cgroup_context, f, prefix);
+        cgroup_context_dump(UNIT(s), f, prefix);
 }
 
 static int service_is_suitable_main_pid(Service *s, pid_t pid, int prio) {
diff --git a/src/core/slice.c b/src/core/slice.c
index 489d5ace6a..c12328b3b7 100644
--- a/src/core/slice.c
+++ b/src/core/slice.c
@@ -215,7 +215,7 @@ static void slice_dump(Unit *u, FILE *f, const char *prefix) {
                 "%sSlice State: %s\n",
                 prefix, slice_state_to_string(t->state));
 
-        cgroup_context_dump(&t->cgroup_context, f, prefix);
+        cgroup_context_dump(UNIT(t), f, prefix);
 }
 
 static int slice_start(Unit *u) {
diff --git a/src/core/socket.c b/src/core/socket.c
index 7696490f8a..f31d3bd971 100644
--- a/src/core/socket.c
+++ b/src/core/socket.c
@@ -843,7 +843,7 @@ static void socket_dump(Unit *u, FILE *f, const char *prefix) {
                 exec_command_dump_list(s->exec_command[c], f, prefix2);
         }
 
-        cgroup_context_dump(&s->cgroup_context, f, prefix);
+        cgroup_context_dump(UNIT(s), f, prefix);
 }
 
 static int instance_from_socket(int fd, unsigned nr, char **instance) {
diff --git a/src/core/swap.c b/src/core/swap.c
index 0924b35050..ad1da6dddb 100644
--- a/src/core/swap.c
+++ b/src/core/swap.c
@@ -621,7 +621,7 @@ static void swap_dump(Unit *u, FILE *f, const char *prefix) {
 
         exec_context_dump(&s->exec_context, f, prefix);
         kill_context_dump(&s->kill_context, f, prefix);
-        cgroup_context_dump(&s->cgroup_context, f, prefix);
+        cgroup_context_dump(UNIT(s), f, prefix);
 }
 
 static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) {