/* SPDX-License-Identifier: LGPL-2.1+ */ #include #include #include "cgroup-util.h" #include "fd-util.h" #include "format-util.h" #include "oomd-util.h" #include "parse-util.h" #include "path-util.h" #include "procfs-util.h" #include "signal-util.h" #include "sort-util.h" #include "stat-util.h" #include "stdio-util.h" DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( oomd_cgroup_ctx_hash_ops, char, string_hash_func, string_compare_func, OomdCGroupContext, oomd_cgroup_context_free); static int log_kill(pid_t pid, int sig, void *userdata) { log_debug("oomd attempting to kill " PID_FMT " with %s", pid, signal_to_string(sig)); return 0; } static int increment_oomd_xattr(const char *path, const char *xattr, uint64_t num_procs_killed) { _cleanup_free_ char *value = NULL; char buf[DECIMAL_STR_MAX(uint64_t) + 1]; uint64_t curr_count = 0; int r; assert(path); assert(xattr); r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, xattr, &value); if (r < 0 && r != -ENODATA) return r; if (!isempty(value)) { r = safe_atou64(value, &curr_count); if (r < 0) return r; } if (curr_count > UINT64_MAX - num_procs_killed) return -EOVERFLOW; xsprintf(buf, "%"PRIu64, curr_count + num_procs_killed); r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, path, xattr, buf, strlen(buf), 0); if (r < 0) return r; return 0; } OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) { if (!ctx) return NULL; free(ctx->path); return mfree(ctx); } int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) { _cleanup_set_free_ Set *targets = NULL; OomdCGroupContext *ctx; char *key; int r; assert(h); assert(ret); targets = set_new(NULL); if (!targets) return -ENOMEM; HASHMAP_FOREACH_KEY(ctx, key, h) { if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) { usec_t diff; if (ctx->last_hit_mem_pressure_limit == 0) ctx->last_hit_mem_pressure_limit = now(CLOCK_MONOTONIC); diff = now(CLOCK_MONOTONIC) - ctx->last_hit_mem_pressure_limit; if (diff >= duration) { r = set_put(targets, ctx); if (r < 0) return -ENOMEM; } } else ctx->last_hit_mem_pressure_limit = 0; } if (!set_isempty(targets)) { *ret = TAKE_PTR(targets); return 1; } *ret = NULL; return 0; } bool oomd_memory_reclaim(Hashmap *h) { uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0; OomdCGroupContext *ctx; assert(h); /* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values, * there was reclaim activity. Used along with pressure checks to decide whether to take action. */ HASHMAP_FOREACH(ctx, h) { uint64_t sum; sum = pgscan + ctx->pgscan; if (sum < pgscan || sum < ctx->pgscan) pgscan_of++; /* count overflows */ pgscan = sum; sum = last_pgscan + ctx->last_pgscan; if (sum < last_pgscan || sum < ctx->last_pgscan) last_pgscan_of++; /* count overflows */ last_pgscan = sum; } /* overflow counts are the same, return sums comparison */ if (last_pgscan_of == pgscan_of) return pgscan > last_pgscan; return pgscan_of > last_pgscan_of; } bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent) { uint64_t swap_threshold; assert(ctx); assert(threshold_percent <= 100); swap_threshold = ctx->swap_total * threshold_percent / ((uint64_t) 100); return (ctx->swap_total - ctx->swap_used) < swap_threshold; } int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret) { _cleanup_free_ OomdCGroupContext **sorted = NULL; OomdCGroupContext *item; size_t k = 0; assert(h); assert(compare_func); assert(ret); sorted = new0(OomdCGroupContext*, hashmap_size(h)); if (!sorted) return -ENOMEM; HASHMAP_FOREACH(item, h) { if (item->path && prefix && !path_startswith(item->path, prefix)) continue; sorted[k++] = item; } typesafe_qsort(sorted, k, compare_func); *ret = TAKE_PTR(sorted); assert(k <= INT_MAX); return (int) k; } int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) { _cleanup_set_free_ Set *pids_killed = NULL; int r; assert(path); if (dry_run) { _cleanup_free_ char *cg_path = NULL; r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &cg_path); if (r < 0) return r; log_debug("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse)); return 0; } pids_killed = set_new(NULL); if (!pids_killed) return -ENOMEM; if (recurse) r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); else r = cg_kill(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); if (r < 0) return r; r = increment_oomd_xattr(path, "user.systemd_oomd_kill", set_size(pids_killed)); if (r < 0) log_debug_errno(r, "Failed to set user.systemd_oomd_kill on kill: %m"); return set_size(pids_killed) != 0; } int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) { _cleanup_free_ OomdCGroupContext **sorted = NULL; int r; assert(h); r = oomd_sort_cgroup_contexts(h, compare_pgscan, prefix, &sorted); if (r < 0) return r; for (int i = 0; i < r; i++) { if (sorted[i]->pgscan == 0) break; r = oomd_cgroup_kill(sorted[i]->path, true, dry_run); if (r > 0 || r == -ENOMEM) break; } return r; } int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) { _cleanup_free_ OomdCGroupContext **sorted = NULL; int r; assert(h); r = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted); if (r < 0) return r; /* Try to kill cgroups with non-zero swap usage until we either succeed in * killing or we get to a cgroup with no swap usage. */ for (int i = 0; i < r; i++) { if (sorted[i]->swap_usage == 0) break; r = oomd_cgroup_kill(sorted[i]->path, true, dry_run); if (r > 0 || r == -ENOMEM) break; } return r; } int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) { _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; _cleanup_free_ char *p = NULL, *val = NULL; bool is_root; int r; assert(path); assert(ret); ctx = new0(OomdCGroupContext, 1); if (!ctx) return -ENOMEM; is_root = empty_or_root(path); r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p); if (r < 0) return log_debug_errno(r, "Error getting cgroup memory pressure path from %s: %m", path); r = read_resource_pressure(p, PRESSURE_TYPE_FULL, &ctx->memory_pressure); if (r < 0) return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p); if (is_root) { r = procfs_memory_get_used(&ctx->current_memory_usage); if (r < 0) return log_debug_errno(r, "Error getting memory used from procfs: %m"); } else { r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.current", &ctx->current_memory_usage); if (r < 0) return log_debug_errno(r, "Error getting memory.current from %s: %m", path); r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.min", &ctx->memory_min); if (r < 0) return log_debug_errno(r, "Error getting memory.min from %s: %m", path); r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.low", &ctx->memory_low); if (r < 0) return log_debug_errno(r, "Error getting memory.low from %s: %m", path); r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.swap.current", &ctx->swap_usage); if (r < 0) return log_debug_errno(r, "Error getting memory.swap.current from %s: %m", path); r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, path, "memory.stat", STRV_MAKE("pgscan"), &val); if (r < 0) return log_debug_errno(r, "Error getting pgscan from memory.stat under %s: %m", path); r = safe_atou64(val, &ctx->pgscan); if (r < 0) return log_debug_errno(r, "Error converting pgscan value to uint64_t: %m"); } ctx->path = strdup(empty_to_root(path)); if (!ctx->path) return -ENOMEM; *ret = TAKE_PTR(ctx); return 0; } int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret) { _cleanup_fclose_ FILE *f = NULL; OomdSystemContext ctx = {}; int r; assert(proc_swaps_path); assert(ret); f = fopen(proc_swaps_path, "re"); if (!f) return -errno; (void) fscanf(f, "%*s %*s %*s %*s %*s\n"); for (;;) { uint64_t total, used; r = fscanf(f, "%*s " /* device/file */ "%*s " /* type of swap */ "%" PRIu64 " " /* swap size */ "%" PRIu64 " " /* used */ "%*s\n", /* priority */ &total, &used); if (r == EOF && feof(f)) break; if (r != 2) { if (ferror(f)) return log_debug_errno(errno, "Error reading from %s: %m", proc_swaps_path); return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to parse values from %s: %m", proc_swaps_path); } ctx.swap_total += total * 1024U; ctx.swap_used += used * 1024U; } *ret = ctx; return 0; } int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) { _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *curr_ctx = NULL; OomdCGroupContext *old_ctx, *ctx; int r; assert(new_h); assert(path); r = oomd_cgroup_context_acquire(path, &curr_ctx); if (r < 0) return log_debug_errno(r, "Failed to get OomdCGroupContext for %s: %m", path); old_ctx = hashmap_get(old_h, path); if (old_ctx) { curr_ctx->last_pgscan = old_ctx->pgscan; curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit; curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit; } ctx = TAKE_PTR(curr_ctx); r = hashmap_put(new_h, ctx->path, ctx); if (r < 0) return r; return 0; } void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) { char swap[FORMAT_BYTES_MAX]; assert(ctx); assert(f); if (!empty_or_root(ctx->path)) fprintf(f, "%sPath: %s\n" "%s\tSwap Usage: %s\n", strempty(prefix), ctx->path, strempty(prefix), format_bytes(swap, sizeof(swap), ctx->swap_usage)); else fprintf(f, "%sPath: %s\n" "%s\tSwap Usage: (see System Context)\n", strempty(prefix), ctx->path, strempty(prefix)); } void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) { char tbuf[FORMAT_TIMESPAN_MAX], mem_use[FORMAT_BYTES_MAX]; char mem_min[FORMAT_BYTES_MAX], mem_low[FORMAT_BYTES_MAX]; assert(ctx); assert(f); fprintf(f, "%sPath: %s\n" "%s\tMemory Pressure Limit: %lu%%\n" "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n" "%s\tCurrent Memory Usage: %s\n", strempty(prefix), ctx->path, strempty(prefix), LOAD_INT(ctx->mem_pressure_limit), strempty(prefix), LOAD_INT(ctx->memory_pressure.avg10), LOAD_FRAC(ctx->memory_pressure.avg10), LOAD_INT(ctx->memory_pressure.avg60), LOAD_FRAC(ctx->memory_pressure.avg60), LOAD_INT(ctx->memory_pressure.avg300), LOAD_FRAC(ctx->memory_pressure.avg300), format_timespan(tbuf, sizeof(tbuf), ctx->memory_pressure.total, USEC_PER_SEC), strempty(prefix), format_bytes(mem_use, sizeof(mem_use), ctx->current_memory_usage)); if (!empty_or_root(ctx->path)) fprintf(f, "%s\tMemory Min: %s\n" "%s\tMemory Low: %s\n" "%s\tPgscan: %" PRIu64 "\n", strempty(prefix), format_bytes_cgroup_protection(mem_min, sizeof(mem_min), ctx->memory_min), strempty(prefix), format_bytes_cgroup_protection(mem_low, sizeof(mem_low), ctx->memory_low), strempty(prefix), ctx->pgscan); } void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix) { char used[FORMAT_BYTES_MAX], total[FORMAT_BYTES_MAX]; assert(ctx); assert(f); fprintf(f, "%sSwap: Used: %s Total: %s\n", strempty(prefix), format_bytes(used, sizeof(used), ctx->swap_used), format_bytes(total, sizeof(total), ctx->swap_total)); }