diff --git a/docs/TRANSIENT-SETTINGS.md b/docs/TRANSIENT-SETTINGS.md index f8ff413d28..f0dc2ee20f 100644 --- a/docs/TRANSIENT-SETTINGS.md +++ b/docs/TRANSIENT-SETTINGS.md @@ -270,6 +270,9 @@ All cgroup/resource control settings are available for transient units ✓ IPAccounting= ✓ IPAddressAllow= ✓ IPAddressDeny= +✓ ManagedOOMSwap= +✓ ManagedOOMMemoryPressure= +✓ ManagedOOMMemoryPressureLimitPercent= ``` ## Process Killing Settings diff --git a/man/oomctl.xml b/man/oomctl.xml new file mode 100644 index 0000000000..10633b92fc --- /dev/null +++ b/man/oomctl.xml @@ -0,0 +1,86 @@ + + + + + + + + oomctl + systemd + + + + oomctl + 1 + + + + oomctl + Analyze the state stored in systemd-oomd + + + + + oomctl + OPTIONS + COMMAND + + + + + Description + + oomctl may be used to get information about the various contexts read in by + the systemd1 userspace + out-of-memory (OOM) killer, + systemd-oomd8. + + + + + Commands + + The following commands are understood: + + + + dump + + Show the current state of the cgroup(s) and system context(s) stored by + systemd-oomd. + + + + + + + + Options + + The following options are understood: + + + + + + + + + + Exit status + + On success, 0 is returned, a non-zero failure code otherwise. + + + + See Also + + systemd1, + systemd-oomd.service8, + oomd.conf5 + + + + diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml new file mode 100644 index 0000000000..e6be947c5b --- /dev/null +++ b/man/oomd.conf.xml @@ -0,0 +1,88 @@ + + + + + + + oomd.conf + systemd + + + + oomd.conf + 5 + + + + oomd.conf + oomd.conf.d + Global systemd-oomd configuration files + + + + /etc/systemd/oomd.conf + /etc/systemd/oomd.conf.d/*.conf + /usr/lib/systemd/oomd.conf.d/*.conf + + + + Description + + These files configure the various parameters of the + systemd1 userspace + out-of-memory (OOM) killer, + systemd-oomd.service8. + See systemd.syntax7 + for a general description of the syntax. 
+ + + + + + + [OOM] Section Options + + The following options are available in the [OOM] section: + + + + SwapUsedLimitPercent= + + Sets the limit for swap usage on the system before systemd-oomd will + take action. If the percentage of swap used on the system is more than what is defined here, + systemd-oomd will act on eligible descendant cgroups, starting from the ones with the + highest swap usage to the lowest swap usage. Which cgroups are monitored and what + action gets taken depends on what the unit has configured for ManagedOOMSwap=. + Takes a percentage value between 0% and 100%, inclusive. Defaults to 90%. + + + + DefaultMemoryPressureLimitPercent= + + Sets the limit for memory pressure on the unit's cgroup before systemd-oomd + will take action. A unit can override this value with ManagedOOMMemoryPressureLimitPercent=. + The memory pressure for this property represents the fraction of time in a 10 second window in which all tasks + in the cgroup were delayed. For each monitored cgroup, if the memory pressure on that cgroup exceeds the + limit set for more than 30 seconds, systemd-oomd will act on eligible descendant cgroups, + starting from the ones with the most reclaim activity to the least reclaim activity. Which cgroups are + monitored and what action gets taken depends on what the unit has configured for + ManagedOOMMemoryPressure=. Takes a percentage value between 0% and 100%, inclusive. + Defaults to 60%. 
+ + + + + + + See Also + + systemd1, + systemd.resource-control5, + systemd-oomd.service8, + oomctl1 + + + + diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 02f7293288..3c0e5b6eb1 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2414,6 +2414,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { readonly as IPEgressFilterPath = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DisableControllers = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMSwap = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressureLimitPercent = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -2928,6 +2934,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + + + @@ -3478,6 +3490,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + + + @@ -4121,6 +4139,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { readonly as IPEgressFilterPath = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DisableControllers = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMSwap = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressureLimitPercent = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -4661,6 +4685,12 @@ node 
/org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + + + @@ -5211,6 +5241,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + + + @@ -5780,6 +5816,12 @@ node /org/freedesktop/systemd1/unit/home_2emount { readonly as IPEgressFilterPath = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DisableControllers = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMSwap = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressureLimitPercent = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -6250,6 +6292,12 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + + + @@ -6720,6 +6768,12 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + + + @@ -7404,6 +7458,12 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { readonly as IPEgressFilterPath = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DisableControllers = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMSwap = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressureLimitPercent = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -7860,6 +7920,12 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + + + + @@ -8316,6 +8382,12 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + + + + @@ -8859,6 +8931,12 @@ node 
/org/freedesktop/systemd1/unit/system_2eslice { readonly as IPEgressFilterPath = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DisableControllers = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMSwap = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressureLimitPercent = '...'; }; interface org.freedesktop.DBus.Peer { ... }; interface org.freedesktop.DBus.Introspectable { ... }; @@ -8989,6 +9067,12 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + + + @@ -9123,6 +9207,12 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + + + @@ -9276,6 +9366,12 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { readonly as IPEgressFilterPath = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly as DisableControllers = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMSwap = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressureLimitPercent = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s KillMode = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -9422,6 +9518,12 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + + + @@ -9582,6 +9684,12 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + + + diff --git a/man/rules/meson.build b/man/rules/meson.build index 00cd57420e..806561a412 100644 --- a/man/rules/meson.build +++ b/man/rules/meson.build @@ -45,6 +45,8 @@ manpages = [ ['nss-mymachines', '8', ['libnss_mymachines.so.2'], 'ENABLE_NSS_MYMACHINES'], ['nss-resolve', '8', 
['libnss_resolve.so.2'], 'ENABLE_NSS_RESOLVE'], ['nss-systemd', '8', ['libnss_systemd.so.2'], 'ENABLE_NSS_SYSTEMD'], + ['oomctl', '1', [], 'ENABLE_OOMD'], + ['oomd.conf', '5', ['oomd.conf.d'], 'ENABLE_OOMD'], ['org.freedesktop.LogControl1', '5', [], ''], ['org.freedesktop.home1', '5', [], 'ENABLE_HOMED'], ['org.freedesktop.hostname1', '5', [], 'ENABLE_HOSTNAMED'], @@ -907,6 +909,7 @@ manpages = [ ['systemd-networkd.service', '8', ['systemd-networkd'], 'ENABLE_NETWORKD'], ['systemd-notify', '1', [], ''], ['systemd-nspawn', '1', [], ''], + ['systemd-oomd.service', '8', ['systemd-oomd'], 'ENABLE_OOMD'], ['systemd-path', '1', [], ''], ['systemd-portabled.service', '8', ['systemd-portabled'], 'ENABLE_PORTABLED'], ['systemd-pstore.service', '8', ['systemd-pstore'], 'ENABLE_PSTORE'], diff --git a/man/systemd-oomd.service.xml b/man/systemd-oomd.service.xml new file mode 100644 index 0000000000..9d72373d1e --- /dev/null +++ b/man/systemd-oomd.service.xml @@ -0,0 +1,98 @@ + + + + + + + + systemd-oomd.service + systemd + + + + systemd-oomd.service + 8 + + + + systemd-oomd.service + systemd-oomd + A userspace out-of-memory (OOM) killer + + + + systemd-oomd.service + /usr/lib/systemd/systemd-oomd + + + + Description + + systemd-oomd is a system service that uses cgroups-v2 and pressure stall information (PSI) + to monitor and take action on processes before an OOM occurs in kernel space. + + You can enable monitoring and actions on units by setting ManagedOOMSwap= and/or + ManagedOOMMemoryPressure= to the appropriate value. systemd-oomd will + periodically poll enabled units' cgroup data to detect when corrective action needs to occur. When an action needs + to happen, it will only be performed on the descendant cgroups of the enabled units. More precisely, only cgroups with + memory.oom.group set to 1 and leaf cgroup nodes are eligible candidates. + Action will be taken recursively on all of the processes under the chosen candidate. 
+ + See + oomd.conf5 + for more information about the configuration of this service. + + + + Setup Information + + The system must be running systemd with a full unified cgroup hierarchy for the expected cgroups-v2 features. + Furthermore, resource accounting must be turned on for all units monitored by systemd-oomd. + The easiest way to turn on resource accounting is by ensuring the values for DefaultCPUAccounting, + DefaultIOAccounting, DefaultMemoryAccounting, and + DefaultTasksAccounting are set to true in + systemd-system.conf5. + + You will need a kernel compiled with PSI support. This is available in Linux 4.20 and above. + + The system must also have swap enabled for systemd-oomd to function correctly. With swap + enabled, the system spends enough time swapping pages to let systemd-oomd react. + Without swap, the system enters a livelocked state much more quickly and may prevent systemd-oomd + from responding in a reasonable amount of time. See + "In defence of swap: common misconceptions" + for more details on swap. + + Be aware that if you intend to enable monitoring and actions on user.slice, + user-$UID.slice, or their ancestor cgroups, it is highly recommended that your programs be + managed by the systemd user manager to prevent running too many processes under the same session scope (and thus + avoid a situation where memory intensive tasks trigger systemd-oomd to kill everything under the + cgroup). If you're using a desktop environment like GNOME, it already spawns many session components with the + systemd user manager. + + + + Usage Recommendations + + ManagedOOMSwap= works with the system-wide swap values, so setting it on the root slice + -.slice, and allowing all descendant cgroups to be eligible candidates may make the most + sense. + + ManagedOOMMemoryPressure= tends to work better on the cgroups below the root slice + -.slice. For units which tend to have processes that are less latency sensitive (e.g. 
+ system.slice), a higher limit like the default of 60% may be acceptable, as those processes + can usually ride out slowdowns caused by lack of memory without serious consequences. However, something like + user@$UID.service may prefer a much lower value like 40%. + + + + See Also + + systemd1, + systemd-system.conf5, + systemd.resource-control5, + oomd.conf5, + oomctl1 + + + diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index d72f9048e7..b40fa86145 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -869,6 +869,49 @@ DeviceAllow=/dev/loop-control + + + ManagedOOMSwap=auto|kill + ManagedOOMMemoryPressure=auto|kill + + + Specifies how + systemd-oomd.service8 + will act on this unit's cgroups. Defaults to auto. + + When set to kill, systemd-oomd will actively monitor this unit's + cgroup metrics to decide whether it needs to act. If the cgroup passes the limits set by + oomd.conf5 or its + overrides, systemd-oomd will send a SIGKILL to all of the processes + under the chosen candidate cgroup. Note that only descendant cgroups can be eligible candidates for killing; + the unit that set its property to kill is not a candidate (unless one of its ancestors set + their property to kill). You can find more details on candidates and kill behavior at + systemd-oomd.service8 + and oomd.conf5. Setting + either of these properties to kill will also automatically acquire + After= and Wants= dependencies on + systemd-oomd.service unless DefaultDependencies=no. + + + When set to auto, systemd-oomd will not actively use this cgroup's + data for monitoring and detection. However, if an ancestor cgroup has one of these properties set to + kill, a unit with auto can still be an eligible candidate for + systemd-oomd to act on. + + + + + ManagedOOMMemoryPressureLimitPercent= + + + Overrides the default memory pressure limit set by + oomd.conf5 for this unit + (cgroup). Takes a percentage value between 0% and 100%, inclusive. 
This property is ignored unless + ManagedOOMMemoryPressure=kill. Defaults to 0%, which means use the + default set by oomd.conf5. + + + @@ -1030,6 +1073,7 @@ DeviceAllow=/dev/loop-control systemd.exec5, systemd.directives7, systemd.special7, + systemd-oomd.service8, The documentation for control groups and specific controllers in the Linux kernel: Control Groups v2. diff --git a/meson.build b/meson.build index 04cb63d921..43cf7bf2bb 100644 --- a/meson.build +++ b/meson.build @@ -1412,6 +1412,9 @@ conf.set10('ENABLE_HOMED', have) have = have and conf.get('HAVE_PAM') == 1 conf.set10('ENABLE_PAM_HOME', have) +have = get_option('oomd') and get_option('mode') == 'developer' +conf.set10('ENABLE_OOMD', have) + want_remote = get_option('remote') if want_remote != 'false' have_deps = [conf.get('HAVE_MICROHTTPD') == 1, @@ -1451,6 +1454,7 @@ foreach term : ['analyze', 'networkd', 'nss-myhostname', 'nss-systemd', + 'oomd', 'portabled', 'pstore', 'quotacheck', @@ -1671,6 +1675,7 @@ subdir('src/analyze') subdir('src/journal-remote') subdir('src/coredump') subdir('src/pstore') +subdir('src/oom') subdir('src/hostname') subdir('src/import') subdir('src/partition') @@ -2730,6 +2735,27 @@ if conf.get('ENABLE_PSTORE') == 1 install_dir : rootlibexecdir) endif +if conf.get('ENABLE_OOMD') == 1 + executable('systemd-oomd', + systemd_oomd_sources, + include_directories : includes, + link_with : [libshared], + dependencies : [], + install_rpath : rootlibexecdir, + install : true, + install_dir : rootlibexecdir) + + public_programs += executable( + 'oomctl', + oomctl_sources, + include_directories : includes, + link_with : [libshared], + dependencies : [], + install_rpath : rootlibexecdir, + install : true, + install_dir : rootbindir) +endif + if conf.get('ENABLE_BINFMT') == 1 public_programs += executable( 'systemd-binfmt', @@ -3748,6 +3774,7 @@ foreach tuple : [ ['DNS-over-TLS(openssl)', conf.get('DNS_OVER_TLS_USE_OPENSSL') == 1], ['coredump'], ['pstore'], + ['oomd'], ['polkit'], ['legacy 
pkla', install_polkit_pkla], ['efi'], diff --git a/meson_options.txt b/meson_options.txt index d5ce647ae6..a6a0c1e4b8 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -97,6 +97,8 @@ option('coredump', type : 'boolean', description : 'install the coredump handler') option('pstore', type : 'boolean', description : 'install the pstore archival tool') +option('oomd', type : 'boolean', value : 'false', + description : 'install the userspace oom killer') option('logind', type : 'boolean', description : 'install the systemd-logind stack') option('hostnamed', type : 'boolean', diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index 6210347553..d2655673fd 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -1685,6 +1685,26 @@ int cg_get_attribute_as_uint64(const char *controller, const char *path, const c return 0; } +int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret) { + _cleanup_free_ char *value = NULL; + int r; + + assert(ret); + + r = cg_get_attribute(controller, path, attribute, &value); + if (r == -ENOENT) + return -ENODATA; + if (r < 0) + return r; + + r = parse_boolean(value); + if (r < 0) + return r; + + *ret = r; + return 0; +} + int cg_get_keyed_attribute_full( const char *controller, const char *path, @@ -2161,3 +2181,10 @@ CGroupMask get_cpu_accounting_mask(void) { bool cpu_accounting_is_cheap(void) { return get_cpu_accounting_mask() == 0; } + +static const char* const managed_oom_mode_table[_MANAGED_OOM_MODE_MAX] = { + [MANAGED_OOM_AUTO] = "auto", + [MANAGED_OOM_KILL] = "kill", +}; + +DEFINE_STRING_TABLE_LOOKUP(managed_oom_mode, ManagedOOMMode); diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h index 2b88571bc1..eda2b16a1b 100644 --- a/src/basic/cgroup-util.h +++ b/src/basic/cgroup-util.h @@ -208,6 +208,9 @@ static inline int cg_get_keyed_attribute_graceful( int cg_get_attribute_as_uint64(const char *controller, const char *path, const char *attribute, 
uint64_t *ret); +/* Does a parse_boolean() on the attribute contents and sets ret accordingly */ +int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret); + int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid); int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags); @@ -275,3 +278,13 @@ CGroupController cgroup_controller_from_string(const char *s) _pure_; bool is_cgroup_fs(const struct statfs *s); bool fd_is_cgroup_fs(int fd); + +typedef enum ManagedOOMMode { + MANAGED_OOM_AUTO, + MANAGED_OOM_KILL, + _MANAGED_OOM_MODE_MAX, + _MANAGED_OOM_MODE_INVALID = -1, +} ManagedOOMMode; + +const char* managed_oom_mode_to_string(ManagedOOMMode m) _const_; +ManagedOOMMode managed_oom_mode_from_string(const char *s) _pure_; diff --git a/src/basic/def.h b/src/basic/def.h index 970654a1ad..9f1f3c229c 100644 --- a/src/basic/def.h +++ b/src/basic/def.h @@ -63,3 +63,5 @@ .un.sun_family = AF_UNIX, \ .un.sun_path = "\0/org/freedesktop/plymouthd", \ } + +#define VARLINK_ADDR_PATH_MANAGED_OOM "/run/systemd/io.system.ManagedOOM" diff --git a/src/basic/linux/loadavg.h b/src/basic/linux/loadavg.h new file mode 100644 index 0000000000..521a787e8a --- /dev/null +++ b/src/basic/linux/loadavg.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_LOADAVG_H +#define _LINUX_SCHED_LOADAVG_H + +/* + * These are the constant used to fake the fixed-point load-average + * counting. Some notes: + * - 11 bit fractions expand to 22 bits by the multiplies: this gives + * a load-average precision of 10 bits integer + 11 bits fractional + * - if you want to count load-averages more often, you need more + * precision, or rounding will get you. With 2-second counting freq, + * the EXP_n values would be 1981, 2034 and 2043 if still using only + * 11 bit fractions. 
+ */ +extern unsigned long avenrun[]; /* Load averages */ +extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); + +#define FSHIFT 11 /* nr of bits of precision */ +#define FIXED_1 (1<= load) + newload += FIXED_1-1; + + return newload / FIXED_1; +} + +extern unsigned long calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n); + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + +extern void calc_global_load(unsigned long ticks); + +#endif /* _LINUX_SCHED_LOADAVG_H */ diff --git a/src/basic/meson.build b/src/basic/meson.build index d144bc2f87..09eb612c3a 100644 --- a/src/basic/meson.build +++ b/src/basic/meson.build @@ -111,6 +111,7 @@ basic_sources = files(''' linux/ipv6_route.h linux/l2tp.h linux/libc-compat.h + linux/loadavg.h linux/netdevice.h linux/netlink.h linux/rtnetlink.h diff --git a/src/basic/parse-util.c b/src/basic/parse-util.c index 818c9054d6..dca2ef9f92 100644 --- a/src/basic/parse-util.c +++ b/src/basic/parse-util.c @@ -862,3 +862,45 @@ int parse_oom_score_adjust(const char *s, int *ret) { *ret = v; return 0; } + +int store_loadavg_fixed_point(unsigned long i, unsigned long f, loadavg_t *ret) { + assert(ret); + + if (i > (~0UL >> FSHIFT)) /* the integer part must survive the left shift by FSHIFT below without wrapping */ + return -ERANGE; + + i = i << FSHIFT; + f = DIV_ROUND_UP((f << FSHIFT), 100); /* f is in hundredths; scale it to a FSHIFT-bit fraction, rounding up */ + + if (f >= FIXED_1) /* a fraction of 1.0 or more would spill into the integer bits */ + return -ERANGE; + + *ret = i | f; + return 0; +} + +int parse_loadavg_fixed_point(const char *s, loadavg_t *ret) { + const char *d, *f_str, *i_str; + unsigned long i, f; + int r; + + assert(s); + assert(ret); + + d = strchr(s, '.'); + if (!d) + return -EINVAL; + + i_str = strndupa(s, d - s); + f_str = d + 1; + + r = safe_atolu_full(i_str, 10, &i); + if (r < 0) + return r; + + r = safe_atolu_full(f_str, 10, &f); + if (r < 0) + return r; + + return store_loadavg_fixed_point(i, f, ret); +} diff --git a/src/basic/parse-util.h b/src/basic/parse-util.h index 2cee65c49a..f22a19c5c6 100644 ---
a/src/basic/parse-util.h +++ b/src/basic/parse-util.h @@ -3,12 +3,15 @@ #include #include +#include #include #include #include #include "macro.h" +typedef unsigned long loadavg_t; + int parse_boolean(const char *v) _pure_; int parse_dev(const char *s, dev_t *ret); int parse_pid(const char *s, pid_t* ret_pid); @@ -88,18 +91,18 @@ static inline int safe_atoux64(const char *s, uint64_t *ret) { } #if LONG_MAX == INT_MAX -static inline int safe_atolu(const char *s, unsigned long *ret_u) { +static inline int safe_atolu_full(const char *s, unsigned base, long unsigned *ret_u) { assert_cc(sizeof(unsigned long) == sizeof(unsigned)); - return safe_atou(s, (unsigned*) ret_u); + return safe_atou_full(s, base, (unsigned*) ret_u); } static inline int safe_atoli(const char *s, long int *ret_u) { assert_cc(sizeof(long int) == sizeof(int)); return safe_atoi(s, (int*) ret_u); } #else -static inline int safe_atolu(const char *s, unsigned long *ret_u) { +static inline int safe_atolu_full(const char *s, unsigned base, unsigned long *ret_u) { assert_cc(sizeof(unsigned long) == sizeof(unsigned long long)); - return safe_atollu(s, (unsigned long long*) ret_u); + return safe_atollu_full(s, base, (unsigned long long*) ret_u); } static inline int safe_atoli(const char *s, long int *ret_u) { assert_cc(sizeof(long int) == sizeof(long long int)); @@ -107,6 +110,10 @@ static inline int safe_atoli(const char *s, long int *ret_u) { } #endif +static inline int safe_atolu(const char *s, unsigned long *ret_u) { + return safe_atolu_full(s, 0, ret_u); +} + #if SIZE_MAX == UINT_MAX static inline int safe_atozu(const char *s, size_t *ret_u) { assert_cc(sizeof(size_t) == sizeof(unsigned)); @@ -137,3 +144,8 @@ int parse_ip_port_range(const char *s, uint16_t *low, uint16_t *high); int parse_ip_prefix_length(const char *s, int *ret); int parse_oom_score_adjust(const char *s, int *ret); + +/* Given a Linux load average (e.g. 
decimal number 34.89 where 34 is passed as i and 89 is passed as f), convert it + * to a loadavg_t. */ +int store_loadavg_fixed_point(unsigned long i, unsigned long f, loadavg_t *ret); +int parse_loadavg_fixed_point(const char *s, loadavg_t *ret); diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 211e4a5945..1958c1be2b 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -128,6 +128,9 @@ void cgroup_context_init(CGroupContext *c) { .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID, .tasks_max = TASKS_MAX_UNSET, + + .moom_swap = MANAGED_OOM_AUTO, + .moom_mem_pressure = MANAGED_OOM_AUTO, }; } @@ -411,7 +414,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { "%sTasksMax: %" PRIu64 "\n" "%sDevicePolicy: %s\n" "%sDisableControllers: %s\n" - "%sDelegate: %s\n", + "%sDelegate: %s\n" + "%sManagedOOMSwap: %s\n" + "%sManagedOOMMemoryPressure: %s\n" + "%sManagedOOMMemoryPressureLimitPercent: %d%%\n", prefix, yes_no(c->cpu_accounting), prefix, yes_no(c->io_accounting), prefix, yes_no(c->blockio_accounting), @@ -441,7 +447,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { prefix, tasks_max_resolve(&c->tasks_max), prefix, cgroup_device_policy_to_string(c->device_policy), prefix, strempty(disable_controllers_str), - prefix, yes_no(c->delegate)); + prefix, yes_no(c->delegate), + prefix, managed_oom_mode_to_string(c->moom_swap), + prefix, managed_oom_mode_to_string(c->moom_mem_pressure), + prefix, c->moom_mem_pressure_limit); if (c->delegate) { _cleanup_free_ char *t = NULL; @@ -2672,6 +2681,47 @@ static void unit_remove_from_cgroup_empty_queue(Unit *u) { u->in_cgroup_empty_queue = false; } +int unit_check_oomd_kill(Unit *u) { + _cleanup_free_ char *value = NULL; + bool increased; + uint64_t n = 0; + int r; + + if (!u->cgroup_path) + return 0; + + r = cg_all_unified(); + if (r < 0) + return log_unit_debug_errno(u, r, "Couldn't determine whether we are in all unified mode: %m"); + else if (r == 0) + return 0; + + r = 
cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.systemd_oomd_kill", &value); + if (r < 0 && r != -ENODATA) + return r; + + if (!isempty(value)) { + r = safe_atou64(value, &n); + if (r < 0) + return r; + } + + increased = n > u->managed_oom_kill_last; + u->managed_oom_kill_last = n; + + if (!increased) + return 0; + + if (n > 0) + log_struct(LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR, + LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n)); + + return 1; +} + int unit_check_oom(Unit *u) { _cleanup_free_ char *oom_kill = NULL; bool increased; diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 9ac5c8bfc0..881b3f3dfe 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -159,6 +159,11 @@ struct CGroupContext { /* Common */ TasksMax tasks_max; + + /* Settings for systemd-oomd */ + ManagedOOMMode moom_swap; + ManagedOOMMode moom_mem_pressure; + int moom_mem_pressure_limit; }; /* Used when querying IP accounting data */ @@ -224,6 +229,7 @@ int unit_watch_cgroup(Unit *u); int unit_watch_cgroup_memory(Unit *u); void unit_add_to_cgroup_empty_queue(Unit *u); +int unit_check_oomd_kill(Unit *u); int unit_check_oom(Unit *u); int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path); diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c index 4bb262bd93..18219097f4 100644 --- a/src/core/core-varlink.c +++ b/src/core/core-varlink.c @@ -2,6 +2,7 @@ #include "core-varlink.h" #include "mkdir.h" +#include "strv.h" #include "user-util.h" #include "varlink.h" @@ -15,6 +16,11 @@ typedef struct LookupParameters { const char *service; } LookupParameters; +static const char* const managed_oom_mode_properties[] = { + "ManagedOOMSwap", + "ManagedOOMMemoryPressure", +}; + static int build_user_json(const char *user_name, uid_t uid, JsonVariant **ret) { assert(user_name); assert(uid_is_valid(uid)); @@ -45,6 +51,150 @@ static bool 
user_match_lookup_parameters(LookupParameters *p, const char *name, return true; } +static int build_managed_oom_json_array_element(Unit *u, const char *property, JsonVariant **ret_v) { + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL; + CGroupContext *c; + const char *mode; + int r; + + assert(u); + assert(property); + assert(ret_v); + + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return -EOPNOTSUPP; + + c = unit_get_cgroup_context(u); + if (!c) + return -EINVAL; + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) + /* systemd-oomd should always treat inactive units as though they didn't enable any action since they + * should not have a valid cgroup */ + mode = managed_oom_mode_to_string(MANAGED_OOM_AUTO); + else if (streq(property, "ManagedOOMSwap")) + mode = managed_oom_mode_to_string(c->moom_swap); + else if (streq(property, "ManagedOOMMemoryPressure")) + mode = managed_oom_mode_to_string(c->moom_mem_pressure); + else + return -EINVAL; + + r = json_build(&v, JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("mode", JSON_BUILD_STRING(mode)), + JSON_BUILD_PAIR("path", JSON_BUILD_STRING(u->cgroup_path)), + JSON_BUILD_PAIR("property", JSON_BUILD_STRING(property)), + JSON_BUILD_PAIR("limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit)))); + + *ret_v = TAKE_PTR(v); + return r; +} + +int manager_varlink_send_managed_oom_update(Unit *u) { + _cleanup_(json_variant_unrefp) JsonVariant *arr = NULL, *v = NULL; + CGroupContext *c; + int r; + + assert(u); + + if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager || !u->manager->managed_oom_varlink_request || !u->cgroup_path) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY); + if (r < 0) + return r; + + for (size_t i = 0; i < ELEMENTSOF(managed_oom_mode_properties); i++) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[i], &e); + if (r < 0) + return r; + + r = 
json_variant_append_array(&arr, e); + if (r < 0) + return r; + } + + r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr)))); + if (r < 0) + return r; + + return varlink_notify(u->manager->managed_oom_varlink_request, v); +} + +static int vl_method_subscribe_managed_oom_cgroups( + Varlink *link, + JsonVariant *parameters, + VarlinkMethodFlags flags, + void *userdata) { + static const UnitType supported_unit_types[] = { UNIT_SLICE, UNIT_SERVICE, UNIT_SCOPE }; + _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *arr = NULL; + Manager *m = userdata; + int r; + + assert(link); + assert(m); + + if (json_variant_elements(parameters) > 0) + return varlink_error_invalid_parameter(link, parameters); + + /* We only take one subscriber for this method so return an error if there's already an existing one. + * This shouldn't happen since systemd-oomd is the only client of this method. */ + if (FLAGS_SET(flags, VARLINK_METHOD_MORE) && m->managed_oom_varlink_request) + return varlink_error(m->managed_oom_varlink_request, VARLINK_ERROR_SUBSCRIPTION_TAKEN, NULL); + + r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY); + if (r < 0) + return r; + + for (size_t i = 0; i < ELEMENTSOF(supported_unit_types); i++) { + Unit *u; + + LIST_FOREACH(units_by_type, u, m->units_by_type[supported_unit_types[i]]) { + CGroupContext *c; + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) + continue; + + c = unit_get_cgroup_context(u); + if (!c) + continue; + + for (size_t j = 0; j < ELEMENTSOF(managed_oom_mode_properties); j++) { + _cleanup_(json_variant_unrefp) JsonVariant *e = NULL; + + /* For the initial varlink call we only care about units that enabled (i.e. mode is not + * set to "auto") oomd properties. 
*/ + if (!(streq(managed_oom_mode_properties[j], "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) && + !(streq(managed_oom_mode_properties[j], "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL)) + continue; + + r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[j], &e); + if (r < 0) + return r; + + r = json_variant_append_array(&arr, e); + if (r < 0) + return r; + } + } + } + + r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr)))); + if (r < 0) + return r; + + if (!FLAGS_SET(flags, VARLINK_METHOD_MORE)) + return varlink_reply(link, v); + + m->managed_oom_varlink_request = varlink_ref(link); + return varlink_notify(m->managed_oom_varlink_request, v); +} + static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { static const JsonDispatch dispatch_table[] = { @@ -262,6 +412,17 @@ static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, Var return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL); } +static void vl_disconnect(VarlinkServer *s, Varlink *link, void *userdata) { + Manager *m = userdata; + + assert(m); + assert(s); + assert(link); + + if (link == m->managed_oom_varlink_request) + m->managed_oom_varlink_request = varlink_unref(link); +} + int manager_varlink_init(Manager *m) { _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL; int r; @@ -284,16 +445,25 @@ int manager_varlink_init(Manager *m) { s, "io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record, "io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record, - "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships); + "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships, + "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", vl_method_subscribe_managed_oom_cgroups); if (r < 0) return log_error_errno(r, "Failed to register varlink methods: %m"); + r = 
varlink_server_bind_disconnect(s, vl_disconnect); + if (r < 0) + return log_error_errno(r, "Failed to register varlink disconnect handler: %m"); + if (!MANAGER_IS_TEST_RUN(m)) { (void) mkdir_p_label("/run/systemd/userdb", 0755); r = varlink_server_listen_address(s, "/run/systemd/userdb/io.systemd.DynamicUser", 0666); if (r < 0) return log_error_errno(r, "Failed to bind to varlink socket: %m"); + + r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM, 0666); + if (r < 0) + return log_error_errno(r, "Failed to bind to varlink socket: %m"); } r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL); @@ -307,5 +477,11 @@ int manager_varlink_init(Manager *m) { void manager_varlink_done(Manager *m) { assert(m); + /* Send the final message if we still have a subscribe request open. */ + if (m->managed_oom_varlink_request) { + (void) varlink_error(m->managed_oom_varlink_request, VARLINK_ERROR_DISCONNECTED, NULL); + m->managed_oom_varlink_request = varlink_unref(m->managed_oom_varlink_request); + } + m->varlink_server = varlink_server_unref(m->varlink_server); } diff --git a/src/core/core-varlink.h b/src/core/core-varlink.h index 89818e2766..0b191ae6c4 100644 --- a/src/core/core-varlink.h +++ b/src/core/core-varlink.h @@ -5,3 +5,8 @@ int manager_varlink_init(Manager *m); void manager_varlink_done(Manager *m); + +/* The manager is expected to send an update to systemd-oomd if one of the following occurs: + * - The value of ManagedOOM*= properties change + * - A unit with ManagedOOM*= properties changes unit active state */ +int manager_varlink_send_managed_oom_update(Unit *u); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 9fdd3d83ca..584d974f1a 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -8,6 +8,7 @@ #include "bus-get-properties.h" #include "cgroup-util.h" #include "cgroup.h" +#include "core-varlink.h" #include "dbus-cgroup.h" #include "dbus-util.h" #include "errno-util.h" @@ -19,6 +20,7 @@ 
BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", TasksMax, tasks_max_resolve); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode); static int property_get_cgroup_mask( sd_bus *bus, @@ -391,6 +393,9 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("IPIngressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_ingress), 0), SD_BUS_PROPERTY("IPEgressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_egress), 0), SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0), + SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimitPercent", "s", bus_property_get_percent, offsetof(CGroupContext, moom_mem_pressure_limit), 0), SD_BUS_VTABLE_END }; @@ -1667,6 +1672,45 @@ int bus_cgroup_set_property( return 1; } + if (STR_IN_SET(name, "ManagedOOMSwap", "ManagedOOMMemoryPressure")) { + ManagedOOMMode *cgroup_mode = streq(name, "ManagedOOMSwap") ? 
&c->moom_swap : &c->moom_mem_pressure; + ManagedOOMMode m; + const char *mode; + + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name); + + r = sd_bus_message_read(message, "s", &mode); + if (r < 0) + return r; + + m = managed_oom_mode_from_string(mode); + if (m < 0) + return -EINVAL; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *cgroup_mode = m; + unit_write_settingf(u, flags, name, "%s=%s", name, mode); + } + + (void) manager_varlink_send_managed_oom_update(u); + return 1; + } + + if (streq(name, "ManagedOOMMemoryPressureLimitPercent")) { + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name); + + r = bus_set_transient_percent(u, name, &c->moom_mem_pressure_limit, message, flags, error); + if (r < 0) + return r; + + if (c->moom_mem_pressure == MANAGED_OOM_KILL) + (void) manager_varlink_send_managed_oom_update(u); + + return 1; + } + if (streq(name, "DisableControllers") || (u->transient && u->load_state == UNIT_STUB)) return bus_cgroup_set_transient_property(u, c, name, message, flags, error); diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c index 951450e53d..f534001a9c 100644 --- a/src/core/dbus-util.c +++ b/src/core/dbus-util.c @@ -91,6 +91,35 @@ int bus_set_transient_bool( return 1; } +int bus_set_transient_percent( + Unit *u, + const char *name, + int *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + const char *v; + int r; + + assert(p); + + r = sd_bus_message_read(message, "s", &v); + if (r < 0) + return r; + + r = parse_percent(v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = r; + unit_write_settingf(u, flags, name, "%s=%d%%", name, r); + } + + return 1; +} + int bus_set_transient_usec_internal( Unit *u, const char *name, diff --git a/src/core/dbus-util.h b/src/core/dbus-util.h index 654ceb5279..7781a425be 
100644 --- a/src/core/dbus-util.h +++ b/src/core/dbus-util.h @@ -240,6 +240,7 @@ int bus_set_transient_user_relaxed(Unit *u, const char *name, char **p, sd_bus_m int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_percent(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { return bus_set_transient_usec_internal(u, name, p, false, message, flags, error); diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index c60d565eb4..5b148779dc 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -224,6 +224,9 @@ $1.IPAddressAllow, config_parse_ip_address_access, 0, $1.IPAddressDeny, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_deny) $1.IPIngressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof($1, cgroup_context.ip_filters_ingress) $1.IPEgressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof($1, cgroup_context.ip_filters_egress) +$1.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_swap) +$1.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_mem_pressure) +$1.ManagedOOMMemoryPressureLimitPercent,config_parse_managed_oom_mem_pressure_limit,0, offsetof($1, 
cgroup_context) $1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0' )m4_dnl Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index df40119175..9361e93051 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -26,6 +26,7 @@ #include "capability-util.h" #include "cgroup-setup.h" #include "conf-parser.h" +#include "core-varlink.h" #include "cpu-set-util.h" #include "env-util.h" #include "errno-list.h" @@ -3812,6 +3813,86 @@ int config_parse_delegate( return 0; } +int config_parse_managed_oom_mode( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + Unit *u = userdata; + ManagedOOMMode *mode = data, m; + UnitType t; + + t = unit_name_to_type(unit); + assert(t != _UNIT_TYPE_INVALID); + + if (!unit_vtable[t]->can_set_managed_oom) + return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue); + + if (isempty(rvalue)) { + *mode = MANAGED_OOM_AUTO; + goto finish; + } + + m = managed_oom_mode_from_string(rvalue); + if (m < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", rvalue); + return 0; + } + *mode = m; + +finish: + (void) manager_varlink_send_managed_oom_update(u); + return 0; +} + +int config_parse_managed_oom_mem_pressure_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + Unit *u = userdata; + CGroupContext *c = data; + UnitType t; + int r; + + t = unit_name_to_type(unit); + assert(t != _UNIT_TYPE_INVALID); + + if (!unit_vtable[t]->can_set_managed_oom) + return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit 
type, ignoring.", lvalue); + + if (isempty(rvalue)) { + c->moom_mem_pressure_limit = 0; + goto finish; + } + + r = parse_percent(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse limit percent value, ignoring: %s", rvalue); + return 0; + } + + c->moom_mem_pressure_limit = r; + +finish: + /* Only update the limit if memory pressure detection is enabled because the information is irrelevant otherwise */ + if (c->moom_mem_pressure == MANAGED_OOM_KILL) + (void) manager_varlink_send_managed_oom_update(u); + return 0; +} + int config_parse_device_allow( const char *unit, const char *filename, diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index d67852a74d..fa4c1fb1a0 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -76,6 +76,8 @@ CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares); CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit); CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max); CONFIG_PARSER_PROTOTYPE(config_parse_delegate); +CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode); +CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit); CONFIG_PARSER_PROTOTYPE(config_parse_device_policy); CONFIG_PARSER_PROTOTYPE(config_parse_device_allow); CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency); diff --git a/src/core/manager.c b/src/core/manager.c index 6a8c4bf362..bf3a3c64f0 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -2578,6 +2578,11 @@ static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) { * We only do this for the cgroup the PID belonged to. */ (void) unit_check_oom(u1); + /* This only logs for now. In the future when the interface for kills/notifications + * is more stable we can extend service results table similar to how kernel oom kills + * are managed. 
*/ + (void) unit_check_oomd_kill(u1); + manager_invoke_sigchld_event(m, u1, &si); } if (u2) diff --git a/src/core/manager.h b/src/core/manager.h index 9e98b31c4b..073cc74a85 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -434,6 +434,8 @@ struct Manager { bool honor_device_enumeration; VarlinkServer *varlink_server; + /* Only systemd-oomd should be using this to subscribe to changes in ManagedOOM settings */ + Varlink *managed_oom_varlink_request; }; static inline usec_t manager_default_timeout_abort_usec(Manager *m) { diff --git a/src/core/scope.c b/src/core/scope.c index 42c51b0865..540c83ba45 100644 --- a/src/core/scope.c +++ b/src/core/scope.c @@ -621,6 +621,7 @@ const UnitVTable scope_vtable = { .can_delegate = true, .can_fail = true, .once_only = true, + .can_set_managed_oom = true, .init = scope_init, .load = scope_load, diff --git a/src/core/service.c b/src/core/service.c index d23384c475..9d834d4069 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -4533,6 +4533,7 @@ const UnitVTable service_vtable = { .can_transient = true, .can_delegate = true, .can_fail = true, + .can_set_managed_oom = true, .init = service_init, .done = service_done, diff --git a/src/core/slice.c b/src/core/slice.c index 49541aacab..36e5d6a40f 100644 --- a/src/core/slice.c +++ b/src/core/slice.c @@ -435,6 +435,7 @@ const UnitVTable slice_vtable = { .private_section = "Slice", .can_transient = true, + .can_set_managed_oom = true, .init = slice_init, .load = slice_load, diff --git a/src/core/unit.c b/src/core/unit.c index 1165d4ea8b..fd73ad2949 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -15,6 +15,7 @@ #include "bus-util.h" #include "cgroup-setup.h" #include "cgroup-util.h" +#include "core-varlink.h" #include "dbus-unit.h" #include "dbus.h" #include "dropin.h" @@ -1573,6 +1574,31 @@ static int unit_add_mount_dependencies(Unit *u) { return 0; } +static int unit_add_oomd_dependencies(Unit *u) { + CGroupContext *c; + bool wants_oomd; + int r; + + assert(u); + 
+ if (!u->default_dependencies) + return 0; + + c = unit_get_cgroup_context(u); + if (!c) + return 0; + + wants_oomd = (c->moom_swap == MANAGED_OOM_KILL || c->moom_mem_pressure == MANAGED_OOM_KILL); + if (!wants_oomd) + return 0; + + r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "systemd-oomd.service", true, UNIT_DEPENDENCY_FILE); + if (r < 0) + return r; + + return 0; +} + static int unit_add_startup_units(Unit *u) { CGroupContext *c; @@ -1633,6 +1659,10 @@ int unit_load(Unit *u) { if (r < 0) goto fail; + r = unit_add_oomd_dependencies(u); + if (r < 0) + goto fail; + r = unit_add_startup_units(u); if (r < 0) goto fail; @@ -2592,6 +2622,18 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, UnitNotifyFlag * the bus queue, so that any job change signal queued will force out the unit change signal first. */ unit_add_to_dbus_queue(u); + /* Update systemd-oomd on the property/state change */ + if (os != ns) { + /* Always send an update if the unit is going into an inactive state so systemd-oomd knows to stop + * monitoring. + * Also send an update whenever the unit goes active; this is to handle a case where an override file + * sets one of the ManagedOOM*= properties to "kill", then later removes it. systemd-oomd needs to + * know to stop monitoring when the unit changes from "kill" -> "auto" on daemon-reload, but we don't + * have the information on the property. Thus, indiscriminately send an update. 
*/ + if (UNIT_IS_INACTIVE_OR_FAILED(ns) || ns == UNIT_ACTIVE) + (void) manager_varlink_send_managed_oom_update(u); + } + /* Update timestamps for state changes */ if (!MANAGER_IS_RELOADING(m)) { dual_timestamp_get(&u->state_change_timestamp); @@ -3558,6 +3600,9 @@ int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) { if (u->cpu_usage_last != NSEC_INFINITY) (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last); + if (u->managed_oom_kill_last > 0) + (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, u->managed_oom_kill_last); + if (u->oom_kill_last > 0) (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last); @@ -3803,6 +3848,14 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { continue; + } else if (streq(l, "managed-oom-kill-last")) { + + r = safe_atou64(v, &u->managed_oom_kill_last); + if (r < 0) + log_unit_debug(u, "Failed to read managed OOM kill last %s, ignoring.", v); + + continue; + } else if (streq(l, "oom-kill-last")) { r = safe_atou64(v, &u->oom_kill_last); diff --git a/src/core/unit.h b/src/core/unit.h index 35873d57bc..1e6d7ccf6b 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -260,7 +260,10 @@ typedef struct Unit { nsec_t cpu_usage_base; nsec_t cpu_usage_last; /* the most recently read value */ - /* The current counter of the oom_kill field in the memory.events cgroup attribute */ + /* The current counter of processes sent SIGKILL by systemd-oomd */ + uint64_t managed_oom_kill_last; + + /* The current counter of the oom_kill field in the memory.events cgroup attribute */ uint64_t oom_kill_last; /* Where the io.stat data was at the time the unit was started */ @@ -625,6 +628,9 @@ typedef struct UnitVTable { /* True if queued jobs of this type should be GC'ed if no other job needs them anymore */ bool gc_jobs:1; + + /* True if systemd-oomd can monitor and act on this unit's recursive children's cgroup(s) */ + bool can_set_managed_oom:1; } UnitVTable; extern 
const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX]; diff --git a/src/oom/meson.build b/src/oom/meson.build new file mode 100644 index 0000000000..78c92deff3 --- /dev/null +++ b/src/oom/meson.build @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: LGPL-2.1+ + +systemd_oomd_sources = files(''' + oomd-manager-bus.c + oomd-manager-bus.h + oomd-manager.c + oomd-manager.h + oomd-util.c + oomd-util.h + oomd.c +'''.split()) + +oomctl_sources = files(''' + oomctl.c +'''.split()) + +if conf.get('ENABLE_OOMD') == 1 + tests += [ + [['src/oom/test-oomd-util.c', + 'src/oom/oomd-util.c', + 'src/oom/oomd-util.h'], + [], + []] + ] + + install_data('org.freedesktop.oom1.conf', + install_dir : dbuspolicydir) + + install_data('org.freedesktop.oom1.service', + install_dir : dbussystemservicedir) + + install_data('oomd.conf', + install_dir : pkgsysconfdir) +endif diff --git a/src/oom/oomctl.c b/src/oom/oomctl.c new file mode 100644 index 0000000000..01e43d3560 --- /dev/null +++ b/src/oom/oomctl.c @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include +#include + +#include "bus-error.h" +#include "copy.h" +#include "main-func.h" +#include "pretty-print.h" +#include "terminal-util.h" +#include "verbs.h" + +static PagerFlags arg_pager_flags = 0; + +static int help(int argc, char *argv[], void *userdata) { + _cleanup_free_ char *link = NULL; + int r; + + (void) pager_open(arg_pager_flags); + + r = terminal_urlify_man("oomctl", "1", &link); + if (r < 0) + return log_oom(); + + printf("%1$s [OPTIONS...] 
COMMAND ...\n\n" + "%2$sManage or inspect the userspace OOM killer.%3$s\n" + "\n%4$sCommands:%5$s\n" + " dump Output the current state of systemd-oomd\n" + "\n%4$sOptions:%5$s\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --no-pager Do not pipe output into a pager\n" + "\nSee the %6$s for details.\n" + , program_invocation_short_name + , ansi_highlight(), ansi_normal() + , ansi_underline(), ansi_normal() + , link + ); + + return 0; +} + +static int dump_state(int argc, char *argv[], void *userdata) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + int fd = -1; + int r; + + r = sd_bus_open_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to connect system bus: %m"); + + (void) pager_open(arg_pager_flags); + + r = sd_bus_call_method( + bus, + "org.freedesktop.oom1", + "/org/freedesktop/oom1", + "org.freedesktop.oom1.Manager", + "DumpByFileDescriptor", + &error, + &reply, + NULL); + if (r < 0) + return log_error_errno(r, "Failed to dump context: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "h", &fd); + if (r < 0) + return bus_log_parse_error(r); + + fflush(stdout); + return copy_bytes(fd, STDOUT_FILENO, (uint64_t) -1, 0); +} + +static int parse_argv(int argc, char *argv[]) { + enum { + ARG_VERSION = 0x100, + ARG_NO_PAGER, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + {} + }; + + int c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(0, NULL, NULL); + + case ARG_VERSION: + return version(); + + case ARG_NO_PAGER: + arg_pager_flags |= PAGER_DISABLE; + break; + + case '?': + return -EINVAL; + + default: + 
assert_not_reached("Invalid option passed."); + } + + return 1; +} + +static int run(int argc, char* argv[]) { + static const Verb verbs[] = { + { "help", VERB_ANY, VERB_ANY, 0, help }, + { "dump", VERB_ANY, 1, VERB_DEFAULT, dump_state }, + {} + }; + + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + return dispatch_verb(argc, argv, verbs, NULL); +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/oom/oomd-manager-bus.c b/src/oom/oomd-manager-bus.c new file mode 100644 index 0000000000..67c5fbf92f --- /dev/null +++ b/src/oom/oomd-manager-bus.c @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include + +#include "bus-common-errors.h" +#include "bus-polkit.h" +#include "fd-util.h" +#include "oomd-manager-bus.h" +#include "oomd-manager.h" +#include "user-util.h" + +static int bus_method_dump_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *dump = NULL; + _cleanup_close_ int fd = -1; + Manager *m = userdata; + int r; + + assert(message); + assert(m); + + r = manager_get_dump_string(m, &dump); + if (r < 0) + return r; + + fd = acquire_data_fd(dump, strlen(dump), 0); + if (fd < 0) + return fd; + + return sd_bus_reply_method_return(message, "h", fd); +} + +const sd_bus_vtable manager_vtable[] = { + SD_BUS_VTABLE_START(0), + SD_BUS_METHOD("DumpByFileDescriptor", NULL, "h", bus_method_dump_by_fd, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_VTABLE_END +}; diff --git a/src/oom/oomd-manager-bus.h b/src/oom/oomd-manager-bus.h new file mode 100644 index 0000000000..60ccf3b373 --- /dev/null +++ b/src/oom/oomd-manager-bus.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "sd-bus.h" + +typedef struct Manager Manager; + +extern const sd_bus_vtable manager_vtable[]; diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c new file mode 100644 index 0000000000..49b57a86a4 --- /dev/null +++ 
b/src/oom/oomd-manager.c @@ -0,0 +1,549 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "bus-log-control-api.h" +#include "bus-util.h" +#include "bus-polkit.h" +#include "cgroup-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "oomd-manager-bus.h" +#include "oomd-manager.h" +#include "path-util.h" + +typedef struct ManagedOOMReply { + ManagedOOMMode mode; + char *path; + char *property; + /* Filled in by json_dispatch_unsigned(), which stores a uintmax_t through the field pointer; + * a narrower field would be overrun by that store. */ + uintmax_t limit; +} ManagedOOMReply; + +static void managed_oom_reply_destroy(ManagedOOMReply *reply) { + assert(reply); + free(reply->path); + free(reply->property); +} + +static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + ManagedOOMMode *mode = userdata, m; + const char *s; + + assert(mode); + assert_se(s = json_variant_string(v)); + + m = managed_oom_mode_from_string(s); + if (m < 0) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), "%s is not a valid ManagedOOMMode", s); + + *mode = m; + return 0; +} + +static int process_managed_oom_reply( + Varlink *link, + JsonVariant *parameters, + const char *error_id, + VarlinkReplyFlags flags, + void *userdata) { + JsonVariant *c, *cgroups; + Manager *m = userdata; + int r = 0; + + assert(m); + + static const JsonDispatch dispatch_table[] = { + { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMReply, mode), JSON_MANDATORY }, + { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, path), JSON_MANDATORY }, + { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, property), JSON_MANDATORY }, + { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_unsigned, offsetof(ManagedOOMReply, limit), 0 }, + {}, + }; + + if (error_id) { + r = -EIO; + log_debug("Error getting ManagedOOM cgroups: %s", error_id); + goto finish; + } + + cgroups = json_variant_by_key(parameters, "cgroups"); + if (!cgroups) { + r = -EINVAL; + goto finish; + } + + /* Skip malformed elements and keep processing in case the others are 
good */ + JSON_VARIANT_ARRAY_FOREACH(c, cgroups) { + _cleanup_(managed_oom_reply_destroy) ManagedOOMReply reply = {}; + OomdCGroupContext *ctx; + Hashmap *monitor_hm; + loadavg_t limit; + int ret; + + if (!json_variant_is_object(c)) + continue; + + ret = json_dispatch(c, dispatch_table, NULL, 0, &reply); + if (ret == -ENOMEM) { + r = ret; + goto finish; + } else if (ret < 0) + continue; + + monitor_hm = streq(reply.property, "ManagedOOMSwap") ? + m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts; + + if (reply.mode == MANAGED_OOM_AUTO) { + (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, reply.path)); + continue; + } + + limit = m->default_mem_pressure_limit; + + if (streq(reply.property, "ManagedOOMMemoryPressure")) { + if (reply.limit > 100) + continue; + else if (reply.limit != 0) { + ret = store_loadavg_fixed_point((unsigned long) reply.limit, 0, &limit); + if (ret < 0) + continue; + } + } + + ret = oomd_insert_cgroup_context(NULL, monitor_hm, reply.path); + if (ret == -ENOMEM) { + r = ret; + goto finish; + } + + /* Always update the limit in case it was changed. For non-memory pressure detection the value is + * ignored so always updating it here is not a problem. */ + ctx = hashmap_get(monitor_hm, reply.path); + if (ctx) + ctx->mem_pressure_limit = limit; + } + +finish: + if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES)) + m->varlink = varlink_close_unref(link); + + return r; +} + +/* Fill `new_h` with `path`'s descendent OomdCGroupContexts. Only include descendent cgroups that are possible + * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1". + * + * This function ignores most errors in order to handle cgroups that may have been cleaned up while populating + * the hashmap. 
+ * + * `new_h` is of the form { key: cgroup paths -> value: OomdCGroupContext } */ +static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) { + _cleanup_free_ char *subpath = NULL; + _cleanup_closedir_ DIR *d = NULL; + int r; + + assert(new_h); + assert(path); + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); + if (r < 0) + return r; + + r = cg_read_subgroup(d, &subpath); + if (r < 0) + return r; + else if (r == 0) { /* No subgroups? We're a leaf node */ + r = oomd_insert_cgroup_context(NULL, new_h, path); + return (r == -ENOMEM) ? r : 0; + } + + do { + _cleanup_free_ char *cg_path = NULL; + bool oom_group; + + cg_path = path_join(empty_to_root(path), subpath); + if (!cg_path) + return -ENOMEM; + + subpath = mfree(subpath); + + r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group); + /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */ + if (r < 0) + return (r == -ENOMEM) ? r : 0; + + if (oom_group) { + r = oomd_insert_cgroup_context(NULL, new_h, cg_path); + if (r == -ENOMEM) + return r; + } else { + r = recursively_get_cgroup_context(new_h, cg_path); + if (r == -ENOMEM) + return r; + } + } while ((r = cg_read_subgroup(d, &subpath)) > 0); + + return 0; +} + +static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) { + _cleanup_hashmap_free_ Hashmap *new_base = NULL; + OomdCGroupContext *ctx; + int r; + + assert(monitored_cgroups); + + new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!new_base) + return -ENOMEM; + + HASHMAP_FOREACH(ctx, *monitored_cgroups) { + /* Skip most errors since the cgroup we're trying to update might not exist anymore. 
*/ + r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path); + if (r == -ENOMEM) + return r; + } + + hashmap_free(*monitored_cgroups); + *monitored_cgroups = TAKE_PTR(new_base); + + return 0; +} + +static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) { + _cleanup_hashmap_free_ Hashmap *candidates = NULL; + OomdCGroupContext *ctx; + int r; + + assert(monitored_cgroups); + assert(ret_candidates); + + candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!candidates) + return -ENOMEM; + + HASHMAP_FOREACH(ctx, monitored_cgroups) { + r = recursively_get_cgroup_context(candidates, ctx->path); + if (r == -ENOMEM) + return r; + } + + *ret_candidates = TAKE_PTR(candidates); + + return 0; +} + +static int acquire_managed_oom_connect(Manager *m) { + _cleanup_(varlink_close_unrefp) Varlink *link = NULL; + int r; + + assert(m); + assert(m->event); + + r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM); + if (r < 0) + return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM); + + (void) varlink_set_userdata(link, m); + (void) varlink_set_description(link, "oomd"); + (void) varlink_set_relative_timeout(link, USEC_INFINITY); + + r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach varlink connection to event loop: %m"); + + r = varlink_bind_reply(link, process_managed_oom_reply); + if (r < 0) + return log_error_errno(r, "Failed to bind reply callback: %m"); + + r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL); + if (r < 0) + return log_error_errno(r, "Failed to observe varlink call: %m"); + + m->varlink = TAKE_PTR(link); + return 0; +} + +static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { + _cleanup_set_free_ Set *targets = NULL; + Manager *m = userdata; + usec_t usec_now; + int r; + + assert(s); + 
assert(userdata); + + /* Reset timer */ + r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); + if (r < 0) + return log_error_errno(r, "Failed to reset event timer"); + + r = sd_event_source_set_time_relative(s, INTERVAL_USEC); + if (r < 0) + return log_error_errno(r, "Failed to set relative time for timer"); + + /* Reconnect if our connection dropped */ + if (!m->varlink) { + r = acquire_managed_oom_connect(m); + if (r < 0) + return log_error_errno(r, "Failed to acquire varlink connection"); + } + + /* Update the cgroups used for detection/action */ + r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts); + if (r == -ENOMEM) + return log_error_errno(r, "Failed to update monitored swap cgroup contexts"); + + r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts); + if (r == -ENOMEM) + return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts"); + + r = oomd_system_context_acquire("/proc/swaps", &m->system_context); + /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM */ + if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts))) + return log_error_errno(r, "Failed to acquire system context"); + + /* If we're still recovering from a kill, don't try to kill again yet */ + if (m->post_action_delay_start > 0) { + if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now) + return 0; + else + m->post_action_delay_start = 0; + } + + r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, PRESSURE_DURATION_USEC, &targets); + if (r == -ENOMEM) + return log_error_errno(r, "Failed to check if memory pressure exceeded limits"); + else if (r == 1) { + /* Check if there was reclaim activity in the last interval. The concern is the following case: + * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending + * cgroup. 
Even after this, well-behaved processes will fault in recently resident pages and + * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need + * to kill something (it won't help anyways). */ + if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts)) { + _cleanup_hashmap_free_ Hashmap *candidates = NULL; + OomdCGroupContext *t; + + r = get_monitored_cgroup_contexts_candidates(m->monitored_mem_pressure_cgroup_contexts, &candidates); + if (r == -ENOMEM) + return log_error_errno(r, "Failed to get monitored memory pressure cgroup candidates"); + + SET_FOREACH(t, targets) { + log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64" seconds and there was reclaim activity", + t->path, LOAD_INT(t->mem_pressure_limit), PRESSURE_DURATION_USEC / USEC_PER_SEC); + + r = oomd_kill_by_pgscan(candidates, t->path, m->dry_run); + if (r == -ENOMEM) + return log_error_errno(r, "Failed to kill cgroup processes by pgscan"); + if (r < 0) + log_info("Failed to kill any cgroup(s) under %s based on pressure", t->path); + else { + /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */ + m->post_action_delay_start = usec_now; + return 0; + } + } + } + } + + if (oomd_swap_free_below(&m->system_context, (100 - m->swap_used_limit))) { + _cleanup_hashmap_free_ Hashmap *candidates = NULL; + + log_notice("Swap used (%"PRIu64") / total (%"PRIu64") is more than %u%%", + m->system_context.swap_used, m->system_context.swap_total, m->swap_used_limit); + + r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates); + if (r == -ENOMEM) + return log_error_errno(r, "Failed to get monitored swap cgroup candidates"); + + r = oomd_kill_by_swap_usage(candidates, m->dry_run); + if (r == -ENOMEM) + return log_error_errno(r, "Failed to kill cgroup processes by swap usage"); + if (r < 0) + log_info("Failed to kill any cgroup(s) based on swap"); + else { + m->post_action_delay_start 
= usec_now; + return 0; + } + } + + return 0; +} + +static int monitor_cgroup_contexts(Manager *m) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + int r; + + assert(m); + assert(m->event); + + r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m); + if (r < 0) + return r; + + r = sd_event_source_set_exit_on_failure(s, true); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(s, SD_EVENT_ON); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s, "oomd-timer"); + + m->cgroup_context_event_source = TAKE_PTR(s); + return 0; +} + +void manager_free(Manager *m) { + assert(m); + + varlink_close_unref(m->varlink); + sd_event_source_unref(m->cgroup_context_event_source); + sd_event_unref(m->event); + + bus_verify_polkit_async_registry_free(m->polkit_registry); + sd_bus_flush_close_unref(m->bus); + + hashmap_free(m->monitored_swap_cgroup_contexts); + hashmap_free(m->monitored_mem_pressure_cgroup_contexts); + + free(m); +} + +int manager_new(Manager **ret) { + _cleanup_(manager_freep) Manager *m = NULL; + int r; + + assert(ret); + + m = new0(Manager, 1); + if (!m) + return -ENOMEM; + + r = sd_event_default(&m->event); + if (r < 0) + return r; + + (void) sd_event_set_watchdog(m->event, true); + + r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL); + if (r < 0) + return r; + + r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL); + if (r < 0) + return r; + + m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!m->monitored_swap_cgroup_contexts) + return -ENOMEM; + + m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops); + if (!m->monitored_mem_pressure_cgroup_contexts) + return -ENOMEM; + + *ret = TAKE_PTR(m); + return 0; +} + +static int manager_connect_bus(Manager *m) { + int r; + + assert(m); + assert(!m->bus); + + r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom"); + if (r < 0) + return 
log_error_errno(r, "Failed to connect to bus: %m"); + + r = sd_bus_add_object_vtable(m->bus, NULL, "/org/freedesktop/oom1", "org.freedesktop.oom1.Manager", manager_vtable, m); + if (r < 0) + return log_error_errno(r, "Failed to add manager object vtable: %m"); + + r = bus_log_control_api_register(m->bus); + if (r < 0) + return r; + + r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to request name: %m"); + + r = sd_bus_attach_event(m->bus, m->event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + + return 0; +} + +int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit) { + unsigned long l; + int r; + + assert(m); + + m->dry_run = dry_run; + + m->swap_used_limit = swap_used_limit != -1 ? swap_used_limit : DEFAULT_SWAP_USED_LIMIT; + assert(m->swap_used_limit <= 100); + + l = mem_pressure_limit != -1 ? mem_pressure_limit : DEFAULT_MEM_PRESSURE_LIMIT; + r = store_loadavg_fixed_point(l, 0, &m->default_mem_pressure_limit); + if (r < 0) + return r; + + r = manager_connect_bus(m); + if (r < 0) + return r; + + r = acquire_managed_oom_connect(m); + if (r < 0) + return r; + + r = monitor_cgroup_contexts(m); + if (r < 0) + return r; + + return 0; +} + +int manager_get_dump_string(Manager *m, char **ret) { + _cleanup_free_ char *dump = NULL; + _cleanup_fclose_ FILE *f = NULL; + OomdCGroupContext *c; + size_t size; + char *key; + int r; + + assert(m); + assert(ret); + + f = open_memstream_unlocked(&dump, &size); + if (!f) + return -errno; + + fprintf(f, + "Dry Run: %s\n" + "Swap Used Limit: %u%%\n" + "Default Memory Pressure Limit: %lu%%\n" + "System Context:\n", + yes_no(m->dry_run), + m->swap_used_limit, + LOAD_INT(m->default_mem_pressure_limit)); + oomd_dump_system_context(&m->system_context, f, "\t"); + + fprintf(f, "Swap Monitored CGroups:\n"); + HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts) + 
oomd_dump_swap_cgroup_context(c, f, "\t"); + + fprintf(f, "Memory Pressure Monitored CGroups:\n"); + HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts) + oomd_dump_memory_pressure_cgroup_context(c, f, "\t"); + + r = fflush_and_check(f); + if (r < 0) + return r; + + f = safe_fclose(f); + + *ret = TAKE_PTR(dump); + return 0; +} diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h new file mode 100644 index 0000000000..b5c249799b --- /dev/null +++ b/src/oom/oomd-manager.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "conf-parser.h" +#include "oomd-util.h" +#include "sd-event.h" +#include "varlink.h" + +/* Polling interval for monitoring stats */ +#define INTERVAL_USEC (1 * USEC_PER_SEC) + +/* Used to weight the averages */ +#define AVERAGE_SIZE_DECAY 4 + +/* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the + * percentage of time all tasks were delayed (i.e. unproductive). + * Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in + * system.slice are assumed to be less latency sensitive. */ +#define PRESSURE_DURATION_USEC (30 * USEC_PER_SEC) +#define DEFAULT_MEM_PRESSURE_LIMIT 60 +#define DEFAULT_SWAP_USED_LIMIT 90 + +#define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC) + +typedef struct Manager Manager; + +struct Manager { + sd_bus *bus; + sd_event *event; + + Hashmap *polkit_registry; + + bool dry_run; + unsigned swap_used_limit; + loadavg_t default_mem_pressure_limit; + + /* k: cgroup paths -> v: OomdCGroupContext + * Used to detect when to take action. 
*/ + Hashmap *monitored_swap_cgroup_contexts; + Hashmap *monitored_mem_pressure_cgroup_contexts; + + OomdSystemContext system_context; + + usec_t post_action_delay_start; + + sd_event_source *cgroup_context_event_source; + + Varlink *varlink; +}; + +void manager_free(Manager *m); +DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + +int manager_new(Manager **ret); + +int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit); + +int manager_get_dump_string(Manager *m, char **ret); + +CONFIG_PARSER_PROTOTYPE(config_parse_oomd_default); diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c new file mode 100644 index 0000000000..6cd4ba4f93 --- /dev/null +++ b/src/oom/oomd-util.c @@ -0,0 +1,451 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include +#include + +#include "cgroup-util.h" +#include "fd-util.h" +#include "format-util.h" +#include "oomd-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "procfs-util.h" +#include "signal-util.h" +#include "sort-util.h" +#include "stat-util.h" +#include "stdio-util.h" + +DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + oomd_cgroup_ctx_hash_ops, + char, + string_hash_func, + string_compare_func, + OomdCGroupContext, + oomd_cgroup_context_free); + +static int log_kill(pid_t pid, int sig, void *userdata) { + log_debug("oomd attempting to kill " PID_FMT " with %s", pid, signal_to_string(sig)); + return 0; +} + +static int increment_oomd_xattr(const char *path, const char *xattr, uint64_t num_procs_killed) { + _cleanup_free_ char *value = NULL; + char buf[DECIMAL_STR_MAX(uint64_t) + 1]; + uint64_t curr_count = 0; + int r; + + assert(path); + assert(xattr); + + r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, xattr, &value); + if (r < 0 && r != -ENODATA) + return r; + + if (!isempty(value)) { + r = safe_atou64(value, &curr_count); + if (r < 0) + return r; + } + + if (curr_count > UINT64_MAX - num_procs_killed) + return -EOVERFLOW; + + xsprintf(buf, "%"PRIu64, curr_count + 
num_procs_killed); + r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, path, xattr, buf, strlen(buf), 0); + if (r < 0) + return r; + + return 0; +} + +OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) { + if (!ctx) + return NULL; + + free(ctx->path); + return mfree(ctx); +} + +int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) { + _cleanup_set_free_ Set *targets = NULL; + OomdCGroupContext *ctx; + char *key; + int r; + + assert(h); + assert(ret); + + targets = set_new(NULL); + if (!targets) + return -ENOMEM; + + HASHMAP_FOREACH_KEY(ctx, key, h) { + if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) { + usec_t diff; + + if (ctx->last_hit_mem_pressure_limit == 0) + ctx->last_hit_mem_pressure_limit = now(CLOCK_MONOTONIC); + + diff = now(CLOCK_MONOTONIC) - ctx->last_hit_mem_pressure_limit; + if (diff >= duration) { + r = set_put(targets, ctx); + if (r < 0) + return -ENOMEM; + } + } else + ctx->last_hit_mem_pressure_limit = 0; + } + + if (!set_isempty(targets)) { + *ret = TAKE_PTR(targets); + return 1; + } + + *ret = NULL; + return 0; +} + +bool oomd_memory_reclaim(Hashmap *h) { + uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0; + OomdCGroupContext *ctx; + + assert(h); + + /* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values, + * there was reclaim activity. Used along with pressure checks to decide whether to take action. 
*/ + + HASHMAP_FOREACH(ctx, h) { + uint64_t sum; + + sum = pgscan + ctx->pgscan; + if (sum < pgscan || sum < ctx->pgscan) + pgscan_of++; /* count overflows */ + pgscan = sum; + + sum = last_pgscan + ctx->last_pgscan; + if (sum < last_pgscan || sum < ctx->last_pgscan) + last_pgscan_of++; /* count overflows */ + last_pgscan = sum; + } + + /* overflow counts are the same, return sums comparison */ + if (last_pgscan_of == pgscan_of) + return pgscan > last_pgscan; + + return pgscan_of > last_pgscan_of; +} + +bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent) { + uint64_t swap_threshold; + + assert(ctx); + assert(threshold_percent <= 100); + + swap_threshold = ctx->swap_total * threshold_percent / ((uint64_t) 100); + return (ctx->swap_total - ctx->swap_used) < swap_threshold; +} + +int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret) { + _cleanup_free_ OomdCGroupContext **sorted = NULL; + OomdCGroupContext *item; + size_t k = 0; + + assert(h); + assert(compare_func); + assert(ret); + + sorted = new0(OomdCGroupContext*, hashmap_size(h)); + if (!sorted) + return -ENOMEM; + + HASHMAP_FOREACH(item, h) { + if (item->path && prefix && !path_startswith(item->path, prefix)) + continue; + + sorted[k++] = item; + } + + typesafe_qsort(sorted, k, compare_func); + + *ret = TAKE_PTR(sorted); + + assert(k <= INT_MAX); + return (int) k; +} + +int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) { + _cleanup_set_free_ Set *pids_killed = NULL; + int r; + + assert(path); + + if (dry_run) { + _cleanup_free_ char *cg_path = NULL; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &cg_path); + if (r < 0) + return r; + + log_debug("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse)); + return 0; + } + + pids_killed = set_new(NULL); + if (!pids_killed) + return -ENOMEM; + + if (recurse) + r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, 
path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
+        else
+                r = cg_kill(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
+        if (r < 0)
+                return r;
+
+        r = increment_oomd_xattr(path, "user.systemd_oomd_kill", set_size(pids_killed));
+        if (r < 0)
+                log_debug_errno(r, "Failed to set user.systemd_oomd_kill on kill: %m");
+
+        return set_size(pids_killed) != 0;
+}
+
+/* Sorts the candidates in `h` with compare_pgscan (restricted to paths below `prefix` when it is not NULL)
+ * and tries to kill them one after the other, stopping at the first candidate whose pgscan is 0.
+ * Returns 1 as soon as a kill succeeded, 0 if nothing was killed (a dry run never kills), or the negative
+ * result of the last kill attempt. -ENOMEM aborts the walk immediately. */
+int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) {
+        _cleanup_free_ OomdCGroupContext **sorted = NULL;
+        int n, r = 0;
+
+        assert(h);
+
+        n = oomd_sort_cgroup_contexts(h, compare_pgscan, prefix, &sorted);
+        if (n < 0)
+                return n;
+
+        /* The candidate count lives in `n`, separate from the kill result in `r`: reusing one variable
+         * for both ends the loop after the first candidate regardless of the outcome. */
+        for (int i = 0; i < n; i++) {
+                if (sorted[i]->pgscan == 0)
+                        break;
+
+                r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
+                /* In a dry run nothing is ever killed, so stop after reporting the first candidate. */
+                if (r > 0 || r == -ENOMEM || dry_run)
+                        break;
+        }
+
+        return r;
+}
+
+/* Sorts all candidates in `h` with compare_swap_usage and tries to kill them one after the other, stopping
+ * at the first candidate whose swap usage is 0.
+ * Returns 1 as soon as a kill succeeded, 0 if nothing was killed (a dry run never kills), or the negative
+ * result of the last kill attempt. -ENOMEM aborts the walk immediately. */
+int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) {
+        _cleanup_free_ OomdCGroupContext **sorted = NULL;
+        int n, r = 0;
+
+        assert(h);
+
+        n = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted);
+        if (n < 0)
+                return n;
+
+        /* Try to kill cgroups with non-zero swap usage until we either succeed in
+         * killing or we get to a cgroup with no swap usage. The candidate count is kept in `n` so that
+         * the kill result in `r` cannot cut the loop short. */
+        for (int i = 0; i < n; i++) {
+                if (sorted[i]->swap_usage == 0)
+                        break;
+
+                r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
+                /* In a dry run nothing is ever killed, so stop after reporting the first candidate. */
+                if (r > 0 || r == -ENOMEM || dry_run)
+                        break;
+        }
+
+        return r;
+}
+
+int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
+        _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
+        _cleanup_free_ char *p = NULL, *val = NULL;
+        bool is_root;
+        int r;
+
+        assert(path);
+        assert(ret);
+
+        ctx = new0(OomdCGroupContext, 1);
+        if (!ctx)
+                return -ENOMEM;
+
+        is_root = empty_or_root(path);
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p);
+        if (r < 0)
+                return log_debug_errno(r, "Error getting cgroup memory pressure path from %s: %m", path);
+
+        r = read_resource_pressure(p, PRESSURE_TYPE_FULL, &ctx->memory_pressure);
+        if (r < 0)
+                return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p);
+
+        if (is_root) {
+                r = procfs_memory_get_used(&ctx->current_memory_usage);
+                if (r < 0)
+                        return log_debug_errno(r, "Error getting memory used from procfs: %m");
+        } else {
+                r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.current", &ctx->current_memory_usage);
+                if (r < 0)
+                        return log_debug_errno(r, "Error getting memory.current from %s: %m", path);
+
+                r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.min", &ctx->memory_min);
+                if (r < 0)
+                        return log_debug_errno(r, "Error getting memory.min from %s: %m", path);
+
+                r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.low", &ctx->memory_low);
+                if (r < 0)
+                        return log_debug_errno(r, "Error getting memory.low from %s: %m", path);
+
+                r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.swap.current", &ctx->swap_usage);
+                if (r < 0)
+                        return log_debug_errno(r, "Error getting memory.swap.current from %s: %m", path);
+
+                r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, path, "memory.stat", STRV_MAKE("pgscan"), &val);
+                if (r < 0)
+                        return
log_debug_errno(r, "Error getting pgscan from memory.stat under %s: %m", path);
+
+                r = safe_atou64(val, &ctx->pgscan);
+                if (r < 0)
+                        return log_debug_errno(r, "Error converting pgscan value to uint64_t: %m");
+        }
+
+        ctx->path = strdup(empty_to_root(path));
+        if (!ctx->path)
+                return -ENOMEM;
+
+        *ret = TAKE_PTR(ctx);
+        return 0;
+}
+
+/* Parses a file in /proc/swaps format: skips the five-column header line, then adds up the size and used
+ * columns of every entry (each multiplied by 1024) and stores the totals in `ret`.
+ * Returns 0 on success, -errno if the file cannot be opened or read, -EIO if an entry cannot be parsed. */
+int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret) {
+        _cleanup_fclose_ FILE *f = NULL;
+        OomdSystemContext ctx = {};
+        int r;
+
+        assert(proc_swaps_path);
+        assert(ret);
+
+        f = fopen(proc_swaps_path, "re");
+        if (!f)
+                return -errno;
+
+        (void) fscanf(f, "%*s %*s %*s %*s %*s\n");
+
+        for (;;) {
+                uint64_t total, used;
+
+                r = fscanf(f,
+                           "%*s "          /* device/file */
+                           "%*s "          /* type of swap */
+                           "%" PRIu64 " "  /* swap size */
+                           "%" PRIu64 " "  /* used */
+                           "%*s\n",        /* priority */
+                           &total, &used);
+
+                if (r == EOF && feof(f))
+                        break;
+
+                if (r != 2) {
+                        if (ferror(f))
+                                return log_debug_errno(errno, "Error reading from %s: %m", proc_swaps_path);
+
+                        return log_debug_errno(SYNTHETIC_ERRNO(EIO),
+                                               "Failed to parse values from %s: %m", proc_swaps_path);
+                }
+
+                ctx.swap_total += total * 1024U;
+                ctx.swap_used += used * 1024U;
+        }
+
+        *ret = ctx;
+        return 0;
+}
+
+/* Acquires a fresh OomdCGroupContext for `path` and inserts it into `new_h`, keyed by the context's own
+ * path (the root-normalized form of `path`). If `old_h` holds a context for the same cgroup, its pgscan
+ * becomes the new last_pgscan and its memory pressure limit and limit-hit timestamp are carried over.
+ * Returns 0 on success and a negative value on failure; a context that could not be inserted is freed. */
+int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) {
+        _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *curr_ctx = NULL;
+        OomdCGroupContext *old_ctx;
+        int r;
+
+        assert(new_h);
+        assert(path);
+
+        r = oomd_cgroup_context_acquire(path, &curr_ctx);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get OomdCGroupContext for %s: %m", path);
+
+        /* Look up by the normalized path the contexts are stored under, so that an empty `path` still
+         * finds the previous "/" entry. */
+        old_ctx = hashmap_get(old_h, curr_ctx->path);
+        if (old_ctx) {
+                curr_ctx->last_pgscan = old_ctx->pgscan;
+                curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
+                curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
+        }
+
+        /* Hand ownership to the hashmap only after the insert succeeded; on failure the cleanup
+         * attribute still frees the context instead of leaking it. */
+        r = hashmap_put(new_h, curr_ctx->path, curr_ctx);
+        if (r < 0)
+                return r;
+
+        TAKE_PTR(curr_ctx);
+        return 0;
+}
+
+void
oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) { + char swap[FORMAT_BYTES_MAX]; + + assert(ctx); + assert(f); + + if (!empty_or_root(ctx->path)) + fprintf(f, + "%sPath: %s\n" + "%s\tSwap Usage: %s\n", + strempty(prefix), ctx->path, + strempty(prefix), format_bytes(swap, sizeof(swap), ctx->swap_usage)); + else + fprintf(f, + "%sPath: %s\n" + "%s\tSwap Usage: (see System Context)\n", + strempty(prefix), ctx->path, + strempty(prefix)); +} + +void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) { + char tbuf[FORMAT_TIMESPAN_MAX], mem_use[FORMAT_BYTES_MAX]; + char mem_min[FORMAT_BYTES_MAX], mem_low[FORMAT_BYTES_MAX]; + + assert(ctx); + assert(f); + + fprintf(f, + "%sPath: %s\n" + "%s\tMemory Pressure Limit: %lu%%\n" + "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n" + "%s\tCurrent Memory Usage: %s\n", + strempty(prefix), ctx->path, + strempty(prefix), LOAD_INT(ctx->mem_pressure_limit), + strempty(prefix), + LOAD_INT(ctx->memory_pressure.avg10), LOAD_FRAC(ctx->memory_pressure.avg10), + LOAD_INT(ctx->memory_pressure.avg60), LOAD_FRAC(ctx->memory_pressure.avg60), + LOAD_INT(ctx->memory_pressure.avg300), LOAD_FRAC(ctx->memory_pressure.avg300), + format_timespan(tbuf, sizeof(tbuf), ctx->memory_pressure.total, USEC_PER_SEC), + strempty(prefix), format_bytes(mem_use, sizeof(mem_use), ctx->current_memory_usage)); + + if (!empty_or_root(ctx->path)) + fprintf(f, + "%s\tMemory Min: %s\n" + "%s\tMemory Low: %s\n" + "%s\tPgscan: %" PRIu64 "\n", + strempty(prefix), format_bytes_cgroup_protection(mem_min, sizeof(mem_min), ctx->memory_min), + strempty(prefix), format_bytes_cgroup_protection(mem_low, sizeof(mem_low), ctx->memory_low), + strempty(prefix), ctx->pgscan); +} + +void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix) { + char used[FORMAT_BYTES_MAX], total[FORMAT_BYTES_MAX]; + + assert(ctx); + assert(f); + + fprintf(f, + 
"%sSwap: Used: %s Total: %s\n", + strempty(prefix), + format_bytes(used, sizeof(used), ctx->swap_used), + format_bytes(total, sizeof(total), ctx->swap_total)); +} diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h new file mode 100644 index 0000000000..cfd717a018 --- /dev/null +++ b/src/oom/oomd-util.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include + +#include "hashmap.h" +#include "psi-util.h" + +#define GROWING_SIZE_PERCENTILE 80 + +extern const struct hash_ops oomd_cgroup_ctx_hash_ops; + +typedef struct OomdCGroupContext OomdCGroupContext; +typedef struct OomdSystemContext OomdSystemContext; + +typedef int (oomd_compare_t)(OomdCGroupContext * const *, OomdCGroupContext * const *); + +struct OomdCGroupContext { + char *path; + + ResourcePressure memory_pressure; + + uint64_t current_memory_usage; + + uint64_t memory_min; + uint64_t memory_low; + uint64_t swap_usage; + + uint64_t last_pgscan; + uint64_t pgscan; + + /* These are only used by oomd_pressure_above for acting on high memory pressure. */ + loadavg_t mem_pressure_limit; + usec_t last_hit_mem_pressure_limit; +}; + +struct OomdSystemContext { + uint64_t swap_total; + uint64_t swap_used; +}; + +OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx); +DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free); + +/* All hashmaps used with these functions are expected to be of the form + * key: cgroup paths -> value: OomdCGroupContext. */ + +/* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret` + * if any of them have exceeded their supplied memory pressure limits for the `duration` length of time. + * `last_hit_mem_pressure_limit` is updated accordingly for each entry when the limit is exceeded, and when it returns + * below the limit. + * Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `duration`. + * Returns -ENOMEM for allocation errors. 
*/ +int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret); + +/* Sum up current OomdCGroupContexts' pgscan values and last interval's pgscan values in `h`. Returns true if the + * current sum is higher than the last interval's sum (there was some reclaim activity). */ +bool oomd_memory_reclaim(Hashmap *h); + +/* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */ +bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent); + +static inline int compare_pgscan(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + assert(c1); + assert(c2); + + if ((*c1)->pgscan > (*c2)->pgscan) + return -1; + else if ((*c1)->pgscan < (*c2)->pgscan) + return 1; + else + return 0; +} + +static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + assert(c1); + assert(c2); + + if ((*c1)->swap_usage > (*c2)->swap_usage) + return -1; + else if ((*c1)->swap_usage < (*c2)->swap_usage) + return 1; + else + return 0; +} + +/* Get an array of OomdCGroupContexts from `h`, qsorted from largest to smallest values according to `compare_func`. + * If `prefix` is not NULL, only include OomdCGroupContexts whose paths start with prefix. Otherwise all paths are sorted. + * Returns the number of sorted items; negative on error. */ +int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret); + +/* Returns a negative value on error, 0 if no processes were killed, or 1 if processes were killed. */ +int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run); + +/* The following oomd_kill_by_* functions return 1 if processes were killed, or negative otherwise. */ +/* If `prefix` is supplied, only cgroups whose paths start with `prefix` are eligible candidates. Otherwise, + * everything in `h` is a candidate. 
*/
+int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run);
+int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run);
+
+int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret);
+int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret);
+
+/* Get the OomdCGroupContext of `path` and insert it into `new_h`. The key for the inserted context will be `path`.
+ *
+ * `old_h` is used to get data used to calculate prior interval information. `old_h` can be NULL in which case there
+ * was no prior data to reference. */
+int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path);
+
+void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
+void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
+void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix);
diff --git a/src/oom/oomd.c b/src/oom/oomd.c
new file mode 100644
index 0000000000..0b611efd57
--- /dev/null
+++ b/src/oom/oomd.c
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include <getopt.h>
+
+#include "cgroup-util.h"
+#include "conf-parser.h"
+#include "daemon-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "oomd-manager.h"
+#include "parse-util.h"
+#include "pretty-print.h"
+#include "psi-util.h"
+#include "signal-util.h"
+
+static bool arg_dry_run = false;
+static int arg_swap_used_limit = -1;
+static int arg_mem_pressure_limit = -1;
+
+/* Reads the [OOM] section of PKGSYSCONFDIR "/oomd.conf" and the oomd.conf.d drop-ins, storing
+ * SwapUsedLimitPercent= in arg_swap_used_limit and DefaultMemoryPressureLimitPercent= in
+ * arg_mem_pressure_limit. Returns the result of config_parse_many_nulstr(). */
+static int parse_config(void) {
+        static const ConfigTableItem items[] = {
+                { "OOM", "SwapUsedLimitPercent", config_parse_percent, 0, &arg_swap_used_limit },
+                { "OOM", "DefaultMemoryPressureLimitPercent", config_parse_percent, 0, &arg_mem_pressure_limit },
+                {}
+        };
+
+        return config_parse_many_nulstr(PKGSYSCONFDIR "/oomd.conf",
+                                        CONF_PATHS_NULSTR("systemd/oomd.conf.d"),
+                                        "OOM\0",
+                                        config_item_table_lookup,
+                                        items,
+                                        CONFIG_PARSE_WARN,
+                                        NULL,
+                                        NULL);
+}
+
+/* Prints the command line usage together with a link to the manual page. Returns 0 on success, or the
+ * result of log_oom() when the manual page link cannot be built. */
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+
+        /* The daemon is documented in section 8, which is where the other manual pages point to. */
+        r = terminal_urlify_man("systemd-oomd", "8", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...]\n\n"
+               "Run the userspace out-of-memory (OOM) killer.\n\n"
+               " -h --help Show this help\n"
+               " --dry-run Log write/destructive actions instead of doing them\n"
+               "\nSee the %s for details.\n"
+               , program_invocation_short_name
+               , link
+        );
+
+        return 0;
+}
+
+/* Parses the command line into the arg_* variables. Returns 1 if the daemon should go on starting, 0 if
+ * help was shown and the program should exit successfully, and a negative value on invalid usage. */
+static int parse_argv(int argc, char *argv[]) {
+        enum {
+                /* Start above the range of short option characters so that a long-only option can never
+                 * be mistaken for one, nor for getopt_long()'s 0 return value. */
+                ARG_DRY_RUN = 0x100,
+        };
+
+        static const struct option options[] = {
+                { "help", no_argument, NULL, 'h' },
+                { "dry-run", no_argument, NULL, ARG_DRY_RUN },
+                {}
+        };
+
+        int c;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+                switch (c) {
+
+                case 'h':
+                        return help();
+
+                case ARG_DRY_RUN:
+                        arg_dry_run = true;
+                        break;
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached("Invalid option passed.");
+                }
+
+        /* Reject stray positional arguments instead of silently ignoring them. */
+        if (optind < argc)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "This program takes no arguments.");
+
+        return 1;
+}
+
+static int run(int argc, char *argv[]) {
+        _cleanup_(notify_on_cleanup) const char *notify_msg = NULL;
+        _cleanup_(manager_freep) Manager *m = NULL;
+        int r;
+
+        log_setup_service();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                return r;
+
+        r = parse_config();
+        if (r < 0)
+                return r;
+
+        /* Do some basic requirement checks for running systemd-oomd. It's not exhaustive as some of the other
+         * requirements do not have a reliable means to check for in code.
*/ + if (access("/proc/swaps", F_OK) < 0) + return log_error_errno(errno, "Swap not enabled: %m"); + + if (!is_pressure_supported()) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Pressure Stall Information (PSI) is not supported"); + + r = cg_all_unified(); + if (r < 0) + return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m"); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires the unified cgroups hierarchy"); + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0); + + r = manager_new(&m); + if (r < 0) + return log_error_errno(r, "Failed to create manager: %m"); + + r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit); + if (r < 0) + return log_error_errno(r, "Failed to start up daemon: %m"); + + notify_msg = notify_start(NOTIFY_READY, NOTIFY_STOPPING); + + log_info("systemd-oomd starting%s!", arg_dry_run ? " in dry run mode" : ""); + + r = sd_event_loop(m->event); + if (r < 0) + return log_error_errno(r, "Event loop failed: %m"); + + return 0; +} + +DEFINE_MAIN_FUNCTION(run); diff --git a/src/oom/oomd.conf b/src/oom/oomd.conf new file mode 100644 index 0000000000..8ac9716961 --- /dev/null +++ b/src/oom/oomd.conf @@ -0,0 +1,16 @@ +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# Entries in this file show the compile time defaults. +# You can change settings by editing this file. +# Defaults can be restored by simply deleting this file. 
+# +# See oomd.conf(5) for details + +[OOM] +#SwapUsedLimitPercent=90% +#DefaultMemoryPressureLimitPercent=60% diff --git a/src/oom/org.freedesktop.oom1.conf b/src/oom/org.freedesktop.oom1.conf new file mode 100644 index 0000000000..48b526f0aa --- /dev/null +++ b/src/oom/org.freedesktop.oom1.conf @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/oom/org.freedesktop.oom1.service b/src/oom/org.freedesktop.oom1.service new file mode 100644 index 0000000000..78150716ed --- /dev/null +++ b/src/oom/org.freedesktop.oom1.service @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-2.1+ +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[D-BUS Service] +Name=org.freedesktop.oom1 +Exec=/bin/false +User=root +SystemdService=dbus-org.freedesktop.oom1.service diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c new file mode 100644 index 0000000000..5503f8f2e0 --- /dev/null +++ b/src/oom/test-oomd-util.c @@ -0,0 +1,348 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include + +#include "alloc-util.h" +#include "cgroup-setup.h" +#include "cgroup-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "oomd-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" + +static int fork_and_sleep(unsigned sleep_min) { + usec_t n, timeout, ts; + + pid_t pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + timeout = sleep_min * USEC_PER_MINUTE; + ts = now(CLOCK_MONOTONIC); + while (true) { + n = now(CLOCK_MONOTONIC); + if (ts + timeout < n) { + log_error("Child timed out waiting to be killed"); + abort(); + } + sleep(1); + } + } + + return pid; +} + +static void test_oomd_cgroup_kill(void) { + 
_cleanup_free_ char *cgroup_root = NULL, *cgroup = NULL; + int pid[2]; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + if (cg_all_unified() <= 0) + return (void) log_tests_skipped("cgroups are not running in unified mode"); + + assert_se(cg_pid_get_path(NULL, 0, &cgroup_root) >= 0); + + /* Create another cgroup below this one for the pids we forked off. We need this to be managed + * by the test so that pid1 doesn't delete it before we can read the xattrs. */ + cgroup = path_join(cgroup_root, "oomdkilltest"); + assert(cgroup); + + /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities */ + if (cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "test", 4, 0) == -EPERM) + return (void) log_tests_skipped("no permissions to set user xattrs"); + + /* Do this twice to also check the increment behavior on the xattrs */ + for (int i = 0; i < 2; i++) { + _cleanup_free_ char *v = NULL; + int r; + + for (int j = 0; j < 2; j++) { + pid[j] = fork_and_sleep(5); + assert_se(cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, cgroup, pid[j]) >= 0); + } + + r = oomd_cgroup_kill(cgroup, false /* recurse */, false /* dry run */); + if (r <= 0) { + log_debug_errno(r, "Failed to kill processes under %s: %m", cgroup); + abort(); + } + + /* Wait a bit since processes may take some time to be cleaned up. */ + sleep(2); + assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, cgroup) == true); + + assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.systemd_oomd_kill", &v) >= 0); + assert_se(memcmp(v, i == 0 ? 
"2" : "4", 2) == 0); + } +} + +static void test_oomd_cgroup_context_acquire_and_insert(void) { + _cleanup_hashmap_free_ Hashmap *h1 = NULL, *h2 = NULL; + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; + _cleanup_free_ char *cgroup = NULL; + OomdCGroupContext *c1, *c2; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + if (!is_pressure_supported()) + return (void) log_tests_skipped("system does not support pressure"); + + if (cg_all_unified() <= 0) + return (void) log_tests_skipped("cgroups are not running in unified mode"); + + assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0); + + assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); + + assert_se(streq(ctx->path, cgroup)); + assert_se(ctx->memory_pressure.avg10 == 0); + assert_se(ctx->memory_pressure.avg60 == 0); + assert_se(ctx->memory_pressure.avg300 == 0); + assert_se(ctx->memory_pressure.total == 0); + assert_se(ctx->current_memory_usage > 0); + assert_se(ctx->memory_min == 0); + assert_se(ctx->memory_low == 0); + assert_se(ctx->swap_usage == 0); + assert_se(ctx->last_pgscan == 0); + assert_se(ctx->pgscan == 0); + ctx = oomd_cgroup_context_free(ctx); + + /* Test the root cgroup */ + assert_se(oomd_cgroup_context_acquire("", &ctx) == 0); + assert_se(streq(ctx->path, "/")); + assert_se(ctx->current_memory_usage > 0); + + /* Test hashmap inserts */ + assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops)); + assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == 0); + c1 = hashmap_get(h1, cgroup); + assert_se(c1); + + /* make sure certain values from h1 get updated in h2 */ + c1->pgscan = 5555; + c1->mem_pressure_limit = 6789; + c1->last_hit_mem_pressure_limit = 42; + assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops)); + assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0); + c1 = hashmap_get(h1, cgroup); + c2 = hashmap_get(h2, cgroup); + assert_se(c1); + assert_se(c2); + assert_se(c1 != c2); + assert_se(c2->last_pgscan == 5555); + 
assert_se(c2->mem_pressure_limit == 6789); + assert_se(c2->last_hit_mem_pressure_limit == 42); +} + +static void test_oomd_system_context_acquire(void) { + _cleanup_(unlink_tempfilep) char path[] = "/oomdgetsysctxtestXXXXXX"; + OomdSystemContext ctx; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + assert_se(mkstemp(path) >= 0); /* mkstemp() returns -1 on failure, which is truthy */ + + assert_se(oomd_system_context_acquire("/verylikelynonexistentpath", &ctx) == -ENOENT); + + assert_se(oomd_system_context_acquire(path, &ctx) == 0); + assert_se(ctx.swap_total == 0); + assert_se(ctx.swap_used == 0); + + assert_se(write_string_file(path, "some\nwords\nacross\nmultiple\nlines", WRITE_STRING_FILE_CREATE) == 0); + assert_se(oomd_system_context_acquire(path, &ctx) == 0); + assert_se(ctx.swap_total == 0); + assert_se(ctx.swap_used == 0); + + assert_se(write_string_file(path, "Filename Type Size Used Priority\n" + "/swapvol/swapfile file 18971644 0 -3\n" + "/dev/vda2 partition 1999868 993780 -2", WRITE_STRING_FILE_CREATE) == 0); + assert_se(oomd_system_context_acquire(path, &ctx) == 0); + assert_se(ctx.swap_total == 21474828288); + assert_se(ctx.swap_used == 1017630720); +} + +static void test_oomd_pressure_above(void) { + _cleanup_hashmap_free_ Hashmap *h1 = NULL, *h2 = NULL; + _cleanup_set_free_ Set *t1 = NULL, *t2 = NULL, *t3 = NULL; + OomdCGroupContext ctx[2] = {}, *c; /* zeroed: last_hit_mem_pressure_limit is compared against 0 below */ + loadavg_t threshold; + + assert_se(store_loadavg_fixed_point(80, 0, &threshold) == 0); + + /* /herp.slice */ + assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg10)) == 0); + assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg60)) == 0); + assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg300)) == 0); + ctx[0].mem_pressure_limit = threshold; + + /* /derp.slice */ + assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg10)) == 0); + assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg60)) == 0); + assert_se(store_loadavg_fixed_point(1, 11, 
&(ctx[1].memory_pressure.avg300)) == 0); + ctx[1].mem_pressure_limit = threshold; + + + /* High memory pressure */ + assert_se(h1 = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(h1, "/herp.slice", &ctx[0]) >= 0); + assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1); + assert_se(set_contains(t1, &ctx[0]) == true); + assert_se(c = hashmap_get(h1, "/herp.slice")); + assert_se(c->last_hit_mem_pressure_limit > 0); + + /* Low memory pressure */ + assert_se(h2 = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(h2, "/derp.slice", &ctx[1]) >= 0); + assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0); + assert_se(t2 == NULL); + assert_se(c = hashmap_get(h2, "/derp.slice")); + assert_se(c->last_hit_mem_pressure_limit == 0); + + /* High memory pressure w/ multiple cgroups */ + assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0); + assert_se(oomd_pressure_above(h1, 0 /* duration */, &t3) == 1); + assert_se(set_contains(t3, &ctx[0]) == true); + assert_se(set_size(t3) == 1); + assert_se(c = hashmap_get(h1, "/herp.slice")); + assert_se(c->last_hit_mem_pressure_limit > 0); + assert_se(c = hashmap_get(h1, "/derp.slice")); + assert_se(c->last_hit_mem_pressure_limit == 0); +} + +static void test_oomd_memory_reclaim(void) { + _cleanup_hashmap_free_ Hashmap *h1 = NULL; + char **paths = STRV_MAKE("/0.slice", + "/1.slice", + "/2.slice", + "/3.slice", + "/4.slice"); + + OomdCGroupContext ctx[5] = { + { .path = paths[0], + .last_pgscan = 100, + .pgscan = 100 }, + { .path = paths[1], + .last_pgscan = 100, + .pgscan = 100 }, + { .path = paths[2], + .last_pgscan = 77, + .pgscan = 33 }, + { .path = paths[3], + .last_pgscan = UINT64_MAX, + .pgscan = 100 }, + { .path = paths[4], + .last_pgscan = 100, + .pgscan = UINT64_MAX }, + }; + + assert_se(h1 = hashmap_new(&string_hash_ops)); + assert_se(hashmap_put(h1, paths[0], &ctx[0]) >= 0); + assert_se(hashmap_put(h1, paths[1], &ctx[1]) >= 0); + assert_se(oomd_memory_reclaim(h1) == false); + + 
assert_se(hashmap_put(h1, paths[2], &ctx[2]) >= 0); + assert_se(oomd_memory_reclaim(h1) == false); + + assert_se(hashmap_put(h1, paths[4], &ctx[4]) >= 0); + assert_se(oomd_memory_reclaim(h1) == true); + + assert_se(hashmap_put(h1, paths[3], &ctx[3]) >= 0); + assert_se(oomd_memory_reclaim(h1) == false); +} + +static void test_oomd_swap_free_below(void) { + OomdSystemContext ctx = (OomdSystemContext) { + .swap_total = 20971512 * UINT64_C(1024), /* 64-bit math: the byte count exceeds UINT32_MAX */ + .swap_used = 20971440 * UINT64_C(1024), + }; + assert_se(oomd_swap_free_below(&ctx, 20) == true); + + ctx = (OomdSystemContext) { + .swap_total = 20971512 * UINT64_C(1024), + .swap_used = 3310136 * UINT64_C(1024), + }; + assert_se(oomd_swap_free_below(&ctx, 20) == false); +} + +static void test_oomd_sort_cgroups(void) { + _cleanup_hashmap_free_ Hashmap *h = NULL; + _cleanup_free_ OomdCGroupContext **sorted_cgroups = NULL; + char **paths = STRV_MAKE("/herp.slice", + "/herp.slice/derp.scope", + "/herp.slice/derp.scope/sheep.service", + "/zupa.slice"); + + OomdCGroupContext ctx[4] = { + { .path = paths[0], + .swap_usage = 20, + .pgscan = 60 }, + { .path = paths[1], + .swap_usage = 60, + .pgscan = 40 }, + { .path = paths[2], + .swap_usage = 40, + .pgscan = 20 }, + { .path = paths[3], + .swap_usage = 10, + .pgscan = 80 }, + }; + + assert_se(h = hashmap_new(&string_hash_ops)); + + assert_se(hashmap_put(h, "/herp.slice", &ctx[0]) >= 0); + assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0); + assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0); + assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0); + + assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 4); + assert_se(sorted_cgroups[0] == &ctx[1]); + assert_se(sorted_cgroups[1] == &ctx[2]); + assert_se(sorted_cgroups[2] == &ctx[0]); + assert_se(sorted_cgroups[3] == &ctx[3]); + sorted_cgroups = mfree(sorted_cgroups); + + assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, NULL, &sorted_cgroups) == 4); + assert_se(sorted_cgroups[0] == &ctx[3]); 
+ assert_se(sorted_cgroups[1] == &ctx[0]); + assert_se(sorted_cgroups[2] == &ctx[1]); + assert_se(sorted_cgroups[3] == &ctx[2]); + sorted_cgroups = mfree(sorted_cgroups); + + assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, "/herp.slice/derp.scope", &sorted_cgroups) == 2); + assert_se(sorted_cgroups[0] == &ctx[1]); + assert_se(sorted_cgroups[1] == &ctx[2]); + assert_se(sorted_cgroups[2] == 0); + assert_se(sorted_cgroups[3] == 0); + sorted_cgroups = mfree(sorted_cgroups); +} + +int main(void) { + int r; + + test_setup_logging(LOG_DEBUG); + + test_oomd_system_context_acquire(); + test_oomd_pressure_above(); + test_oomd_memory_reclaim(); + test_oomd_swap_free_below(); + test_oomd_sort_cgroups(); + + /* The following tests operate on live cgroups */ + + r = enter_cgroup_root(NULL); + if (r < 0) + return log_tests_skipped_errno(r, "failed to enter a test cgroup scope"); + + test_oomd_cgroup_kill(); + test_oomd_cgroup_context_acquire_and_insert(); + + return 0; +} diff --git a/src/shared/bus-get-properties.c b/src/shared/bus-get-properties.c index 8ad4694046..5a123bb8f3 100644 --- a/src/shared/bus-get-properties.c +++ b/src/shared/bus-get-properties.c @@ -2,6 +2,7 @@ #include "bus-get-properties.h" #include "rlimit-util.h" +#include "stdio-util.h" #include "string-util.h" int bus_property_get_bool( @@ -54,6 +55,23 @@ int bus_property_get_id128( return sd_bus_message_append_array(reply, 'y', id->bytes, 16); } +int bus_property_get_percent( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + char pstr[DECIMAL_STR_MAX(int) + 2]; + int p = *(int*) userdata; + + xsprintf(pstr, "%d%%", p); + + return sd_bus_message_append_basic(reply, 's', pstr); +} + #if __SIZEOF_SIZE_T__ != 8 int bus_property_get_size( sd_bus *bus, diff --git a/src/shared/bus-get-properties.h b/src/shared/bus-get-properties.h index 81af74309d..f3934a86a2 100644 --- a/src/shared/bus-get-properties.h 
+++ b/src/shared/bus-get-properties.h @@ -8,6 +8,7 @@ int bus_property_get_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); int bus_property_set_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *error); int bus_property_get_id128(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +int bus_property_get_percent(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); #define bus_property_get_usec ((sd_bus_property_get_t) NULL) #define bus_property_set_usec ((sd_bus_property_set_t) NULL) diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 6fe6131292..89e0c5bb95 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -432,7 +432,11 @@ static int bus_append_ip_address_access(sd_bus_message *m, int family, const uni static int bus_append_cgroup_property(sd_bus_message *m, const char *field, const char *eq) { int r; - if (STR_IN_SET(field, "DevicePolicy", "Slice")) + if (STR_IN_SET(field, "DevicePolicy", + "Slice", + "ManagedOOMSwap", + "ManagedOOMMemoryPressure", + "ManagedOOMMemoryPressureLimitPercent")) return bus_append_string(m, field, eq); if (STR_IN_SET(field, "CPUAccounting", diff --git a/src/shared/conf-parser.c b/src/shared/conf-parser.c index 02a27e3a88..524f57ff80 100644 --- a/src/shared/conf-parser.c +++ b/src/shared/conf-parser.c @@ -1243,3 +1243,5 @@ int config_parse_vlanprotocol(const char* unit, return 0; } + +DEFINE_CONFIG_PARSE(config_parse_percent, parse_percent, "Failed to parse percent value"); diff --git a/src/shared/conf-parser.h b/src/shared/conf-parser.h index 57787ea033..2514dcbf48 100644 --- a/src/shared/conf-parser.h +++ b/src/shared/conf-parser.h @@ -147,6 +147,7 
@@ CONFIG_PARSER_PROTOTYPE(config_parse_ip_port); CONFIG_PARSER_PROTOTYPE(config_parse_mtu); CONFIG_PARSER_PROTOTYPE(config_parse_rlimit); CONFIG_PARSER_PROTOTYPE(config_parse_vlanprotocol); +CONFIG_PARSER_PROTOTYPE(config_parse_percent); typedef enum Disabled { DISABLED_CONFIGURATION, diff --git a/src/shared/meson.build b/src/shared/meson.build index 3f409584e6..0ed216f1aa 100644 --- a/src/shared/meson.build +++ b/src/shared/meson.build @@ -189,6 +189,8 @@ shared_sources = files(''' pkcs11-util.h pretty-print.c pretty-print.h + psi-util.c + psi-util.h ptyfwd.c ptyfwd.h pwquality-util.c diff --git a/src/shared/psi-util.c b/src/shared/psi-util.c new file mode 100644 index 0000000000..21e965b04b --- /dev/null +++ b/src/shared/psi-util.c @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include +#include + +#include "alloc-util.h" +#include "extract-word.h" +#include "fd-util.h" +#include "fileio.h" +#include "parse-util.h" +#include "psi-util.h" +#include "string-util.h" +#include "stat-util.h" +#include "strv.h" + +int read_resource_pressure(const char *path, PressureType type, ResourcePressure *ret) { + _cleanup_free_ char *line = NULL; + _cleanup_fclose_ FILE *f = NULL; + unsigned field_filled = 0; + ResourcePressure rp = {}; + const char *t, *cline; + char *word; + int r; + + assert(path); + assert(IN_SET(type, PRESSURE_TYPE_SOME, PRESSURE_TYPE_FULL)); + assert(ret); + + if (type == PRESSURE_TYPE_SOME) + t = "some"; + else if (type == PRESSURE_TYPE_FULL) + t = "full"; + else + return -EINVAL; + + r = fopen_unlocked(path, "re", &f); + if (r < 0) + return r; + + for (;;) { + _cleanup_free_ char *l = NULL; + char *w; + + r = read_line(f, LONG_LINE_MAX, &l); + if (r < 0) + return r; + if (r == 0) + break; + + w = first_word(l, t); + if (w) { + line = TAKE_PTR(l); + cline = w; + break; + } + } + + if (!line) + return -ENODATA; + + /* extracts either avgX=Y.Z or total=X */ + while ((r = extract_first_word(&cline, &word, NULL, 0)) > 0) { + _cleanup_free_ 
char *w = word; + const char *v; + + if ((v = startswith(w, "avg10="))) { + if (field_filled & (1U << 0)) + return -EINVAL; + + field_filled |= 1U << 0; + r = parse_loadavg_fixed_point(v, &rp.avg10); + } else if ((v = startswith(w, "avg60="))) { + if (field_filled & (1U << 1)) + return -EINVAL; + + field_filled |= 1U << 1; + r = parse_loadavg_fixed_point(v, &rp.avg60); + } else if ((v = startswith(w, "avg300="))) { + if (field_filled & (1U << 2)) + return -EINVAL; + + field_filled |= 1U << 2; + r = parse_loadavg_fixed_point(v, &rp.avg300); + } else if ((v = startswith(w, "total="))) { + if (field_filled & (1U << 3)) + return -EINVAL; + + field_filled |= 1U << 3; + r = safe_atou64(v, &rp.total); + } else + continue; + + if (r < 0) + return r; + } + + if (r < 0) + return r; + + if (field_filled != 15U) + return -EINVAL; + + *ret = rp; + return 0; +} + +int is_pressure_supported(void) { + const char *p; + + FOREACH_STRING(p, "/proc/pressure/cpu", "/proc/pressure/io", "/proc/pressure/memory") + if (access(p, F_OK) < 0) { + if (errno == ENOENT) + return 0; + return -errno; + } + + return 1; +} diff --git a/src/shared/psi-util.h b/src/shared/psi-util.h new file mode 100644 index 0000000000..9810dbec6e --- /dev/null +++ b/src/shared/psi-util.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include + +#include "parse-util.h" +#include "time-util.h" + +typedef enum PressureType { + PRESSURE_TYPE_SOME, + PRESSURE_TYPE_FULL, +} PressureType; + +/* Averages are stored in fixed-point with 11 bit fractions */ +typedef struct ResourcePressure { + loadavg_t avg10; + loadavg_t avg60; + loadavg_t avg300; + usec_t total; +} ResourcePressure; + +/** Upstream 4.20+ format + * + * some avg10=0.22 avg60=0.17 avg300=1.11 total=58761459 + * full avg10=0.23 avg60=0.16 avg300=1.08 total=58464525 + */ +int read_resource_pressure(const char *path, PressureType type, ResourcePressure *ret); + +/* Was the kernel compiled with CONFIG_PSI=y? 
1 if yes, 0 if not, negative on error. */ +int is_pressure_supported(void); diff --git a/src/shared/tests.c b/src/shared/tests.c index a5cb486c99..fe6d9dfbd5 100644 --- a/src/shared/tests.c +++ b/src/shared/tests.c @@ -254,7 +254,7 @@ static int allocate_scope(void) { return 0; } -int enter_cgroup_subroot(char **ret_cgroup) { +static int enter_cgroup(char **ret_cgroup, bool enter_subroot) { _cleanup_free_ char *cgroup_root = NULL, *cgroup_subroot = NULL; CGroupMask supported; int r; @@ -268,7 +268,13 @@ int enter_cgroup_subroot(char **ret_cgroup) { return log_warning_errno(r, "cg_pid_get_path(NULL, 0, ...) failed: %m"); assert(r >= 0); - assert_se(asprintf(&cgroup_subroot, "%s/%" PRIx64, cgroup_root, random_u64()) >= 0); + if (enter_subroot) + assert_se(asprintf(&cgroup_subroot, "%s/%" PRIx64, cgroup_root, random_u64()) >= 0); + else { + cgroup_subroot = strdup(cgroup_root); + assert_se(cgroup_subroot != NULL); + } + assert_se(cg_mask_supported(&supported) >= 0); /* If this fails, then we don't mind as the later cgroup operations will fail too, and it's fine if @@ -287,3 +293,11 @@ int enter_cgroup_subroot(char **ret_cgroup) { return 0; } + +int enter_cgroup_subroot(char **ret_cgroup) { + return enter_cgroup(ret_cgroup, true); +} + +int enter_cgroup_root(char **ret_cgroup) { + return enter_cgroup(ret_cgroup, false); +} diff --git a/src/shared/tests.h b/src/shared/tests.h index 6817ef4860..505ca39775 100644 --- a/src/shared/tests.h +++ b/src/shared/tests.h @@ -20,6 +20,7 @@ static inline bool manager_errno_skip_test(int r) { char* setup_fake_runtime_dir(void); int enter_cgroup_subroot(char **ret_cgroup); +int enter_cgroup_root(char **ret_cgroup); int get_testdata_dir(const char *suffix, char **ret); const char* get_catalog_dir(void); bool slow_tests_enabled(void); diff --git a/src/shared/varlink.c b/src/shared/varlink.c index 86b5f08ae7..fabfe78280 100644 --- a/src/shared/varlink.c +++ b/src/shared/varlink.c @@ -418,6 +418,11 @@ static int 
varlink_test_disconnect(Varlink *v) { if (IN_SET(v->state, VARLINK_IDLE_CLIENT) && (v->write_disconnected || v->got_pollhup)) goto disconnect; + /* The server is still expecting to write more, but its write end is disconnected and it got a POLLHUP + * (i.e. from a disconnected client), so disconnect. */ + if (IN_SET(v->state, VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE) && v->write_disconnected && v->got_pollhup) + goto disconnect; + return 0; disconnect: diff --git a/src/shared/varlink.h b/src/shared/varlink.h index 06a34b480d..030db39b2f 100644 --- a/src/shared/varlink.h +++ b/src/shared/varlink.h @@ -171,3 +171,4 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(VarlinkServer *, varlink_server_unref); #define VARLINK_ERROR_METHOD_NOT_FOUND "org.varlink.service.MethodNotFound" #define VARLINK_ERROR_METHOD_NOT_IMPLEMENTED "org.varlink.service.MethodNotImplemented" #define VARLINK_ERROR_INVALID_PARAMETER "org.varlink.service.InvalidParameter" +#define VARLINK_ERROR_SUBSCRIPTION_TAKEN "org.varlink.service.SubscriptionTaken" diff --git a/src/systemd/sd-messages.h b/src/systemd/sd-messages.h index 05f00ed577..eea8c2c900 100644 --- a/src/systemd/sd-messages.h +++ b/src/systemd/sd-messages.h @@ -127,6 +127,9 @@ _SD_BEGIN_DECLARATIONS; #define SD_MESSAGE_OVERMOUNTING SD_ID128_MAKE(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7) #define SD_MESSAGE_OVERMOUNTING_STR SD_ID128_MAKE_STR(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7) +#define SD_MESSAGE_UNIT_OOMD_KILL SD_ID128_MAKE(d9,89,61,1b,15,e4,4c,9d,bf,31,e3,c8,12,56,e4,ed) +#define SD_MESSAGE_UNIT_OOMD_KILL_STR SD_ID128_MAKE_STR(d9,89,61,1b,15,e4,4c,9d,bf,31,e3,c8,12,56,e4,ed) + #define SD_MESSAGE_UNIT_OUT_OF_MEMORY SD_ID128_MAKE(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef) #define SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR SD_ID128_MAKE_STR(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef) diff --git a/src/test/meson.build b/src/test/meson.build index 9bb3499963..60dec9512c 100644 --- a/src/test/meson.build +++ b/src/test/meson.build 
@@ -804,6 +804,10 @@ tests += [ [['src/test/test-local-addresses.c'], [], []], + + [['src/test/test-psi-util.c'], + [], + []], ] ############################################################ diff --git a/src/test/test-parse-util.c b/src/test/test-parse-util.c index 3806c3f8cf..d4f908f5d4 100644 --- a/src/test/test-parse-util.c +++ b/src/test/test-parse-util.c @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: LGPL-2.1+ */ #include +#include #include #include #include @@ -929,6 +930,42 @@ static void test_parse_mtu(void) { assert_se(parse_mtu(AF_UNSPEC, "", &mtu) == -EINVAL); } +static void test_parse_loadavg_fixed_point(void) { + loadavg_t fp; + + assert_se(parse_loadavg_fixed_point("1.23", &fp) == 0); + assert_se(LOAD_INT(fp) == 1); + assert_se(LOAD_FRAC(fp) == 23); + + assert_se(parse_loadavg_fixed_point("1.80", &fp) == 0); + assert_se(LOAD_INT(fp) == 1); + assert_se(LOAD_FRAC(fp) == 80); + + assert_se(parse_loadavg_fixed_point("0.07", &fp) == 0); + assert_se(LOAD_INT(fp) == 0); + assert_se(LOAD_FRAC(fp) == 7); + + assert_se(parse_loadavg_fixed_point("0.00", &fp) == 0); + assert_se(LOAD_INT(fp) == 0); + assert_se(LOAD_FRAC(fp) == 0); + + assert_se(parse_loadavg_fixed_point("4096.57", &fp) == 0); + assert_se(LOAD_INT(fp) == 4096); + assert_se(LOAD_FRAC(fp) == 57); + + /* Caps out at 2 digit fracs */ + assert_se(parse_loadavg_fixed_point("1.100", &fp) == -ERANGE); + + assert_se(parse_loadavg_fixed_point("4096.4096", &fp) == -ERANGE); + assert_se(parse_loadavg_fixed_point("-4000.5", &fp) == -ERANGE); + assert_se(parse_loadavg_fixed_point("18446744073709551615.5", &fp) == -ERANGE); + assert_se(parse_loadavg_fixed_point("foobar", &fp) == -EINVAL); + assert_se(parse_loadavg_fixed_point("3333", &fp) == -EINVAL); + assert_se(parse_loadavg_fixed_point("1.2.3", &fp) == -EINVAL); + assert_se(parse_loadavg_fixed_point(".", &fp) == -EINVAL); + assert_se(parse_loadavg_fixed_point("", &fp) == -EINVAL); +} + int main(int argc, char *argv[]) { log_parse_environment(); log_open(); @@ 
-955,6 +992,7 @@ int main(int argc, char *argv[]) { test_parse_errno(); test_parse_syscall_and_errno(); test_parse_mtu(); + test_parse_loadavg_fixed_point(); return 0; } diff --git a/src/test/test-psi-util.c b/src/test/test-psi-util.c new file mode 100644 index 0000000000..bde8ef80b1 --- /dev/null +++ b/src/test/test-psi-util.c @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include + +#include "alloc-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "psi-util.h" +#include "tests.h" + +static void test_read_mem_pressure(void) { + _cleanup_(unlink_tempfilep) char path[] = "/tmp/pressurereadtestXXXXXX"; + ResourcePressure rp; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); + + assert_se(mkstemp(path) >= 0); /* mkstemp() returns -1 on failure, which is truthy */ + + assert_se(read_resource_pressure("/verylikelynonexistentpath", PRESSURE_TYPE_SOME, &rp) < 0); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0); + + assert_se(write_string_file(path, "herpdederp\n", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0); + + /* Pressure file with some invalid values */ + assert_se(write_string_file(path, "some avg10=0.22=55 avg60=0.17=8 avg300=1.11=00 total=58761459\n" + "full avg10=0.23=55 avg60=0.16=8 avg300=1.08=00 total=58464525", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0); + + /* Same pressure valid values as below but with duplicate avg60 field */ + assert_se(write_string_file(path, "some avg10=0.22 avg60=0.17 avg60=0.18 avg300=1.11 total=58761459\n" + "full avg10=0.23 avg60=0.16 avg300=1.08 total=58464525", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0); + + assert_se(write_string_file(path, "some avg10=0.22 avg60=0.17 avg300=1.11 total=58761459\n" + "full avg10=0.23 avg60=0.16 avg300=1.08 total=58464525", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_resource_pressure(path, 
PRESSURE_TYPE_SOME, &rp) == 0); + assert_se(LOAD_INT(rp.avg10) == 0); + assert_se(LOAD_FRAC(rp.avg10) == 22); + assert_se(LOAD_INT(rp.avg60) == 0); + assert_se(LOAD_FRAC(rp.avg60) == 17); + assert_se(LOAD_INT(rp.avg300) == 1); + assert_se(LOAD_FRAC(rp.avg300) == 11); + assert_se(rp.total == 58761459); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_FULL, &rp) == 0); /* assert_se(): this call fills rp and must not vanish under NDEBUG */ + assert_se(LOAD_INT(rp.avg10) == 0); + assert_se(LOAD_FRAC(rp.avg10) == 23); + assert_se(LOAD_INT(rp.avg60) == 0); + assert_se(LOAD_FRAC(rp.avg60) == 16); + assert_se(LOAD_INT(rp.avg300) == 1); + assert_se(LOAD_FRAC(rp.avg300) == 8); + assert_se(rp.total == 58464525); + + /* Pressure file with extra unsupported fields */ + assert_se(write_string_file(path, "some avg5=0.55 avg10=0.22 avg60=0.17 avg300=1.11 total=58761459\n" + "full avg10=0.23 avg60=0.16 avg300=1.08 avg600=2.00 total=58464525", WRITE_STRING_FILE_CREATE) == 0); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) == 0); + assert_se(LOAD_INT(rp.avg10) == 0); + assert_se(LOAD_FRAC(rp.avg10) == 22); + assert_se(LOAD_INT(rp.avg60) == 0); + assert_se(LOAD_FRAC(rp.avg60) == 17); + assert_se(LOAD_INT(rp.avg300) == 1); + assert_se(LOAD_FRAC(rp.avg300) == 11); + assert_se(rp.total == 58761459); + assert_se(read_resource_pressure(path, PRESSURE_TYPE_FULL, &rp) == 0); + assert_se(LOAD_INT(rp.avg10) == 0); + assert_se(LOAD_FRAC(rp.avg10) == 23); + assert_se(LOAD_INT(rp.avg60) == 0); + assert_se(LOAD_FRAC(rp.avg60) == 16); + assert_se(LOAD_INT(rp.avg300) == 1); + assert_se(LOAD_FRAC(rp.avg300) == 8); + assert_se(rp.total == 58464525); +} + +int main(void) { + test_setup_logging(LOG_DEBUG); + test_read_mem_pressure(); + return 0; +} diff --git a/src/test/test-tables.c b/src/test/test-tables.c index 59f90b76ec..7273611143 100644 --- a/src/test/test-tables.c +++ b/src/test/test-tables.c @@ -3,6 +3,7 @@ #include "architecture.h" #include "automount.h" #include "cgroup.h" +#include "cgroup-util.h" #include "compress.h" #include "condition.h" 
#include "device-private.h" @@ -71,6 +72,7 @@ int main(int argc, char **argv) { test_table(locale_variable, VARIABLE_LC); test_table(log_target, LOG_TARGET); test_table(mac_address_policy, MAC_ADDRESS_POLICY); + test_table(managed_oom_mode, MANAGED_OOM_MODE); test_table(manager_state, MANAGER_STATE); test_table(manager_timestamp, MANAGER_TIMESTAMP); test_table(mount_exec_command, MOUNT_EXEC_COMMAND); diff --git a/sysusers.d/systemd.conf.m4 b/sysusers.d/systemd.conf.m4 index ef5a3cb619..fdfdcf553c 100644 --- a/sysusers.d/systemd.conf.m4 +++ b/sysusers.d/systemd.conf.m4 @@ -9,6 +9,9 @@ g systemd-journal - - m4_ifdef(`ENABLE_NETWORKD', u systemd-network - "systemd Network Management" )m4_dnl +m4_ifdef(`ENABLE_OOMD', +u systemd-oom - "systemd Userspace OOM Killer" +)m4_dnl m4_ifdef(`ENABLE_RESOLVE', u systemd-resolve - "systemd Resolver" )m4_dnl diff --git a/units/meson.build b/units/meson.build index 08c39c99b3..e94e7f7efd 100644 --- a/units/meson.build +++ b/units/meson.build @@ -201,6 +201,7 @@ in_units = [ ['systemd-networkd.service', 'ENABLE_NETWORKD'], ['systemd-networkd-wait-online.service', 'ENABLE_NETWORKD'], ['systemd-nspawn@.service', ''], + ['systemd-oomd.service', 'ENABLE_OOMD'], ['systemd-portabled.service', 'ENABLE_PORTABLED', 'dbus-org.freedesktop.portable1.service'], ['systemd-userdbd.service', 'ENABLE_USERDB'], diff --git a/units/systemd-oomd.service.in b/units/systemd-oomd.service.in new file mode 100644 index 0000000000..a270a0ed78 --- /dev/null +++ b/units/systemd-oomd.service.in @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: LGPL-2.1+ +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. 
+ +[Unit] +Description=Userspace Out-Of-Memory (OOM) Killer +Documentation=man:systemd-oomd.service(8) +ConditionCapability=CAP_KILL +DefaultDependencies=no +Before=multi-user.target shutdown.target +Conflicts=shutdown.target + +[Service] +AmbientCapabilities=CAP_KILL CAP_DAC_OVERRIDE +BusName=org.freedesktop.oom1 +CapabilityBoundingSet=CAP_KILL CAP_DAC_OVERRIDE +ExecStart=@rootlibexecdir@/systemd-oomd +IPAddressDeny=any +LockPersonality=yes +MemoryDenyWriteExecute=yes +# Reserve some minimum amount of memory so that systemd-oomd can continue to +# run in resource starved scenarios. +MemoryMin=64M +MemoryLow=64M +NoNewPrivileges=yes +OOMScoreAdjust=-900 +PrivateDevices=yes +PrivateTmp=yes +ProtectClock=yes +ProtectHome=yes +ProtectHostname=yes +ProtectKernelLogs=yes +ProtectKernelModules=yes +ProtectKernelTunables=yes +ProtectSystem=strict +Restart=on-failure +RestrictAddressFamilies=AF_UNIX +RestrictNamespaces=yes +RestrictRealtime=yes +RestrictSUIDSGID=yes +SystemCallArchitectures=native +SystemCallErrorNumber=EPERM +SystemCallFilter=@system-service +Type=notify +User=systemd-oom +@SERVICE_WATCHDOG@ + +[Install] +WantedBy=multi-user.target +Alias=dbus-org.freedesktop.oom1.service