diff --git a/docs/TRANSIENT-SETTINGS.md b/docs/TRANSIENT-SETTINGS.md
index f8ff413d28..f0dc2ee20f 100644
--- a/docs/TRANSIENT-SETTINGS.md
+++ b/docs/TRANSIENT-SETTINGS.md
@@ -270,6 +270,9 @@ All cgroup/resource control settings are available for transient units
✓ IPAccounting=
✓ IPAddressAllow=
✓ IPAddressDeny=
+✓ ManagedOOMSwap=
+✓ ManagedOOMMemoryPressure=
+✓ ManagedOOMMemoryPressureLimitPercent=
```
## Process Killing Settings
diff --git a/man/oomctl.xml b/man/oomctl.xml
new file mode 100644
index 0000000000..10633b92fc
--- /dev/null
+++ b/man/oomctl.xml
@@ -0,0 +1,86 @@
+
+
+
+
+
+
+
+ oomctl
+ systemd
+
+
+
+ oomctl
+ 1
+
+
+
+ oomctl
+ Analyze the state stored in systemd-oomd
+
+
+
+
+ oomctl
+ OPTIONS
+ COMMAND
+
+
+
+
+ Description
+
+ oomctl may be used to get information about the various contexts read in by
+ the systemd1 userspace
+ out-of-memory (OOM) killer,
+ systemd-oomd8.
+
+
+
+
+ Commands
+
+ The following commands are understood:
+
+
+
+ dump
+
+ Show the current state of the cgroup(s) and system context(s) stored by
+ systemd-oomd.
+
+
+
+
+
+
+
+ Options
+
+ The following options are understood:
+
+
+
+
+
+
+
+
+
+ Exit status
+
+ On success, 0 is returned, a non-zero failure code otherwise.
+
+
+
+ See Also
+
+ systemd1,
+ systemd-oomd.service8,
+ oomd.conf5
+
+
+
+
diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml
new file mode 100644
index 0000000000..e6be947c5b
--- /dev/null
+++ b/man/oomd.conf.xml
@@ -0,0 +1,88 @@
+
+
+
+
+
+
+ oomd.conf
+ systemd
+
+
+
+ oomd.conf
+ 5
+
+
+
+ oomd.conf
+ oomd.conf.d
+ Global systemd-oomd configuration files
+
+
+
+ /etc/systemd/oomd.conf
+ /etc/systemd/oomd.conf.d/*.conf
+ /usr/lib/systemd/oomd.conf.d/*.conf
+
+
+
+ Description
+
+ These files configure the various parameters of the
+ systemd1 userspace
+ out-of-memory (OOM) killer,
+ systemd-oomd.service8.
+ See systemd.syntax7
+ for a general description of the syntax.
+
+
+
+
+
+
+ [OOM] Section Options
+
+ The following options are available in the [OOM] section:
+
+
+
+ SwapUsedLimitPercent=
+
+ Sets the limit for swap usage on the system before systemd-oomd will
+ take action. If the percentage of swap used on the system is more than what is defined here,
+ systemd-oomd will act on eligible descendant cgroups, starting from the ones with the
+ highest swap usage to the lowest swap usage. Which cgroups are monitored and what
+ action gets taken depends on what the unit has configured for ManagedOOMSwap=.
+ Takes a percentage value between 0% and 100%, inclusive. Defaults to 90%.
+
+
+
+ DefaultMemoryPressureLimitPercent=
+
+ Sets the limit for memory pressure on the unit's cgroup before systemd-oomd
+ will take action. A unit can override this value with ManagedOOMMemoryPressureLimitPercent=.
+ The memory pressure for this property represents the fraction of time in a 10 second window in which all tasks
+ in the cgroup were delayed. For each monitored cgroup, if the memory pressure on that cgroup exceeds the
+ limit set for more than 30 seconds, systemd-oomd will act on eligible descendant cgroups,
+ starting from the ones with the most reclaim activity to the least reclaim activity. Which cgroups are
+ monitored and what action gets taken depends on what the unit has configured for
+ ManagedOOMMemoryPressure=. Takes a percentage value between 0% and 100%, inclusive.
+ Defaults to 60%.
+
+
+
+
+
+
+ See Also
+
+ systemd1,
+ systemd.resource-control5,
+ systemd-oomd.service8,
+ oomctl1
+
+
+
+
diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index 02f7293288..3c0e5b6eb1 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -2414,6 +2414,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
readonly as IPEgressFilterPath = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DisableControllers = ['...', ...];
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMSwap = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressure = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressureLimitPercent = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@@ -2928,6 +2934,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
+
+
+
+
@@ -3478,6 +3490,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
+
+
+
+
+
+
@@ -4121,6 +4139,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
readonly as IPEgressFilterPath = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DisableControllers = ['...', ...];
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMSwap = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressure = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressureLimitPercent = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@@ -4661,6 +4685,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
+
+
+
+
+
+
@@ -5211,6 +5241,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
+
+
+
+
+
+
@@ -5780,6 +5816,12 @@ node /org/freedesktop/systemd1/unit/home_2emount {
readonly as IPEgressFilterPath = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DisableControllers = ['...', ...];
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMSwap = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressure = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressureLimitPercent = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@@ -6250,6 +6292,12 @@ node /org/freedesktop/systemd1/unit/home_2emount {
+
+
+
+
+
+
@@ -6720,6 +6768,12 @@ node /org/freedesktop/systemd1/unit/home_2emount {
+
+
+
+
+
+
@@ -7404,6 +7458,12 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
readonly as IPEgressFilterPath = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DisableControllers = ['...', ...];
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMSwap = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressure = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressureLimitPercent = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@@ -7860,6 +7920,12 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
+
+
+
+
+
+
@@ -8316,6 +8382,12 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
+
+
+
+
+
+
@@ -8859,6 +8931,12 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
readonly as IPEgressFilterPath = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DisableControllers = ['...', ...];
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMSwap = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressure = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressureLimitPercent = '...';
};
interface org.freedesktop.DBus.Peer { ... };
interface org.freedesktop.DBus.Introspectable { ... };
@@ -8989,6 +9067,12 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
+
+
+
+
+
+
@@ -9123,6 +9207,12 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
+
+
+
+
+
+
@@ -9276,6 +9366,12 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
readonly as IPEgressFilterPath = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly as DisableControllers = ['...', ...];
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMSwap = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressure = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+ readonly s ManagedOOMMemoryPressureLimitPercent = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s KillMode = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@@ -9422,6 +9518,12 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
+
+
+
+
+
+
@@ -9582,6 +9684,12 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
+
+
+
+
+
+
diff --git a/man/rules/meson.build b/man/rules/meson.build
index 00cd57420e..806561a412 100644
--- a/man/rules/meson.build
+++ b/man/rules/meson.build
@@ -45,6 +45,8 @@ manpages = [
['nss-mymachines', '8', ['libnss_mymachines.so.2'], 'ENABLE_NSS_MYMACHINES'],
['nss-resolve', '8', ['libnss_resolve.so.2'], 'ENABLE_NSS_RESOLVE'],
['nss-systemd', '8', ['libnss_systemd.so.2'], 'ENABLE_NSS_SYSTEMD'],
+ ['oomctl', '1', [], 'ENABLE_OOMD'],
+ ['oomd.conf', '5', ['oomd.conf.d'], 'ENABLE_OOMD'],
['org.freedesktop.LogControl1', '5', [], ''],
['org.freedesktop.home1', '5', [], 'ENABLE_HOMED'],
['org.freedesktop.hostname1', '5', [], 'ENABLE_HOSTNAMED'],
@@ -907,6 +909,7 @@ manpages = [
['systemd-networkd.service', '8', ['systemd-networkd'], 'ENABLE_NETWORKD'],
['systemd-notify', '1', [], ''],
['systemd-nspawn', '1', [], ''],
+ ['systemd-oomd.service', '8', ['systemd-oomd'], 'ENABLE_OOMD'],
['systemd-path', '1', [], ''],
['systemd-portabled.service', '8', ['systemd-portabled'], 'ENABLE_PORTABLED'],
['systemd-pstore.service', '8', ['systemd-pstore'], 'ENABLE_PSTORE'],
diff --git a/man/systemd-oomd.service.xml b/man/systemd-oomd.service.xml
new file mode 100644
index 0000000000..9d72373d1e
--- /dev/null
+++ b/man/systemd-oomd.service.xml
@@ -0,0 +1,98 @@
+
+
+
+
+
+
+
+ systemd-oomd.service
+ systemd
+
+
+
+ systemd-oomd.service
+ 8
+
+
+
+ systemd-oomd.service
+ systemd-oomd
+ A userspace out-of-memory (OOM) killer
+
+
+
+ systemd-oomd.service
+ /usr/lib/systemd/systemd-oomd
+
+
+
+ Description
+
+ systemd-oomd is a system service that uses cgroups-v2 and pressure stall information (PSI)
+ to monitor and take action on processes before an OOM occurs in kernel space.
+
+ You can enable monitoring and actions on units by setting ManagedOOMSwap= and/or
+ ManagedOOMMemoryPressure= to the appropriate value. systemd-oomd will
+ periodically poll enabled units' cgroup data to detect when corrective action needs to occur. When an action needs
+ to happen, it will only be performed on the descendant cgroups of the enabled units. More precisely, only cgroups with
+ memory.oom.group set to 1 and leaf cgroup nodes are eligible candidates.
+ Action will be taken recursively on all of the processes under the chosen candidate.
+
+ See
+ oomd.conf5
+ for more information about the configuration of this service.
+
+
+
+ Setup Information
+
+ The system must be running systemd with a full unified cgroup hierarchy for the expected cgroups-v2 features.
+ Furthermore, resource accounting must be turned on for all units monitored by systemd-oomd.
+ The easiest way to turn on resource accounting is by ensuring the values for DefaultCPUAccounting,
+ DefaultIOAccounting, DefaultMemoryAccounting, and
+ DefaultTasksAccounting are set to true in
+ systemd-system.conf5.
+
+ You will need a kernel compiled with PSI support. This is available in Linux 4.20 and above.
+
+ The system must also have swap enabled for systemd-oomd to function correctly. With swap
+ enabled, the system spends enough time swapping pages to let systemd-oomd react.
+ Without swap, the system enters a livelocked state much more quickly and may prevent systemd-oomd
+ from responding in a reasonable amount of time. See
+ "In defence of swap: common misconceptions"
+ for more details on swap.
+
+ Be aware that if you intend to enable monitoring and actions on user.slice,
+ user-$UID.slice, or their ancestor cgroups, it is highly recommended that your programs be
+ managed by the systemd user manager to prevent running too many processes under the same session scope (and thus
+ avoid a situation where memory intensive tasks trigger systemd-oomd to kill everything under the
+ cgroup). If you're using a desktop environment like GNOME, it already spawns many session components with the
+ systemd user manager.
+
+
+
+ Usage Recommendations
+
+ ManagedOOMSwap= works with the system-wide swap values, so setting it on the root slice
+ -.slice, and allowing all descendant cgroups to be eligible candidates may make the most
+ sense.
+
+ ManagedOOMMemoryPressure= tends to work better on the cgroups below the root slice
+ -.slice. For units which tend to have processes that are less latency sensitive (e.g.
+ system.slice), a higher limit like the default of 60% may be acceptable, as those processes
+ can usually ride out slowdowns caused by lack of memory without serious consequences. However, something like
+ user@$UID.service may prefer a much lower value like 40%.
+
+
+
+ See Also
+
+ systemd1,
+ systemd-system.conf5,
+ systemd.resource-control5,
+ oomd.conf5,
+ oomctl1
+
+
+
diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml
index d72f9048e7..b40fa86145 100644
--- a/man/systemd.resource-control.xml
+++ b/man/systemd.resource-control.xml
@@ -869,6 +869,49 @@ DeviceAllow=/dev/loop-control
+
+
+ ManagedOOMSwap=auto|kill
+ ManagedOOMMemoryPressure=auto|kill
+
+
+ Specifies how
+ systemd-oomd.service8
+ will act on this unit's cgroups. Defaults to .
+
+ When set to , systemd-oomd will actively monitor this unit's
+ cgroup metrics to decide whether it needs to act. If the cgroup passes the limits set by
+ oomd.conf5 or its
+ overrides, systemd-oomd will send a SIGKILL to all of the processes
+ under the chosen candidate cgroup. Note that only descendant cgroups can be eligible candidates for killing;
+ the unit that set its property to is not a candidate (unless one of its ancestors set
+ their property to ). You can find more details on candidates and kill behavior at
+ systemd-oomd.service8
+ and oomd.conf5. Setting
+ either of these properties to will also automatically acquire
+ After= and Wants= dependencies on
+ systemd-oomd.service unless DefaultDependencies=no.
+
+
+ When set to , systemd-oomd will not actively use this cgroup's
+ data for monitoring and detection. However, if an ancestor cgroup has one of these properties set to
+ , a unit with can still be an eligible candidate for
+ systemd-oomd to act on.
+
+
+
+
+ ManagedOOMMemoryPressureLimitPercent=
+
+
+ Overrides the default memory pressure limit set by
+ oomd.conf5 for this unit
+ (cgroup). Takes a percentage value between 0% and 100%, inclusive. This property is ignored unless
+ ManagedOOMMemoryPressure=. Defaults to 0%, which means use the
+ default set by oomd.conf5.
+
+
+
@@ -1030,6 +1073,7 @@ DeviceAllow=/dev/loop-control
systemd.exec5,
systemd.directives7,
systemd.special7,
+ systemd-oomd.service8,
The documentation for control groups and specific controllers in the Linux kernel:
Control Groups v2.
diff --git a/meson.build b/meson.build
index 04cb63d921..43cf7bf2bb 100644
--- a/meson.build
+++ b/meson.build
@@ -1412,6 +1412,9 @@ conf.set10('ENABLE_HOMED', have)
have = have and conf.get('HAVE_PAM') == 1
conf.set10('ENABLE_PAM_HOME', have)
+have = get_option('oomd') and get_option('mode') == 'developer'
+conf.set10('ENABLE_OOMD', have)
+
want_remote = get_option('remote')
if want_remote != 'false'
have_deps = [conf.get('HAVE_MICROHTTPD') == 1,
@@ -1451,6 +1454,7 @@ foreach term : ['analyze',
'networkd',
'nss-myhostname',
'nss-systemd',
+ 'oomd',
'portabled',
'pstore',
'quotacheck',
@@ -1671,6 +1675,7 @@ subdir('src/analyze')
subdir('src/journal-remote')
subdir('src/coredump')
subdir('src/pstore')
+subdir('src/oom')
subdir('src/hostname')
subdir('src/import')
subdir('src/partition')
@@ -2730,6 +2735,27 @@ if conf.get('ENABLE_PSTORE') == 1
install_dir : rootlibexecdir)
endif
+if conf.get('ENABLE_OOMD') == 1
+ executable('systemd-oomd',
+ systemd_oomd_sources,
+ include_directories : includes,
+ link_with : [libshared],
+ dependencies : [],
+ install_rpath : rootlibexecdir,
+ install : true,
+ install_dir : rootlibexecdir)
+
+ public_programs += executable(
+ 'oomctl',
+ oomctl_sources,
+ include_directories : includes,
+ link_with : [libshared],
+ dependencies : [],
+ install_rpath : rootlibexecdir,
+ install : true,
+ install_dir : rootbindir)
+endif
+
if conf.get('ENABLE_BINFMT') == 1
public_programs += executable(
'systemd-binfmt',
@@ -3748,6 +3774,7 @@ foreach tuple : [
['DNS-over-TLS(openssl)', conf.get('DNS_OVER_TLS_USE_OPENSSL') == 1],
['coredump'],
['pstore'],
+ ['oomd'],
['polkit'],
['legacy pkla', install_polkit_pkla],
['efi'],
diff --git a/meson_options.txt b/meson_options.txt
index d5ce647ae6..a6a0c1e4b8 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -97,6 +97,8 @@ option('coredump', type : 'boolean',
description : 'install the coredump handler')
option('pstore', type : 'boolean',
description : 'install the pstore archival tool')
+option('oomd', type : 'boolean', value : 'false',
+ description : 'install the userspace oom killer')
option('logind', type : 'boolean',
description : 'install the systemd-logind stack')
option('hostnamed', type : 'boolean',
diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c
index 6210347553..d2655673fd 100644
--- a/src/basic/cgroup-util.c
+++ b/src/basic/cgroup-util.c
@@ -1685,6 +1685,26 @@ int cg_get_attribute_as_uint64(const char *controller, const char *path, const c
return 0;
}
+int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret) {
+ _cleanup_free_ char *value = NULL;
+ int r;
+
+ assert(ret);
+
+ r = cg_get_attribute(controller, path, attribute, &value);
+ if (r == -ENOENT)
+ return -ENODATA;
+ if (r < 0)
+ return r;
+
+ r = parse_boolean(value);
+ if (r < 0)
+ return r;
+
+ *ret = r;
+ return 0;
+}
+
int cg_get_keyed_attribute_full(
const char *controller,
const char *path,
@@ -2161,3 +2181,10 @@ CGroupMask get_cpu_accounting_mask(void) {
bool cpu_accounting_is_cheap(void) {
return get_cpu_accounting_mask() == 0;
}
+
+static const char* const managed_oom_mode_table[_MANAGED_OOM_MODE_MAX] = {
+ [MANAGED_OOM_AUTO] = "auto",
+ [MANAGED_OOM_KILL] = "kill",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(managed_oom_mode, ManagedOOMMode);
diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h
index 2b88571bc1..eda2b16a1b 100644
--- a/src/basic/cgroup-util.h
+++ b/src/basic/cgroup-util.h
@@ -208,6 +208,9 @@ static inline int cg_get_keyed_attribute_graceful(
int cg_get_attribute_as_uint64(const char *controller, const char *path, const char *attribute, uint64_t *ret);
+/* Does a parse_boolean() on the attribute contents and sets ret accordingly */
+int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret);
+
int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid);
int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags);
@@ -275,3 +278,13 @@ CGroupController cgroup_controller_from_string(const char *s) _pure_;
bool is_cgroup_fs(const struct statfs *s);
bool fd_is_cgroup_fs(int fd);
+
+typedef enum ManagedOOMMode {
+ MANAGED_OOM_AUTO,
+ MANAGED_OOM_KILL,
+ _MANAGED_OOM_MODE_MAX,
+ _MANAGED_OOM_MODE_INVALID = -1,
+} ManagedOOMMode;
+
+const char* managed_oom_mode_to_string(ManagedOOMMode m) _const_;
+ManagedOOMMode managed_oom_mode_from_string(const char *s) _pure_;
diff --git a/src/basic/def.h b/src/basic/def.h
index 970654a1ad..9f1f3c229c 100644
--- a/src/basic/def.h
+++ b/src/basic/def.h
@@ -63,3 +63,5 @@
.un.sun_family = AF_UNIX, \
.un.sun_path = "\0/org/freedesktop/plymouthd", \
}
+
+#define VARLINK_ADDR_PATH_MANAGED_OOM "/run/systemd/io.system.ManagedOOM"
diff --git a/src/basic/linux/loadavg.h b/src/basic/linux/loadavg.h
new file mode 100644
index 0000000000..521a787e8a
--- /dev/null
+++ b/src/basic/linux/loadavg.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_LOADAVG_H
+#define _LINUX_SCHED_LOADAVG_H
+
+/*
+ * These are the constant used to fake the fixed-point load-average
+ * counting. Some notes:
+ * - 11 bit fractions expand to 22 bits by the multiplies: this gives
+ * a load-average precision of 10 bits integer + 11 bits fractional
+ * - if you want to count load-averages more often, you need more
+ * precision, or rounding will get you. With 2-second counting freq,
+ * the EXP_n values would be 1981, 2034 and 2043 if still using only
+ * 11 bit fractions.
+ */
+extern unsigned long avenrun[]; /* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
+
+#define FSHIFT 11 /* nr of bits of precision */
+#define FIXED_1 (1<= load)
+ newload += FIXED_1-1;
+
+ return newload / FIXED_1;
+}
+
+extern unsigned long calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n);
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
+extern void calc_global_load(unsigned long ticks);
+
+#endif /* _LINUX_SCHED_LOADAVG_H */
diff --git a/src/basic/meson.build b/src/basic/meson.build
index d144bc2f87..09eb612c3a 100644
--- a/src/basic/meson.build
+++ b/src/basic/meson.build
@@ -111,6 +111,7 @@ basic_sources = files('''
linux/ipv6_route.h
linux/l2tp.h
linux/libc-compat.h
+ linux/loadavg.h
linux/netdevice.h
linux/netlink.h
linux/rtnetlink.h
diff --git a/src/basic/parse-util.c b/src/basic/parse-util.c
index 818c9054d6..dca2ef9f92 100644
--- a/src/basic/parse-util.c
+++ b/src/basic/parse-util.c
@@ -862,3 +862,45 @@ int parse_oom_score_adjust(const char *s, int *ret) {
*ret = v;
return 0;
}
+
+int store_loadavg_fixed_point(unsigned long i, unsigned long f, loadavg_t *ret) {
+ assert(ret);
+
+ if (i >= (~0UL << FSHIFT))
+ return -ERANGE;
+
+ i = i << FSHIFT;
+ f = DIV_ROUND_UP((f << FSHIFT), 100);
+
+ if (f >= FIXED_1)
+ return -ERANGE;
+
+ *ret = i | f;
+ return 0;
+}
+
+int parse_loadavg_fixed_point(const char *s, loadavg_t *ret) {
+ const char *d, *f_str, *i_str;
+ unsigned long i, f;
+ int r;
+
+ assert(s);
+ assert(ret);
+
+ d = strchr(s, '.');
+ if (!d)
+ return -EINVAL;
+
+ i_str = strndupa(s, d - s);
+ f_str = d + 1;
+
+ r = safe_atolu_full(i_str, 10, &i);
+ if (r < 0)
+ return r;
+
+ r = safe_atolu_full(f_str, 10, &f);
+ if (r < 0)
+ return r;
+
+ return store_loadavg_fixed_point(i, f, ret);
+}
diff --git a/src/basic/parse-util.h b/src/basic/parse-util.h
index 2cee65c49a..f22a19c5c6 100644
--- a/src/basic/parse-util.h
+++ b/src/basic/parse-util.h
@@ -3,12 +3,15 @@
#include
#include
+#include
#include
#include
#include
#include "macro.h"
+typedef unsigned long loadavg_t;
+
int parse_boolean(const char *v) _pure_;
int parse_dev(const char *s, dev_t *ret);
int parse_pid(const char *s, pid_t* ret_pid);
@@ -88,18 +91,18 @@ static inline int safe_atoux64(const char *s, uint64_t *ret) {
}
#if LONG_MAX == INT_MAX
-static inline int safe_atolu(const char *s, unsigned long *ret_u) {
+static inline int safe_atolu_full(const char *s, unsigned base, long unsigned *ret_u) {
assert_cc(sizeof(unsigned long) == sizeof(unsigned));
- return safe_atou(s, (unsigned*) ret_u);
+ return safe_atou_full(s, base, (unsigned*) ret_u);
}
static inline int safe_atoli(const char *s, long int *ret_u) {
assert_cc(sizeof(long int) == sizeof(int));
return safe_atoi(s, (int*) ret_u);
}
#else
-static inline int safe_atolu(const char *s, unsigned long *ret_u) {
+static inline int safe_atolu_full(const char *s, unsigned base, unsigned long *ret_u) {
assert_cc(sizeof(unsigned long) == sizeof(unsigned long long));
- return safe_atollu(s, (unsigned long long*) ret_u);
+ return safe_atollu_full(s, base, (unsigned long long*) ret_u);
}
static inline int safe_atoli(const char *s, long int *ret_u) {
assert_cc(sizeof(long int) == sizeof(long long int));
@@ -107,6 +110,10 @@ static inline int safe_atoli(const char *s, long int *ret_u) {
}
#endif
+static inline int safe_atolu(const char *s, unsigned long *ret_u) {
+ return safe_atolu_full(s, 0, ret_u);
+}
+
#if SIZE_MAX == UINT_MAX
static inline int safe_atozu(const char *s, size_t *ret_u) {
assert_cc(sizeof(size_t) == sizeof(unsigned));
@@ -137,3 +144,8 @@ int parse_ip_port_range(const char *s, uint16_t *low, uint16_t *high);
int parse_ip_prefix_length(const char *s, int *ret);
int parse_oom_score_adjust(const char *s, int *ret);
+
+/* Given a Linux load average (e.g. decimal number 34.89 where 34 is passed as i and 89 is passed as f), convert it
+ * to a loadavg_t. */
+int store_loadavg_fixed_point(unsigned long i, unsigned long f, loadavg_t *ret);
+int parse_loadavg_fixed_point(const char *s, loadavg_t *ret);
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index 211e4a5945..1958c1be2b 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -128,6 +128,9 @@ void cgroup_context_init(CGroupContext *c) {
.startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
.tasks_max = TASKS_MAX_UNSET,
+
+ .moom_swap = MANAGED_OOM_AUTO,
+ .moom_mem_pressure = MANAGED_OOM_AUTO,
};
}
@@ -411,7 +414,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
"%sTasksMax: %" PRIu64 "\n"
"%sDevicePolicy: %s\n"
"%sDisableControllers: %s\n"
- "%sDelegate: %s\n",
+ "%sDelegate: %s\n"
+ "%sManagedOOMSwap: %s\n"
+ "%sManagedOOMMemoryPressure: %s\n"
+ "%sManagedOOMMemoryPressureLimitPercent: %d%%\n",
prefix, yes_no(c->cpu_accounting),
prefix, yes_no(c->io_accounting),
prefix, yes_no(c->blockio_accounting),
@@ -441,7 +447,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
prefix, tasks_max_resolve(&c->tasks_max),
prefix, cgroup_device_policy_to_string(c->device_policy),
prefix, strempty(disable_controllers_str),
- prefix, yes_no(c->delegate));
+ prefix, yes_no(c->delegate),
+ prefix, managed_oom_mode_to_string(c->moom_swap),
+ prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
+ prefix, c->moom_mem_pressure_limit);
if (c->delegate) {
_cleanup_free_ char *t = NULL;
@@ -2672,6 +2681,47 @@ static void unit_remove_from_cgroup_empty_queue(Unit *u) {
u->in_cgroup_empty_queue = false;
}
+int unit_check_oomd_kill(Unit *u) {
+ _cleanup_free_ char *value = NULL;
+ bool increased;
+ uint64_t n = 0;
+ int r;
+
+ if (!u->cgroup_path)
+ return 0;
+
+ r = cg_all_unified();
+ if (r < 0)
+ return log_unit_debug_errno(u, r, "Couldn't determine whether we are in all unified mode: %m");
+ else if (r == 0)
+ return 0;
+
+ r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.systemd_oomd_kill", &value);
+ if (r < 0 && r != -ENODATA)
+ return r;
+
+ if (!isempty(value)) {
+ r = safe_atou64(value, &n);
+ if (r < 0)
+ return r;
+ }
+
+ increased = n > u->managed_oom_kill_last;
+ u->managed_oom_kill_last = n;
+
+ if (!increased)
+ return 0;
+
+ if (n > 0)
+ log_struct(LOG_NOTICE,
+ "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
+ LOG_UNIT_ID(u),
+ LOG_UNIT_INVOCATION_ID(u),
+ LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n));
+
+ return 1;
+}
+
int unit_check_oom(Unit *u) {
_cleanup_free_ char *oom_kill = NULL;
bool increased;
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
index 9ac5c8bfc0..881b3f3dfe 100644
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@@ -159,6 +159,11 @@ struct CGroupContext {
/* Common */
TasksMax tasks_max;
+
+ /* Settings for systemd-oomd */
+ ManagedOOMMode moom_swap;
+ ManagedOOMMode moom_mem_pressure;
+ int moom_mem_pressure_limit;
};
/* Used when querying IP accounting data */
@@ -224,6 +229,7 @@ int unit_watch_cgroup(Unit *u);
int unit_watch_cgroup_memory(Unit *u);
void unit_add_to_cgroup_empty_queue(Unit *u);
+int unit_check_oomd_kill(Unit *u);
int unit_check_oom(Unit *u);
int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c
index 4bb262bd93..18219097f4 100644
--- a/src/core/core-varlink.c
+++ b/src/core/core-varlink.c
@@ -2,6 +2,7 @@
#include "core-varlink.h"
#include "mkdir.h"
+#include "strv.h"
#include "user-util.h"
#include "varlink.h"
@@ -15,6 +16,11 @@ typedef struct LookupParameters {
const char *service;
} LookupParameters;
+static const char* const managed_oom_mode_properties[] = {
+ "ManagedOOMSwap",
+ "ManagedOOMMemoryPressure",
+};
+
static int build_user_json(const char *user_name, uid_t uid, JsonVariant **ret) {
assert(user_name);
assert(uid_is_valid(uid));
@@ -45,6 +51,150 @@ static bool user_match_lookup_parameters(LookupParameters *p, const char *name,
return true;
}
+static int build_managed_oom_json_array_element(Unit *u, const char *property, JsonVariant **ret_v) {
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ CGroupContext *c;
+ const char *mode;
+ int r;
+
+ assert(u);
+ assert(property);
+ assert(ret_v);
+
+ if (!UNIT_VTABLE(u)->can_set_managed_oom)
+ return -EOPNOTSUPP;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return -EINVAL;
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+ /* systemd-oomd should always treat inactive units as though they didn't enable any action since they
+ * should not have a valid cgroup */
+ mode = managed_oom_mode_to_string(MANAGED_OOM_AUTO);
+ else if (streq(property, "ManagedOOMSwap"))
+ mode = managed_oom_mode_to_string(c->moom_swap);
+ else if (streq(property, "ManagedOOMMemoryPressure"))
+ mode = managed_oom_mode_to_string(c->moom_mem_pressure);
+ else
+ return -EINVAL;
+
+ r = json_build(&v, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("mode", JSON_BUILD_STRING(mode)),
+ JSON_BUILD_PAIR("path", JSON_BUILD_STRING(u->cgroup_path)),
+ JSON_BUILD_PAIR("property", JSON_BUILD_STRING(property)),
+ JSON_BUILD_PAIR("limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit))));
+
+ *ret_v = TAKE_PTR(v);
+ return r;
+}
+
+int manager_varlink_send_managed_oom_update(Unit *u) {
+ _cleanup_(json_variant_unrefp) JsonVariant *arr = NULL, *v = NULL;
+ CGroupContext *c;
+ int r;
+
+ assert(u);
+
+ if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager || !u->manager->managed_oom_varlink_request || !u->cgroup_path)
+ return 0;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY);
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < ELEMENTSOF(managed_oom_mode_properties); i++) {
+ _cleanup_(json_variant_unrefp) JsonVariant *e = NULL;
+
+ r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[i], &e);
+ if (r < 0)
+ return r;
+
+ r = json_variant_append_array(&arr, e);
+ if (r < 0)
+ return r;
+ }
+
+ r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr))));
+ if (r < 0)
+ return r;
+
+ return varlink_notify(u->manager->managed_oom_varlink_request, v);
+}
+
+static int vl_method_subscribe_managed_oom_cgroups(
+ Varlink *link,
+ JsonVariant *parameters,
+ VarlinkMethodFlags flags,
+ void *userdata) {
+ static const UnitType supported_unit_types[] = { UNIT_SLICE, UNIT_SERVICE, UNIT_SCOPE };
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *arr = NULL;
+ Manager *m = userdata;
+ int r;
+
+ assert(link);
+ assert(m);
+
+ if (json_variant_elements(parameters) > 0)
+ return varlink_error_invalid_parameter(link, parameters);
+
+ /* We only take one subscriber for this method so return an error if there's already an existing one.
+ * This shouldn't happen since systemd-oomd is the only client of this method. */
+ if (FLAGS_SET(flags, VARLINK_METHOD_MORE) && m->managed_oom_varlink_request)
+ return varlink_error(m->managed_oom_varlink_request, VARLINK_ERROR_SUBSCRIPTION_TAKEN, NULL);
+
+ r = json_build(&arr, JSON_BUILD_EMPTY_ARRAY);
+ if (r < 0)
+ return r;
+
+ for (size_t i = 0; i < ELEMENTSOF(supported_unit_types); i++) {
+ Unit *u;
+
+ LIST_FOREACH(units_by_type, u, m->units_by_type[supported_unit_types[i]]) {
+ CGroupContext *c;
+
+ if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
+ continue;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ continue;
+
+ for (size_t j = 0; j < ELEMENTSOF(managed_oom_mode_properties); j++) {
+ _cleanup_(json_variant_unrefp) JsonVariant *e = NULL;
+
+ /* For the initial varlink call we only care about units that enabled (i.e. mode is not
+ * set to "auto") oomd properties. */
+ if (!(streq(managed_oom_mode_properties[j], "ManagedOOMSwap") && c->moom_swap == MANAGED_OOM_KILL) &&
+ !(streq(managed_oom_mode_properties[j], "ManagedOOMMemoryPressure") && c->moom_mem_pressure == MANAGED_OOM_KILL))
+ continue;
+
+ r = build_managed_oom_json_array_element(u, managed_oom_mode_properties[j], &e);
+ if (r < 0)
+ return r;
+
+ r = json_variant_append_array(&arr, e);
+ if (r < 0)
+ return r;
+ }
+ }
+ }
+
+ r = json_build(&v, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("cgroups", JSON_BUILD_VARIANT(arr))));
+ if (r < 0)
+ return r;
+
+ if (!FLAGS_SET(flags, VARLINK_METHOD_MORE))
+ return varlink_reply(link, v);
+
+ m->managed_oom_varlink_request = varlink_ref(link);
+ return varlink_notify(m->managed_oom_varlink_request, v);
+}
+
static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
static const JsonDispatch dispatch_table[] = {
@@ -262,6 +412,17 @@ static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, Var
return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
}
+static void vl_disconnect(VarlinkServer *s, Varlink *link, void *userdata) {
+ Manager *m = userdata;
+
+ assert(m);
+ assert(s);
+ assert(link);
+
+ if (link == m->managed_oom_varlink_request)
+ m->managed_oom_varlink_request = varlink_unref(link);
+}
+
int manager_varlink_init(Manager *m) {
_cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
int r;
@@ -284,16 +445,25 @@ int manager_varlink_init(Manager *m) {
s,
"io.systemd.UserDatabase.GetUserRecord", vl_method_get_user_record,
"io.systemd.UserDatabase.GetGroupRecord", vl_method_get_group_record,
- "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships);
+ "io.systemd.UserDatabase.GetMemberships", vl_method_get_memberships,
+ "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", vl_method_subscribe_managed_oom_cgroups);
if (r < 0)
return log_error_errno(r, "Failed to register varlink methods: %m");
+ r = varlink_server_bind_disconnect(s, vl_disconnect);
+ if (r < 0)
+ return log_error_errno(r, "Failed to register varlink disconnect handler: %m");
+
if (!MANAGER_IS_TEST_RUN(m)) {
(void) mkdir_p_label("/run/systemd/userdb", 0755);
r = varlink_server_listen_address(s, "/run/systemd/userdb/io.systemd.DynamicUser", 0666);
if (r < 0)
return log_error_errno(r, "Failed to bind to varlink socket: %m");
+
+ r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM, 0666);
+ if (r < 0)
+ return log_error_errno(r, "Failed to bind to varlink socket: %m");
}
r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL);
@@ -307,5 +477,11 @@ int manager_varlink_init(Manager *m) {
void manager_varlink_done(Manager *m) {
assert(m);
+ /* Send the final message if we still have a subscribe request open. */
+ if (m->managed_oom_varlink_request) {
+ (void) varlink_error(m->managed_oom_varlink_request, VARLINK_ERROR_DISCONNECTED, NULL);
+ m->managed_oom_varlink_request = varlink_unref(m->managed_oom_varlink_request);
+ }
+
m->varlink_server = varlink_server_unref(m->varlink_server);
}
diff --git a/src/core/core-varlink.h b/src/core/core-varlink.h
index 89818e2766..0b191ae6c4 100644
--- a/src/core/core-varlink.h
+++ b/src/core/core-varlink.h
@@ -5,3 +5,8 @@
int manager_varlink_init(Manager *m);
void manager_varlink_done(Manager *m);
+
+/* The manager is expected to send an update to systemd-oomd if one of the following occurs:
+ * - The value of ManagedOOM*= properties change
+ * - A unit with ManagedOOM*= properties changes unit active state */
+int manager_varlink_send_managed_oom_update(Unit *u);
diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c
index 9fdd3d83ca..584d974f1a 100644
--- a/src/core/dbus-cgroup.c
+++ b/src/core/dbus-cgroup.c
@@ -8,6 +8,7 @@
#include "bus-get-properties.h"
#include "cgroup-util.h"
#include "cgroup.h"
+#include "core-varlink.h"
#include "dbus-cgroup.h"
#include "dbus-util.h"
#include "errno-util.h"
@@ -19,6 +20,7 @@
BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", TasksMax, tasks_max_resolve);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode);
static int property_get_cgroup_mask(
sd_bus *bus,
@@ -391,6 +393,9 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_PROPERTY("IPIngressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_ingress), 0),
SD_BUS_PROPERTY("IPEgressFilterPath", "as", NULL, offsetof(CGroupContext, ip_filters_egress), 0),
SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0),
+ SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0),
+ SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0),
+ SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimitPercent", "s", bus_property_get_percent, offsetof(CGroupContext, moom_mem_pressure_limit), 0),
SD_BUS_VTABLE_END
};
@@ -1667,6 +1672,45 @@ int bus_cgroup_set_property(
return 1;
}
+ if (STR_IN_SET(name, "ManagedOOMSwap", "ManagedOOMMemoryPressure")) {
+ ManagedOOMMode *cgroup_mode = streq(name, "ManagedOOMSwap") ? &c->moom_swap : &c->moom_mem_pressure;
+ ManagedOOMMode m;
+ const char *mode;
+
+ if (!UNIT_VTABLE(u)->can_set_managed_oom)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name);
+
+ r = sd_bus_message_read(message, "s", &mode);
+ if (r < 0)
+ return r;
+
+ m = managed_oom_mode_from_string(mode);
+ if (m < 0)
+ return -EINVAL;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *cgroup_mode = m;
+ unit_write_settingf(u, flags, name, "%s=%s", name, mode);
+ }
+
+ (void) manager_varlink_send_managed_oom_update(u);
+ return 1;
+ }
+
+ if (streq(name, "ManagedOOMMemoryPressureLimitPercent")) {
+ if (!UNIT_VTABLE(u)->can_set_managed_oom)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name);
+
+ r = bus_set_transient_percent(u, name, &c->moom_mem_pressure_limit, message, flags, error);
+ if (r < 0)
+ return r;
+
+ if (c->moom_mem_pressure == MANAGED_OOM_KILL)
+ (void) manager_varlink_send_managed_oom_update(u);
+
+ return 1;
+ }
+
if (streq(name, "DisableControllers") || (u->transient && u->load_state == UNIT_STUB))
return bus_cgroup_set_transient_property(u, c, name, message, flags, error);
diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c
index 951450e53d..f534001a9c 100644
--- a/src/core/dbus-util.c
+++ b/src/core/dbus-util.c
@@ -91,6 +91,35 @@ int bus_set_transient_bool(
return 1;
}
+int bus_set_transient_percent(
+ Unit *u,
+ const char *name,
+ int *p,
+ sd_bus_message *message,
+ UnitWriteFlags flags,
+ sd_bus_error *error) {
+
+ const char *v;
+ int r;
+
+ assert(p);
+
+ r = sd_bus_message_read(message, "s", &v);
+ if (r < 0)
+ return r;
+
+ r = parse_percent(v);
+ if (r < 0)
+ return r;
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ *p = r;
+ unit_write_settingf(u, flags, name, "%s=%d%%", name, r);
+ }
+
+ return 1;
+}
+
int bus_set_transient_usec_internal(
Unit *u,
const char *name,
diff --git a/src/core/dbus-util.h b/src/core/dbus-util.h
index 654ceb5279..7781a425be 100644
--- a/src/core/dbus-util.h
+++ b/src/core/dbus-util.h
@@ -240,6 +240,7 @@ int bus_set_transient_user_relaxed(Unit *u, const char *name, char **p, sd_bus_m
int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
+int bus_set_transient_percent(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);
static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) {
return bus_set_transient_usec_internal(u, name, p, false, message, flags, error);
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index c60d565eb4..5b148779dc 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -224,6 +224,9 @@ $1.IPAddressAllow, config_parse_ip_address_access, 0,
$1.IPAddressDeny, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_deny)
$1.IPIngressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof($1, cgroup_context.ip_filters_ingress)
$1.IPEgressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof($1, cgroup_context.ip_filters_egress)
+$1.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_swap)
+$1.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_mem_pressure)
+$1.ManagedOOMMemoryPressureLimitPercent,config_parse_managed_oom_mem_pressure_limit,0, offsetof($1, cgroup_context)
$1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0'
)m4_dnl
Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description)
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index df40119175..9361e93051 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -26,6 +26,7 @@
#include "capability-util.h"
#include "cgroup-setup.h"
#include "conf-parser.h"
+#include "core-varlink.h"
#include "cpu-set-util.h"
#include "env-util.h"
#include "errno-list.h"
@@ -3812,6 +3813,86 @@ int config_parse_delegate(
return 0;
}
+int config_parse_managed_oom_mode(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ Unit *u = userdata;
+ ManagedOOMMode *mode = data, m;
+ UnitType t;
+
+ t = unit_name_to_type(unit);
+ assert(t != _UNIT_TYPE_INVALID);
+
+ if (!unit_vtable[t]->can_set_managed_oom)
+ return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue);
+
+ if (isempty(rvalue)) {
+ *mode = MANAGED_OOM_AUTO;
+ goto finish;
+ }
+
+ m = managed_oom_mode_from_string(rvalue);
+ if (m < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid syntax, ignoring: %s", rvalue);
+ return 0;
+ }
+ *mode = m;
+
+finish:
+ (void) manager_varlink_send_managed_oom_update(u);
+ return 0;
+}
+
+int config_parse_managed_oom_mem_pressure_limit(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+ Unit *u = userdata;
+ CGroupContext *c = data;
+ UnitType t;
+ int r;
+
+ t = unit_name_to_type(unit);
+ assert(t != _UNIT_TYPE_INVALID);
+
+ if (!unit_vtable[t]->can_set_managed_oom)
+ return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue);
+
+ if (isempty(rvalue)) {
+ c->moom_mem_pressure_limit = 0;
+ goto finish;
+ }
+
+ r = parse_percent(rvalue);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse limit percent value, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ c->moom_mem_pressure_limit = r;
+
+finish:
+ /* Only update the limit if memory pressure detection is enabled because the information is irrelevant otherwise */
+ if (c->moom_mem_pressure == MANAGED_OOM_KILL)
+ (void) manager_varlink_send_managed_oom_update(u);
+ return 0;
+}
+
int config_parse_device_allow(
const char *unit,
const char *filename,
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
index d67852a74d..fa4c1fb1a0 100644
--- a/src/core/load-fragment.h
+++ b/src/core/load-fragment.h
@@ -76,6 +76,8 @@ CONFIG_PARSER_PROTOTYPE(config_parse_cpu_shares);
CONFIG_PARSER_PROTOTYPE(config_parse_memory_limit);
CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max);
CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit);
CONFIG_PARSER_PROTOTYPE(config_parse_device_policy);
CONFIG_PARSER_PROTOTYPE(config_parse_device_allow);
CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency);
diff --git a/src/core/manager.c b/src/core/manager.c
index 6a8c4bf362..bf3a3c64f0 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -2578,6 +2578,11 @@ static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) {
* We only do this for the cgroup the PID belonged to. */
(void) unit_check_oom(u1);
+ /* This only logs for now. In the future when the interface for kills/notifications
+ * is more stable we can extend service results table similar to how kernel oom kills
+ * are managed. */
+ (void) unit_check_oomd_kill(u1);
+
manager_invoke_sigchld_event(m, u1, &si);
}
if (u2)
diff --git a/src/core/manager.h b/src/core/manager.h
index 9e98b31c4b..073cc74a85 100644
--- a/src/core/manager.h
+++ b/src/core/manager.h
@@ -434,6 +434,8 @@ struct Manager {
bool honor_device_enumeration;
VarlinkServer *varlink_server;
+ /* Only systemd-oomd should be using this to subscribe to changes in ManagedOOM settings */
+ Varlink *managed_oom_varlink_request;
};
static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
diff --git a/src/core/scope.c b/src/core/scope.c
index 42c51b0865..540c83ba45 100644
--- a/src/core/scope.c
+++ b/src/core/scope.c
@@ -621,6 +621,7 @@ const UnitVTable scope_vtable = {
.can_delegate = true,
.can_fail = true,
.once_only = true,
+ .can_set_managed_oom = true,
.init = scope_init,
.load = scope_load,
diff --git a/src/core/service.c b/src/core/service.c
index d23384c475..9d834d4069 100644
--- a/src/core/service.c
+++ b/src/core/service.c
@@ -4533,6 +4533,7 @@ const UnitVTable service_vtable = {
.can_transient = true,
.can_delegate = true,
.can_fail = true,
+ .can_set_managed_oom = true,
.init = service_init,
.done = service_done,
diff --git a/src/core/slice.c b/src/core/slice.c
index 49541aacab..36e5d6a40f 100644
--- a/src/core/slice.c
+++ b/src/core/slice.c
@@ -435,6 +435,7 @@ const UnitVTable slice_vtable = {
.private_section = "Slice",
.can_transient = true,
+ .can_set_managed_oom = true,
.init = slice_init,
.load = slice_load,
diff --git a/src/core/unit.c b/src/core/unit.c
index 1165d4ea8b..fd73ad2949 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -15,6 +15,7 @@
#include "bus-util.h"
#include "cgroup-setup.h"
#include "cgroup-util.h"
+#include "core-varlink.h"
#include "dbus-unit.h"
#include "dbus.h"
#include "dropin.h"
@@ -1573,6 +1574,31 @@ static int unit_add_mount_dependencies(Unit *u) {
return 0;
}
+static int unit_add_oomd_dependencies(Unit *u) {
+ CGroupContext *c;
+ bool wants_oomd;
+ int r;
+
+ assert(u);
+
+ if (!u->default_dependencies)
+ return 0;
+
+ c = unit_get_cgroup_context(u);
+ if (!c)
+ return 0;
+
+ wants_oomd = (c->moom_swap == MANAGED_OOM_KILL || c->moom_mem_pressure == MANAGED_OOM_KILL);
+ if (!wants_oomd)
+ return 0;
+
+ r = unit_add_two_dependencies_by_name(u, UNIT_AFTER, UNIT_WANTS, "systemd-oomd.service", true, UNIT_DEPENDENCY_FILE);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
static int unit_add_startup_units(Unit *u) {
CGroupContext *c;
@@ -1633,6 +1659,10 @@ int unit_load(Unit *u) {
if (r < 0)
goto fail;
+ r = unit_add_oomd_dependencies(u);
+ if (r < 0)
+ goto fail;
+
r = unit_add_startup_units(u);
if (r < 0)
goto fail;
@@ -2592,6 +2622,18 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, UnitNotifyFlag
* the bus queue, so that any job change signal queued will force out the unit change signal first. */
unit_add_to_dbus_queue(u);
+ /* Update systemd-oomd on the property/state change */
+ if (os != ns) {
+ /* Always send an update if the unit is going into an inactive state so systemd-oomd knows to stop
+ * monitoring.
+ * Also send an update whenever the unit goes active; this is to handle a case where an override file
+ * sets one of the ManagedOOM*= properties to "kill", then later removes it. systemd-oomd needs to
+ * know to stop monitoring when the unit changes from "kill" -> "auto" on daemon-reload, but we don't
+ * have the information on the property. Thus, indiscriminately send an update. */
+ if (UNIT_IS_INACTIVE_OR_FAILED(ns) || ns == UNIT_ACTIVE)
+ (void) manager_varlink_send_managed_oom_update(u);
+ }
+
/* Update timestamps for state changes */
if (!MANAGER_IS_RELOADING(m)) {
dual_timestamp_get(&u->state_change_timestamp);
@@ -3558,6 +3600,9 @@ int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) {
if (u->cpu_usage_last != NSEC_INFINITY)
(void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last);
+ if (u->managed_oom_kill_last > 0)
+ (void) serialize_item_format(f, "managed-oom-kill-last", "%" PRIu64, u->managed_oom_kill_last);
+
if (u->oom_kill_last > 0)
(void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last);
@@ -3803,6 +3848,14 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) {
continue;
+ } else if (streq(l, "managed-oom-kill-last")) {
+
+ r = safe_atou64(v, &u->managed_oom_kill_last);
+ if (r < 0)
+ log_unit_debug(u, "Failed to read managed OOM kill last %s, ignoring.", v);
+
+ continue;
+
} else if (streq(l, "oom-kill-last")) {
r = safe_atou64(v, &u->oom_kill_last);
diff --git a/src/core/unit.h b/src/core/unit.h
index 35873d57bc..1e6d7ccf6b 100644
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -260,7 +260,10 @@ typedef struct Unit {
nsec_t cpu_usage_base;
nsec_t cpu_usage_last; /* the most recently read value */
- /* The current counter of the oom_kill field in the memory.events cgroup attribute */
+ /* The current counter of processes sent SIGKILL by systemd-oomd */
+ uint64_t managed_oom_kill_last;
+
+ /* The current counter of the oom_kill field in the memory.events cgroup attribute */
uint64_t oom_kill_last;
/* Where the io.stat data was at the time the unit was started */
@@ -625,6 +628,9 @@ typedef struct UnitVTable {
/* True if queued jobs of this type should be GC'ed if no other job needs them anymore */
bool gc_jobs:1;
+
+ /* True if systemd-oomd can monitor and act on this unit's recursive children's cgroup(s) */
+ bool can_set_managed_oom:1;
} UnitVTable;
extern const UnitVTable * const unit_vtable[_UNIT_TYPE_MAX];
diff --git a/src/oom/meson.build b/src/oom/meson.build
new file mode 100644
index 0000000000..78c92deff3
--- /dev/null
+++ b/src/oom/meson.build
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: LGPL-2.1+
+
+systemd_oomd_sources = files('''
+ oomd-manager-bus.c
+ oomd-manager-bus.h
+ oomd-manager.c
+ oomd-manager.h
+ oomd-util.c
+ oomd-util.h
+ oomd.c
+'''.split())
+
+oomctl_sources = files('''
+ oomctl.c
+'''.split())
+
+if conf.get('ENABLE_OOMD') == 1
+ tests += [
+ [['src/oom/test-oomd-util.c',
+ 'src/oom/oomd-util.c',
+ 'src/oom/oomd-util.h'],
+ [],
+ []]
+ ]
+
+ install_data('org.freedesktop.oom1.conf',
+ install_dir : dbuspolicydir)
+
+ install_data('org.freedesktop.oom1.service',
+ install_dir : dbussystemservicedir)
+
+ install_data('oomd.conf',
+ install_dir : pkgsysconfdir)
+endif
diff --git a/src/oom/oomctl.c b/src/oom/oomctl.c
new file mode 100644
index 0000000000..01e43d3560
--- /dev/null
+++ b/src/oom/oomctl.c
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include
+#include
+
+#include "bus-error.h"
+#include "copy.h"
+#include "main-func.h"
+#include "pretty-print.h"
+#include "terminal-util.h"
+#include "verbs.h"
+
+static PagerFlags arg_pager_flags = 0;
+
+static int help(int argc, char *argv[], void *userdata) {
+ _cleanup_free_ char *link = NULL;
+ int r;
+
+ (void) pager_open(arg_pager_flags);
+
+ r = terminal_urlify_man("oomctl", "1", &link);
+ if (r < 0)
+ return log_oom();
+
+ printf("%1$s [OPTIONS...] COMMAND ...\n\n"
+ "%2$sManage or inspect the userspace OOM killer.%3$s\n"
+ "\n%4$sCommands:%5$s\n"
+ " dump Output the current state of systemd-oomd\n"
+ "\n%4$sOptions:%5$s\n"
+ " -h --help Show this help\n"
+ " --version Show package version\n"
+ " --no-pager Do not pipe output into a pager\n"
+ "\nSee the %6$s for details.\n"
+ , program_invocation_short_name
+ , ansi_highlight(), ansi_normal()
+ , ansi_underline(), ansi_normal()
+ , link
+ );
+
+ return 0;
+}
+
+static int dump_state(int argc, char *argv[], void *userdata) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+ int fd = -1;
+ int r;
+
+ r = sd_bus_open_system(&bus);
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect system bus: %m");
+
+ (void) pager_open(arg_pager_flags);
+
+ r = sd_bus_call_method(
+ bus,
+ "org.freedesktop.oom1",
+ "/org/freedesktop/oom1",
+ "org.freedesktop.oom1.Manager",
+ "DumpByFileDescriptor",
+ &error,
+ &reply,
+ NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to dump context: %s", bus_error_message(&error, r));
+
+ r = sd_bus_message_read(reply, "h", &fd);
+ if (r < 0)
+ return bus_log_parse_error(r);
+
+ fflush(stdout);
+ return copy_bytes(fd, STDOUT_FILENO, (uint64_t) -1, 0);
+}
+
+static int parse_argv(int argc, char *argv[]) {
+ enum {
+ ARG_VERSION = 0x100,
+ ARG_NO_PAGER,
+ };
+
+ static const struct option options[] = {
+ { "help", no_argument, NULL, 'h' },
+ { "version", no_argument, NULL, ARG_VERSION },
+ { "no-pager", no_argument, NULL, ARG_NO_PAGER },
+ {}
+ };
+
+ int c;
+
+ assert(argc >= 0);
+ assert(argv);
+
+ while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+ switch (c) {
+
+ case 'h':
+ return help(0, NULL, NULL);
+
+ case ARG_VERSION:
+ return version();
+
+ case ARG_NO_PAGER:
+ arg_pager_flags |= PAGER_DISABLE;
+ break;
+
+ case '?':
+ return -EINVAL;
+
+ default:
+ assert_not_reached("Invalid option passed.");
+ }
+
+ return 1;
+}
+
+static int run(int argc, char* argv[]) {
+ static const Verb verbs[] = {
+ { "help", VERB_ANY, VERB_ANY, 0, help },
+ { "dump", VERB_ANY, 1, VERB_DEFAULT, dump_state },
+ {}
+ };
+
+ int r;
+
+ log_show_color(true);
+ log_parse_environment();
+ log_open();
+
+ r = parse_argv(argc, argv);
+ if (r <= 0)
+ return r;
+
+ return dispatch_verb(argc, argv, verbs, NULL);
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/oom/oomd-manager-bus.c b/src/oom/oomd-manager-bus.c
new file mode 100644
index 0000000000..67c5fbf92f
--- /dev/null
+++ b/src/oom/oomd-manager-bus.c
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include
+
+#include "bus-common-errors.h"
+#include "bus-polkit.h"
+#include "fd-util.h"
+#include "oomd-manager-bus.h"
+#include "oomd-manager.h"
+#include "user-util.h"
+
+static int bus_method_dump_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *error) {
+ _cleanup_free_ char *dump = NULL;
+ _cleanup_close_ int fd = -1;
+ Manager *m = userdata;
+ int r;
+
+ assert(message);
+ assert(m);
+
+ r = manager_get_dump_string(m, &dump);
+ if (r < 0)
+ return r;
+
+ fd = acquire_data_fd(dump, strlen(dump), 0);
+ if (fd < 0)
+ return fd;
+
+ return sd_bus_reply_method_return(message, "h", fd);
+}
+
+const sd_bus_vtable manager_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+ SD_BUS_METHOD("DumpByFileDescriptor", NULL, "h", bus_method_dump_by_fd, SD_BUS_VTABLE_UNPRIVILEGED),
+ SD_BUS_VTABLE_END
+};
diff --git a/src/oom/oomd-manager-bus.h b/src/oom/oomd-manager-bus.h
new file mode 100644
index 0000000000..60ccf3b373
--- /dev/null
+++ b/src/oom/oomd-manager-bus.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+#pragma once
+
+#include "sd-bus.h"
+
+typedef struct Manager Manager;
+
+extern const sd_bus_vtable manager_vtable[];
diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c
new file mode 100644
index 0000000000..49b57a86a4
--- /dev/null
+++ b/src/oom/oomd-manager.c
@@ -0,0 +1,549 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include "bus-log-control-api.h"
+#include "bus-util.h"
+#include "bus-polkit.h"
+#include "cgroup-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "oomd-manager-bus.h"
+#include "oomd-manager.h"
+#include "path-util.h"
+
+typedef struct ManagedOOMReply {
+ ManagedOOMMode mode;
+ char *path;
+ char *property;
+ unsigned limit;
+} ManagedOOMReply;
+
+static void managed_oom_reply_destroy(ManagedOOMReply *reply) {
+ assert(reply);
+ free(reply->path);
+ free(reply->property);
+}
+
+static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+ ManagedOOMMode *mode = userdata, m;
+ const char *s;
+
+ assert(mode);
+ assert_se(s = json_variant_string(v));
+
+ m = managed_oom_mode_from_string(s);
+ if (m < 0)
+ return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), "%s is not a valid ManagedOOMMode", s);
+
+ *mode = m;
+ return 0;
+}
+
+static int process_managed_oom_reply(
+ Varlink *link,
+ JsonVariant *parameters,
+ const char *error_id,
+ VarlinkReplyFlags flags,
+ void *userdata) {
+ JsonVariant *c, *cgroups;
+ Manager *m = userdata;
+ int r = 0;
+
+ assert(m);
+
+ static const JsonDispatch dispatch_table[] = {
+ { "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMReply, mode), JSON_MANDATORY },
+ { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, path), JSON_MANDATORY },
+ { "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, property), JSON_MANDATORY },
+ { "limit", JSON_VARIANT_UNSIGNED, json_dispatch_unsigned, offsetof(ManagedOOMReply, limit), 0 },
+ {},
+ };
+
+ if (error_id) {
+ r = -EIO;
+ log_debug("Error getting ManagedOOM cgroups: %s", error_id);
+ goto finish;
+ }
+
+ cgroups = json_variant_by_key(parameters, "cgroups");
+ if (!cgroups) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ /* Skip malformed elements and keep processing in case the others are good */
+ JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
+ _cleanup_(managed_oom_reply_destroy) ManagedOOMReply reply = {};
+ OomdCGroupContext *ctx;
+ Hashmap *monitor_hm;
+ loadavg_t limit;
+ int ret;
+
+ if (!json_variant_is_object(c))
+ continue;
+
+ ret = json_dispatch(c, dispatch_table, NULL, 0, &reply);
+ if (ret == -ENOMEM) {
+ r = ret;
+ goto finish;
+ } else if (ret < 0)
+ continue;
+
+ monitor_hm = streq(reply.property, "ManagedOOMSwap") ?
+ m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
+
+ if (reply.mode == MANAGED_OOM_AUTO) {
+ (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, reply.path));
+ continue;
+ }
+
+ limit = m->default_mem_pressure_limit;
+
+ if (streq(reply.property, "ManagedOOMMemoryPressure")) {
+ if (reply.limit > 100)
+ continue;
+ else if (reply.limit != 0) {
+ ret = store_loadavg_fixed_point((unsigned long) reply.limit, 0, &limit);
+ if (ret < 0)
+ continue;
+ }
+ }
+
+ ret = oomd_insert_cgroup_context(NULL, monitor_hm, reply.path);
+ if (ret == -ENOMEM) {
+ r = ret;
+ goto finish;
+ }
+
+ /* Always update the limit in case it was changed. For non-memory pressure detection the value is
+ * ignored so always updating it here is not a problem. */
+ ctx = hashmap_get(monitor_hm, reply.path);
+ if (ctx)
+ ctx->mem_pressure_limit = limit;
+ }
+
+finish:
+ if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
+ m->varlink = varlink_close_unref(link);
+
+ return r;
+}
+
+/* Fill `new_h` with `path`'s descendent OomdCGroupContexts. Only include descendent cgroups that are possible
+ * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
+ *
+ * This function ignores most errors in order to handle cgroups that may have been cleaned up while populating
+ * the hashmap.
+ *
+ * `new_h` is of the form { key: cgroup paths -> value: OomdCGroupContext } */
+static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
+ _cleanup_free_ char *subpath = NULL;
+ _cleanup_closedir_ DIR *d = NULL;
+ int r;
+
+ assert(new_h);
+ assert(path);
+
+ r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
+ if (r < 0)
+ return r;
+
+ r = cg_read_subgroup(d, &subpath);
+ if (r < 0)
+ return r;
+ else if (r == 0) { /* No subgroups? We're a leaf node */
+ r = oomd_insert_cgroup_context(NULL, new_h, path);
+ return (r == -ENOMEM) ? r : 0;
+ }
+
+ do {
+ _cleanup_free_ char *cg_path = NULL;
+ bool oom_group;
+
+ cg_path = path_join(empty_to_root(path), subpath);
+ if (!cg_path)
+ return -ENOMEM;
+
+ subpath = mfree(subpath);
+
+ r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
+ /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
+ if (r < 0)
+ return (r == -ENOMEM) ? r : 0;
+
+ if (oom_group) {
+ r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
+ if (r == -ENOMEM)
+ return r;
+ } else {
+ r = recursively_get_cgroup_context(new_h, cg_path);
+ if (r == -ENOMEM)
+ return r;
+ }
+ } while ((r = cg_read_subgroup(d, &subpath)) > 0);
+
+ return 0;
+}
+
+static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
+ _cleanup_hashmap_free_ Hashmap *new_base = NULL;
+ OomdCGroupContext *ctx;
+ int r;
+
+ assert(monitored_cgroups);
+
+ new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+ if (!new_base)
+ return -ENOMEM;
+
+ HASHMAP_FOREACH(ctx, *monitored_cgroups) {
+ /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
+ r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
+ if (r == -ENOMEM)
+ return r;
+ }
+
+ hashmap_free(*monitored_cgroups);
+ *monitored_cgroups = TAKE_PTR(new_base);
+
+ return 0;
+}
+
+static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) {
+ _cleanup_hashmap_free_ Hashmap *candidates = NULL;
+ OomdCGroupContext *ctx;
+ int r;
+
+ assert(monitored_cgroups);
+ assert(ret_candidates);
+
+ candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+ if (!candidates)
+ return -ENOMEM;
+
+ HASHMAP_FOREACH(ctx, monitored_cgroups) {
+ r = recursively_get_cgroup_context(candidates, ctx->path);
+ if (r == -ENOMEM)
+ return r;
+ }
+
+ *ret_candidates = TAKE_PTR(candidates);
+
+ return 0;
+}
+
+static int acquire_managed_oom_connect(Manager *m) {
+ _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
+ int r;
+
+ assert(m);
+ assert(m->event);
+
+ r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM);
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM);
+
+ (void) varlink_set_userdata(link, m);
+ (void) varlink_set_description(link, "oomd");
+ (void) varlink_set_relative_timeout(link, USEC_INFINITY);
+
+ r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
+
+ r = varlink_bind_reply(link, process_managed_oom_reply);
+ if (r < 0)
+ return log_error_errno(r, "Failed to bind reply callback: %m");
+
+ r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to observe varlink call: %m");
+
+ m->varlink = TAKE_PTR(link);
+ return 0;
+}
+
+static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
+ _cleanup_set_free_ Set *targets = NULL;
+ Manager *m = userdata;
+ usec_t usec_now;
+ int r;
+
+ assert(s);
+ assert(userdata);
+
+ /* Reset timer */
+ r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
+ if (r < 0)
+ return log_error_errno(r, "Failed to reset event timer");
+
+ r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set relative time for timer");
+
+ /* Reconnect if our connection dropped */
+ if (!m->varlink) {
+ r = acquire_managed_oom_connect(m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquire varlink connection");
+ }
+
+ /* Update the cgroups used for detection/action */
+ r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
+ if (r == -ENOMEM)
+ return log_error_errno(r, "Failed to update monitored swap cgroup contexts");
+
+ r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
+ if (r == -ENOMEM)
+ return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts");
+
+ r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
+ /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM */
+ if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
+ return log_error_errno(r, "Failed to acquire system context");
+
+ /* If we're still recovering from a kill, don't try to kill again yet */
+ if (m->post_action_delay_start > 0) {
+ if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
+ return 0;
+ else
+ m->post_action_delay_start = 0;
+ }
+
+ r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, PRESSURE_DURATION_USEC, &targets);
+ if (r == -ENOMEM)
+ return log_error_errno(r, "Failed to check if memory pressure exceeded limits");
+ else if (r == 1) {
+ /* Check if there was reclaim activity in the last interval. The concern is the following case:
+ * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
+ * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
+ * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
+ * to kill something (it won't help anyways). */
+ if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts)) {
+ _cleanup_hashmap_free_ Hashmap *candidates = NULL;
+ OomdCGroupContext *t;
+
+ r = get_monitored_cgroup_contexts_candidates(m->monitored_mem_pressure_cgroup_contexts, &candidates);
+ if (r == -ENOMEM)
+ return log_error_errno(r, "Failed to get monitored memory pressure cgroup candidates");
+
+ SET_FOREACH(t, targets) {
+ log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64" seconds and there was reclaim activity",
+ t->path, LOAD_INT(t->mem_pressure_limit), PRESSURE_DURATION_USEC / USEC_PER_SEC);
+
+ r = oomd_kill_by_pgscan(candidates, t->path, m->dry_run);
+ if (r == -ENOMEM)
+ return log_error_errno(r, "Failed to kill cgroup processes by pgscan");
+ if (r < 0)
+ log_info("Failed to kill any cgroup(s) under %s based on pressure", t->path);
+ else {
+ /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
+ m->post_action_delay_start = usec_now;
+ return 0;
+ }
+ }
+ }
+ }
+
+ if (oomd_swap_free_below(&m->system_context, (100 - m->swap_used_limit))) {
+ _cleanup_hashmap_free_ Hashmap *candidates = NULL;
+
+ log_notice("Swap used (%"PRIu64") / total (%"PRIu64") is more than %u%%",
+ m->system_context.swap_used, m->system_context.swap_total, m->swap_used_limit);
+
+ r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
+ if (r == -ENOMEM)
+ return log_error_errno(r, "Failed to get monitored swap cgroup candidates");
+
+ r = oomd_kill_by_swap_usage(candidates, m->dry_run);
+ if (r == -ENOMEM)
+ return log_error_errno(r, "Failed to kill cgroup processes by swap usage");
+ if (r < 0)
+ log_info("Failed to kill any cgroup(s) based on swap");
+ else {
+ m->post_action_delay_start = usec_now;
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+static int monitor_cgroup_contexts(Manager *m) {
+ _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+ int r;
+
+ assert(m);
+ assert(m->event);
+
+ r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m);
+ if (r < 0)
+ return r;
+
+ r = sd_event_source_set_exit_on_failure(s, true);
+ if (r < 0)
+ return r;
+
+ r = sd_event_source_set_enabled(s, SD_EVENT_ON);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(s, "oomd-timer");
+
+ m->cgroup_context_event_source = TAKE_PTR(s);
+ return 0;
+}
+
+void manager_free(Manager *m) {
+ assert(m);
+
+ varlink_close_unref(m->varlink);
+ sd_event_source_unref(m->cgroup_context_event_source);
+ sd_event_unref(m->event);
+
+ bus_verify_polkit_async_registry_free(m->polkit_registry);
+ sd_bus_flush_close_unref(m->bus);
+
+ hashmap_free(m->monitored_swap_cgroup_contexts);
+ hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
+
+ free(m);
+}
+
+int manager_new(Manager **ret) {
+ _cleanup_(manager_freep) Manager *m = NULL;
+ int r;
+
+ assert(ret);
+
+ m = new0(Manager, 1);
+ if (!m)
+ return -ENOMEM;
+
+ r = sd_event_default(&m->event);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_set_watchdog(m->event, true);
+
+ r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
+ if (r < 0)
+ return r;
+
+ r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
+ if (r < 0)
+ return r;
+
+ m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+ if (!m->monitored_swap_cgroup_contexts)
+ return -ENOMEM;
+
+ m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
+ if (!m->monitored_mem_pressure_cgroup_contexts)
+ return -ENOMEM;
+
+ *ret = TAKE_PTR(m);
+ return 0;
+}
+
+static int manager_connect_bus(Manager *m) {
+ int r;
+
+ assert(m);
+ assert(!m->bus);
+
+ r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom");
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect to bus: %m");
+
+ r = sd_bus_add_object_vtable(m->bus, NULL, "/org/freedesktop/oom1", "org.freedesktop.oom1.Manager", manager_vtable, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add manager object vtable: %m");
+
+ r = bus_log_control_api_register(m->bus);
+ if (r < 0)
+ return r;
+
+ r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to request name: %m");
+
+ r = sd_bus_attach_event(m->bus, m->event, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach bus to event loop: %m");
+
+ return 0;
+}
+
+int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit) {
+ unsigned long l;
+ int r;
+
+ assert(m);
+
+ m->dry_run = dry_run;
+
+ m->swap_used_limit = swap_used_limit != -1 ? swap_used_limit : DEFAULT_SWAP_USED_LIMIT;
+ assert(m->swap_used_limit <= 100);
+
+ l = mem_pressure_limit != -1 ? mem_pressure_limit : DEFAULT_MEM_PRESSURE_LIMIT;
+ r = store_loadavg_fixed_point(l, 0, &m->default_mem_pressure_limit);
+ if (r < 0)
+ return r;
+
+ r = manager_connect_bus(m);
+ if (r < 0)
+ return r;
+
+ r = acquire_managed_oom_connect(m);
+ if (r < 0)
+ return r;
+
+ r = monitor_cgroup_contexts(m);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int manager_get_dump_string(Manager *m, char **ret) {
+ _cleanup_free_ char *dump = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ OomdCGroupContext *c;
+ size_t size;
+ char *key;
+ int r;
+
+ assert(m);
+ assert(ret);
+
+ f = open_memstream_unlocked(&dump, &size);
+ if (!f)
+ return -errno;
+
+ fprintf(f,
+ "Dry Run: %s\n"
+ "Swap Used Limit: %u%%\n"
+ "Default Memory Pressure Limit: %lu%%\n"
+ "System Context:\n",
+ yes_no(m->dry_run),
+ m->swap_used_limit,
+ LOAD_INT(m->default_mem_pressure_limit));
+ oomd_dump_system_context(&m->system_context, f, "\t");
+
+ fprintf(f, "Swap Monitored CGroups:\n");
+ HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts)
+ oomd_dump_swap_cgroup_context(c, f, "\t");
+
+ fprintf(f, "Memory Pressure Monitored CGroups:\n");
+ HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts)
+ oomd_dump_memory_pressure_cgroup_context(c, f, "\t");
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return r;
+
+ f = safe_fclose(f);
+
+ *ret = TAKE_PTR(dump);
+ return 0;
+}
diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h
new file mode 100644
index 0000000000..b5c249799b
--- /dev/null
+++ b/src/oom/oomd-manager.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+#pragma once
+
+#include "conf-parser.h"
+#include "oomd-util.h"
+#include "sd-event.h"
+#include "varlink.h"
+
+/* Polling interval for monitoring stats */
+#define INTERVAL_USEC (1 * USEC_PER_SEC)
+
+/* Used to weight the averages */
+#define AVERAGE_SIZE_DECAY 4
+
+/* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the
+ * percentage of time all tasks were delayed (i.e. unproductive).
+ * Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in
+ * system.slice are assumed to be less latency sensitive. */
+#define PRESSURE_DURATION_USEC (30 * USEC_PER_SEC)
+#define DEFAULT_MEM_PRESSURE_LIMIT 60
+#define DEFAULT_SWAP_USED_LIMIT 90
+
+#define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC)
+
+typedef struct Manager Manager;
+
+struct Manager {
+ sd_bus *bus;
+ sd_event *event;
+
+ Hashmap *polkit_registry;
+
+ bool dry_run;
+ unsigned swap_used_limit;
+ loadavg_t default_mem_pressure_limit;
+
+ /* k: cgroup paths -> v: OomdCGroupContext
+ * Used to detect when to take action. */
+ Hashmap *monitored_swap_cgroup_contexts;
+ Hashmap *monitored_mem_pressure_cgroup_contexts;
+
+ OomdSystemContext system_context;
+
+ usec_t post_action_delay_start;
+
+ sd_event_source *cgroup_context_event_source;
+
+ Varlink *varlink;
+};
+
+void manager_free(Manager *m);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
+
+int manager_new(Manager **ret);
+
+int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit);
+
+int manager_get_dump_string(Manager *m, char **ret);
+
+CONFIG_PARSER_PROTOTYPE(config_parse_oomd_default);
diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c
new file mode 100644
index 0000000000..6cd4ba4f93
--- /dev/null
+++ b/src/oom/oomd-util.c
@@ -0,0 +1,451 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include
+#include
+
+#include "cgroup-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "oomd-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "procfs-util.h"
+#include "signal-util.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
+ oomd_cgroup_ctx_hash_ops,
+ char,
+ string_hash_func,
+ string_compare_func,
+ OomdCGroupContext,
+ oomd_cgroup_context_free);
+
+static int log_kill(pid_t pid, int sig, void *userdata) {
+ log_debug("oomd attempting to kill " PID_FMT " with %s", pid, signal_to_string(sig));
+ return 0;
+}
+
+static int increment_oomd_xattr(const char *path, const char *xattr, uint64_t num_procs_killed) {
+ _cleanup_free_ char *value = NULL;
+ char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+ uint64_t curr_count = 0;
+ int r;
+
+ assert(path);
+ assert(xattr);
+
+ r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, xattr, &value);
+ if (r < 0 && r != -ENODATA)
+ return r;
+
+ if (!isempty(value)) {
+ r = safe_atou64(value, &curr_count);
+ if (r < 0)
+ return r;
+ }
+
+ if (curr_count > UINT64_MAX - num_procs_killed)
+ return -EOVERFLOW;
+
+ xsprintf(buf, "%"PRIu64, curr_count + num_procs_killed);
+ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, path, xattr, buf, strlen(buf), 0);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) {
+ if (!ctx)
+ return NULL;
+
+ free(ctx->path);
+ return mfree(ctx);
+}
+
+int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
+ _cleanup_set_free_ Set *targets = NULL;
+ OomdCGroupContext *ctx;
+ char *key;
+ int r;
+
+ assert(h);
+ assert(ret);
+
+ targets = set_new(NULL);
+ if (!targets)
+ return -ENOMEM;
+
+ HASHMAP_FOREACH_KEY(ctx, key, h) {
+ if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) {
+ usec_t diff;
+
+ if (ctx->last_hit_mem_pressure_limit == 0)
+ ctx->last_hit_mem_pressure_limit = now(CLOCK_MONOTONIC);
+
+ diff = now(CLOCK_MONOTONIC) - ctx->last_hit_mem_pressure_limit;
+ if (diff >= duration) {
+ r = set_put(targets, ctx);
+ if (r < 0)
+ return -ENOMEM;
+ }
+ } else
+ ctx->last_hit_mem_pressure_limit = 0;
+ }
+
+ if (!set_isempty(targets)) {
+ *ret = TAKE_PTR(targets);
+ return 1;
+ }
+
+ *ret = NULL;
+ return 0;
+}
+
+bool oomd_memory_reclaim(Hashmap *h) {
+ uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0;
+ OomdCGroupContext *ctx;
+
+ assert(h);
+
+ /* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values,
+ * there was reclaim activity. Used along with pressure checks to decide whether to take action. */
+
+ HASHMAP_FOREACH(ctx, h) {
+ uint64_t sum;
+
+ sum = pgscan + ctx->pgscan;
+ if (sum < pgscan || sum < ctx->pgscan)
+ pgscan_of++; /* count overflows */
+ pgscan = sum;
+
+ sum = last_pgscan + ctx->last_pgscan;
+ if (sum < last_pgscan || sum < ctx->last_pgscan)
+ last_pgscan_of++; /* count overflows */
+ last_pgscan = sum;
+ }
+
+ /* overflow counts are the same, return sums comparison */
+ if (last_pgscan_of == pgscan_of)
+ return pgscan > last_pgscan;
+
+ return pgscan_of > last_pgscan_of;
+}
+
+bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent) {
+ uint64_t swap_threshold;
+
+ assert(ctx);
+ assert(threshold_percent <= 100);
+
+ swap_threshold = ctx->swap_total * threshold_percent / ((uint64_t) 100);
+ return (ctx->swap_total - ctx->swap_used) < swap_threshold;
+}
+
+int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret) {
+ _cleanup_free_ OomdCGroupContext **sorted = NULL;
+ OomdCGroupContext *item;
+ size_t k = 0;
+
+ assert(h);
+ assert(compare_func);
+ assert(ret);
+
+ sorted = new0(OomdCGroupContext*, hashmap_size(h));
+ if (!sorted)
+ return -ENOMEM;
+
+ HASHMAP_FOREACH(item, h) {
+ if (item->path && prefix && !path_startswith(item->path, prefix))
+ continue;
+
+ sorted[k++] = item;
+ }
+
+ typesafe_qsort(sorted, k, compare_func);
+
+ *ret = TAKE_PTR(sorted);
+
+ assert(k <= INT_MAX);
+ return (int) k;
+}
+
+int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) {
+ _cleanup_set_free_ Set *pids_killed = NULL;
+ int r;
+
+ assert(path);
+
+ if (dry_run) {
+ _cleanup_free_ char *cg_path = NULL;
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &cg_path);
+ if (r < 0)
+ return r;
+
+ log_debug("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse));
+ return 0;
+ }
+
+ pids_killed = set_new(NULL);
+ if (!pids_killed)
+ return -ENOMEM;
+
+ if (recurse)
+ r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
+ else
+ r = cg_kill(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
+ if (r < 0)
+ return r;
+
+ r = increment_oomd_xattr(path, "user.systemd_oomd_kill", set_size(pids_killed));
+ if (r < 0)
+ log_debug_errno(r, "Failed to set user.systemd_oomd_kill on kill: %m");
+
+ return set_size(pids_killed) != 0;
+}
+
+int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) {
+ _cleanup_free_ OomdCGroupContext **sorted = NULL;
+ int r;
+
+ assert(h);
+
+ r = oomd_sort_cgroup_contexts(h, compare_pgscan, prefix, &sorted);
+ if (r < 0)
+ return r;
+
+ for (int i = 0; i < r; i++) {
+ if (sorted[i]->pgscan == 0)
+ break;
+
+ r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
+ if (r > 0 || r == -ENOMEM)
+ break;
+ }
+
+ return r;
+}
+
+int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) {
+ _cleanup_free_ OomdCGroupContext **sorted = NULL;
+ int r;
+
+ assert(h);
+
+ r = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted);
+ if (r < 0)
+ return r;
+
+ /* Try to kill cgroups with non-zero swap usage until we either succeed in
+ * killing or we get to a cgroup with no swap usage. */
+ for (int i = 0; i < r; i++) {
+ if (sorted[i]->swap_usage == 0)
+ break;
+
+ r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
+ if (r > 0 || r == -ENOMEM)
+ break;
+ }
+
+ return r;
+}
+
+int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
+ _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
+ _cleanup_free_ char *p = NULL, *val = NULL;
+ bool is_root;
+ int r;
+
+ assert(path);
+ assert(ret);
+
+ ctx = new0(OomdCGroupContext, 1);
+ if (!ctx)
+ return -ENOMEM;
+
+ is_root = empty_or_root(path);
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting cgroup memory pressure path from %s: %m", path);
+
+ r = read_resource_pressure(p, PRESSURE_TYPE_FULL, &ctx->memory_pressure);
+ if (r < 0)
+ return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p);
+
+ if (is_root) {
+ r = procfs_memory_get_used(&ctx->current_memory_usage);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting memory used from procfs: %m");
+ } else {
+ r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.current", &ctx->current_memory_usage);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting memory.current from %s: %m", path);
+
+ r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.min", &ctx->memory_min);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting memory.min from %s: %m", path);
+
+ r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.low", &ctx->memory_low);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting memory.low from %s: %m", path);
+
+ r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.swap.current", &ctx->swap_usage);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting memory.swap.current from %s: %m", path);
+
+ r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, path, "memory.stat", STRV_MAKE("pgscan"), &val);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting pgscan from memory.stat under %s: %m", path);
+
+ r = safe_atou64(val, &ctx->pgscan);
+ if (r < 0)
+ return log_debug_errno(r, "Error converting pgscan value to uint64_t: %m");
+ }
+
+ ctx->path = strdup(empty_to_root(path));
+ if (!ctx->path)
+ return -ENOMEM;
+
+ *ret = TAKE_PTR(ctx);
+ return 0;
+}
+
+int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret) {
+ _cleanup_fclose_ FILE *f = NULL;
+ OomdSystemContext ctx = {};
+ int r;
+
+ assert(proc_swaps_path);
+ assert(ret);
+
+ f = fopen(proc_swaps_path, "re");
+ if (!f)
+ return -errno;
+
+ (void) fscanf(f, "%*s %*s %*s %*s %*s\n");
+
+ for (;;) {
+ uint64_t total, used;
+
+ r = fscanf(f,
+ "%*s " /* device/file */
+ "%*s " /* type of swap */
+ "%" PRIu64 " " /* swap size */
+ "%" PRIu64 " " /* used */
+ "%*s\n", /* priority */
+ &total, &used);
+
+ if (r == EOF && feof(f))
+ break;
+
+ if (r != 2) {
+ if (ferror(f))
+ return log_debug_errno(errno, "Error reading from %s: %m", proc_swaps_path);
+
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to parse values from %s: %m", proc_swaps_path);
+ }
+
+ ctx.swap_total += total * 1024U;
+ ctx.swap_used += used * 1024U;
+ }
+
+ *ret = ctx;
+ return 0;
+}
+
+int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) {
+ _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *curr_ctx = NULL;
+ OomdCGroupContext *old_ctx, *ctx;
+ int r;
+
+ assert(new_h);
+ assert(path);
+
+ r = oomd_cgroup_context_acquire(path, &curr_ctx);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get OomdCGroupContext for %s: %m", path);
+
+ old_ctx = hashmap_get(old_h, path);
+ if (old_ctx) {
+ curr_ctx->last_pgscan = old_ctx->pgscan;
+ curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
+ curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
+ }
+
+ ctx = TAKE_PTR(curr_ctx);
+ r = hashmap_put(new_h, ctx->path, ctx);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) {
+ char swap[FORMAT_BYTES_MAX];
+
+ assert(ctx);
+ assert(f);
+
+ if (!empty_or_root(ctx->path))
+ fprintf(f,
+ "%sPath: %s\n"
+ "%s\tSwap Usage: %s\n",
+ strempty(prefix), ctx->path,
+ strempty(prefix), format_bytes(swap, sizeof(swap), ctx->swap_usage));
+ else
+ fprintf(f,
+ "%sPath: %s\n"
+ "%s\tSwap Usage: (see System Context)\n",
+ strempty(prefix), ctx->path,
+ strempty(prefix));
+}
+
+void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) {
+ char tbuf[FORMAT_TIMESPAN_MAX], mem_use[FORMAT_BYTES_MAX];
+ char mem_min[FORMAT_BYTES_MAX], mem_low[FORMAT_BYTES_MAX];
+
+ assert(ctx);
+ assert(f);
+
+ fprintf(f,
+ "%sPath: %s\n"
+ "%s\tMemory Pressure Limit: %lu%%\n"
+ "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n"
+ "%s\tCurrent Memory Usage: %s\n",
+ strempty(prefix), ctx->path,
+ strempty(prefix), LOAD_INT(ctx->mem_pressure_limit),
+ strempty(prefix),
+ LOAD_INT(ctx->memory_pressure.avg10), LOAD_FRAC(ctx->memory_pressure.avg10),
+ LOAD_INT(ctx->memory_pressure.avg60), LOAD_FRAC(ctx->memory_pressure.avg60),
+ LOAD_INT(ctx->memory_pressure.avg300), LOAD_FRAC(ctx->memory_pressure.avg300),
+ format_timespan(tbuf, sizeof(tbuf), ctx->memory_pressure.total, USEC_PER_SEC),
+ strempty(prefix), format_bytes(mem_use, sizeof(mem_use), ctx->current_memory_usage));
+
+ if (!empty_or_root(ctx->path))
+ fprintf(f,
+ "%s\tMemory Min: %s\n"
+ "%s\tMemory Low: %s\n"
+ "%s\tPgscan: %" PRIu64 "\n",
+ strempty(prefix), format_bytes_cgroup_protection(mem_min, sizeof(mem_min), ctx->memory_min),
+ strempty(prefix), format_bytes_cgroup_protection(mem_low, sizeof(mem_low), ctx->memory_low),
+ strempty(prefix), ctx->pgscan);
+}
+
+void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix) {
+ char used[FORMAT_BYTES_MAX], total[FORMAT_BYTES_MAX];
+
+ assert(ctx);
+ assert(f);
+
+ fprintf(f,
+ "%sSwap: Used: %s Total: %s\n",
+ strempty(prefix),
+ format_bytes(used, sizeof(used), ctx->swap_used),
+ format_bytes(total, sizeof(total), ctx->swap_total));
+}
diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h
new file mode 100644
index 0000000000..cfd717a018
--- /dev/null
+++ b/src/oom/oomd-util.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+#pragma once
+
+#include
+
+#include "hashmap.h"
+#include "psi-util.h"
+
+#define GROWING_SIZE_PERCENTILE 80
+
+extern const struct hash_ops oomd_cgroup_ctx_hash_ops;
+
+typedef struct OomdCGroupContext OomdCGroupContext;
+typedef struct OomdSystemContext OomdSystemContext;
+
+typedef int (oomd_compare_t)(OomdCGroupContext * const *, OomdCGroupContext * const *);
+
+struct OomdCGroupContext {
+ char *path;
+
+ ResourcePressure memory_pressure;
+
+ uint64_t current_memory_usage;
+
+ uint64_t memory_min;
+ uint64_t memory_low;
+ uint64_t swap_usage;
+
+ uint64_t last_pgscan;
+ uint64_t pgscan;
+
+ /* These are only used by oomd_pressure_above for acting on high memory pressure. */
+ loadavg_t mem_pressure_limit;
+ usec_t last_hit_mem_pressure_limit;
+};
+
+struct OomdSystemContext {
+ uint64_t swap_total;
+ uint64_t swap_used;
+};
+
+OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx);
+DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free);
+
+/* All hashmaps used with these functions are expected to be of the form
+ * key: cgroup paths -> value: OomdCGroupContext. */
+
+/* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret`
+ * if any of them have exceeded their supplied memory pressure limits for the `duration` length of time.
+ * `last_hit_mem_pressure_limit` is updated accordingly for each entry when the limit is exceeded, and when it returns
+ * below the limit.
+ * Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `duration`.
+ * Returns -ENOMEM for allocation errors. */
+int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
+
+/* Sum up current OomdCGroupContexts' pgscan values and last interval's pgscan values in `h`. Returns true if the
+ * current sum is higher than the last interval's sum (there was some reclaim activity). */
+bool oomd_memory_reclaim(Hashmap *h);
+
+/* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */
+bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent);
+
+static inline int compare_pgscan(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
+ assert(c1);
+ assert(c2);
+
+ if ((*c1)->pgscan > (*c2)->pgscan)
+ return -1;
+ else if ((*c1)->pgscan < (*c2)->pgscan)
+ return 1;
+ else
+ return 0;
+}
+
+static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
+ assert(c1);
+ assert(c2);
+
+ if ((*c1)->swap_usage > (*c2)->swap_usage)
+ return -1;
+ else if ((*c1)->swap_usage < (*c2)->swap_usage)
+ return 1;
+ else
+ return 0;
+}
+
+/* Get an array of OomdCGroupContexts from `h`, qsorted from largest to smallest values according to `compare_func`.
+ * If `prefix` is not NULL, only include OomdCGroupContexts whose paths start with prefix. Otherwise all paths are sorted.
+ * Returns the number of sorted items; negative on error. */
+int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret);
+
+/* Returns a negative value on error, 0 if no processes were killed, or 1 if processes were killed. */
+int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run);
+
+/* The following oomd_kill_by_* functions return 1 if processes were killed, or negative otherwise. */
+/* If `prefix` is supplied, only cgroups whose paths start with `prefix` are eligible candidates. Otherwise,
+ * everything in `h` is a candidate. */
+int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run);
+int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run);
+
+int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret);
+int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret);
+
+/* Get the OomdCGroupContext of `path` and insert it into `new_h`. The key for the inserted context will be `path`.
+ *
+ * `old_h` is used to get data used to calculate prior interval information. `old_h` can be NULL in which case there
+ * was no prior data to reference. */
+int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path);
+
+void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
+void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
+void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix);
diff --git a/src/oom/oomd.c b/src/oom/oomd.c
new file mode 100644
index 0000000000..0b611efd57
--- /dev/null
+++ b/src/oom/oomd.c
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include
+
+#include "cgroup-util.h"
+#include "conf-parser.h"
+#include "daemon-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "oomd-manager.h"
+#include "parse-util.h"
+#include "pretty-print.c"
+#include "psi-util.h"
+#include "signal-util.h"
+
+static bool arg_dry_run = false;
+static int arg_swap_used_limit = -1;
+static int arg_mem_pressure_limit = -1;
+
+static int parse_config(void) {
+ static const ConfigTableItem items[] = {
+ { "OOM", "SwapUsedLimitPercent", config_parse_percent, 0, &arg_swap_used_limit },
+ { "OOM", "DefaultMemoryPressureLimitPercent", config_parse_percent, 0, &arg_mem_pressure_limit },
+ {}
+ };
+
+ return config_parse_many_nulstr(PKGSYSCONFDIR "/oomd.conf",
+ CONF_PATHS_NULSTR("systemd/oomd.conf.d"),
+ "OOM\0",
+ config_item_table_lookup,
+ items,
+ CONFIG_PARSE_WARN,
+ NULL,
+ NULL);
+}
+
+static int help(void) {
+ _cleanup_free_ char *link = NULL;
+ int r;
+
+ r = terminal_urlify_man("systemd-oomd", "1", &link);
+ if (r < 0)
+ return log_oom();
+
+ printf("%s [OPTIONS...]\n\n"
+ "Run the userspace out-of-memory (OOM) killer.\n\n"
+ " -h --help Show this help\n"
+ " --dry-run Log write/destructive actions instead of doing them\n"
+ "\nSee the %s for details.\n"
+ , program_invocation_short_name
+ , link
+ );
+
+ return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) {
+ enum {
+ ARG_DRY_RUN,
+ };
+
+ static const struct option options[] = {
+ { "help", no_argument, NULL, 'h' },
+ { "dry-run", no_argument, NULL, ARG_DRY_RUN },
+ {}
+ };
+
+ int c;
+
+ assert(argc >= 0);
+ assert(argv);
+
+ while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0)
+
+ switch (c) {
+
+ case 'h':
+ return help();
+
+ case ARG_DRY_RUN:
+ arg_dry_run = true;
+ break;
+
+ case '?':
+ return -EINVAL;
+
+ default:
+ assert_not_reached("Invalid option passed.");
+ }
+
+ return 1;
+}
+
+static int run(int argc, char *argv[]) {
+ _cleanup_(notify_on_cleanup) const char *notify_msg = NULL;
+ _cleanup_(manager_freep) Manager *m = NULL;
+ int r;
+
+ log_setup_service();
+
+ r = parse_argv(argc, argv);
+ if (r <= 0)
+ return r;
+
+ r = parse_config();
+ if (r < 0)
+ return r;
+
+ /* Do some basic requirement checks for running systemd-oomd. It's not exhaustive as some of the other
+ * requirements do not have a reliable means to check for in code. */
+ if (access("/proc/swaps", F_OK) < 0)
+ return log_error_errno(errno, "Swap not enabled: %m");
+
+ if (!is_pressure_supported())
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Pressure Stall Information (PSI) is not supported");
+
+ r = cg_all_unified();
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+ if (r == 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires the unified cgroups hierarchy");
+
+ assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0);
+
+ r = manager_new(&m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create manager: %m");
+
+ r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit);
+ if (r < 0)
+ return log_error_errno(r, "Failed to start up daemon: %m");
+
+ notify_msg = notify_start(NOTIFY_READY, NOTIFY_STOPPING);
+
+ log_info("systemd-oomd starting%s!", arg_dry_run ? " in dry run mode" : "");
+
+ r = sd_event_loop(m->event);
+ if (r < 0)
+ return log_error_errno(r, "Event loop failed: %m");
+
+ return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/oom/oomd.conf b/src/oom/oomd.conf
new file mode 100644
index 0000000000..8ac9716961
--- /dev/null
+++ b/src/oom/oomd.conf
@@ -0,0 +1,16 @@
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+#
+# Entries in this file show the compile time defaults.
+# You can change settings by editing this file.
+# Defaults can be restored by simply deleting this file.
+#
+# See oomd.conf(5) for details
+
+[OOM]
+#SwapUsedLimitPercent=90%
+#DefaultMemoryPressureLimitPercent=60%
diff --git a/src/oom/org.freedesktop.oom1.conf b/src/oom/org.freedesktop.oom1.conf
new file mode 100644
index 0000000000..48b526f0aa
--- /dev/null
+++ b/src/oom/org.freedesktop.oom1.conf
@@ -0,0 +1,47 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/oom/org.freedesktop.oom1.service b/src/oom/org.freedesktop.oom1.service
new file mode 100644
index 0000000000..78150716ed
--- /dev/null
+++ b/src/oom/org.freedesktop.oom1.service
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: LGPL-2.1+
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+
+[D-BUS Service]
+Name=org.freedesktop.oom1
+Exec=/bin/false
+User=root
+SystemdService=dbus-org.freedesktop.oom1.service
diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c
new file mode 100644
index 0000000000..5503f8f2e0
--- /dev/null
+++ b/src/oom/test-oomd-util.c
@@ -0,0 +1,348 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include
+
+#include "alloc-util.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "oomd-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tests.h"
+
+static int fork_and_sleep(unsigned sleep_min) {
+ usec_t n, timeout, ts;
+
+ pid_t pid = fork();
+ assert_se(pid >= 0);
+
+ if (pid == 0) {
+ timeout = sleep_min * USEC_PER_MINUTE;
+ ts = now(CLOCK_MONOTONIC);
+ while (true) {
+ n = now(CLOCK_MONOTONIC);
+ if (ts + timeout < n) {
+ log_error("Child timed out waiting to be killed");
+ abort();
+ }
+ sleep(1);
+ }
+ }
+
+ return pid;
+}
+
+static void test_oomd_cgroup_kill(void) {
+ _cleanup_free_ char *cgroup_root = NULL, *cgroup = NULL;
+ int pid[2];
+
+ if (geteuid() != 0)
+ return (void) log_tests_skipped("not root");
+
+ if (cg_all_unified() <= 0)
+ return (void) log_tests_skipped("cgroups are not running in unified mode");
+
+ assert_se(cg_pid_get_path(NULL, 0, &cgroup_root) >= 0);
+
+ /* Create another cgroup below this one for the pids we forked off. We need this to be managed
+ * by the test so that pid1 doesn't delete it before we can read the xattrs. */
+ cgroup = path_join(cgroup_root, "oomdkilltest");
+ assert(cgroup);
+
+ /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities */
+ if (cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "test", 4, 0) == -EPERM)
+ return (void) log_tests_skipped("no permissions to set user xattrs");
+
+ /* Do this twice to also check the increment behavior on the xattrs */
+ for (int i = 0; i < 2; i++) {
+ _cleanup_free_ char *v = NULL;
+ int r;
+
+ for (int j = 0; j < 2; j++) {
+ pid[j] = fork_and_sleep(5);
+ assert_se(cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, cgroup, pid[j]) >= 0);
+ }
+
+ r = oomd_cgroup_kill(cgroup, false /* recurse */, false /* dry run */);
+ if (r <= 0) {
+ log_debug_errno(r, "Failed to kill processes under %s: %m", cgroup);
+ abort();
+ }
+
+ /* Wait a bit since processes may take some time to be cleaned up. */
+ sleep(2);
+ assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, cgroup) == true);
+
+ assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.systemd_oomd_kill", &v) >= 0);
+ assert_se(memcmp(v, i == 0 ? "2" : "4", 2) == 0);
+ }
+}
+
+static void test_oomd_cgroup_context_acquire_and_insert(void) {
+ _cleanup_hashmap_free_ Hashmap *h1 = NULL, *h2 = NULL;
+ _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
+ _cleanup_free_ char *cgroup = NULL;
+ OomdCGroupContext *c1, *c2;
+
+ if (geteuid() != 0)
+ return (void) log_tests_skipped("not root");
+
+ if (!is_pressure_supported())
+ return (void) log_tests_skipped("system does not support pressure");
+
+ if (cg_all_unified() <= 0)
+ return (void) log_tests_skipped("cgroups are not running in unified mode");
+
+ assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0);
+
+ assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
+
+ assert_se(streq(ctx->path, cgroup));
+ assert_se(ctx->memory_pressure.avg10 == 0);
+ assert_se(ctx->memory_pressure.avg60 == 0);
+ assert_se(ctx->memory_pressure.avg300 == 0);
+ assert_se(ctx->memory_pressure.total == 0);
+ assert_se(ctx->current_memory_usage > 0);
+ assert_se(ctx->memory_min == 0);
+ assert_se(ctx->memory_low == 0);
+ assert_se(ctx->swap_usage == 0);
+ assert_se(ctx->last_pgscan == 0);
+ assert_se(ctx->pgscan == 0);
+ ctx = oomd_cgroup_context_free(ctx);
+
+ /* Test the root cgroup */
+ assert_se(oomd_cgroup_context_acquire("", &ctx) == 0);
+ assert_se(streq(ctx->path, "/"));
+ assert_se(ctx->current_memory_usage > 0);
+
+ /* Test hashmap inserts */
+ assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
+ assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == 0);
+ c1 = hashmap_get(h1, cgroup);
+ assert_se(c1);
+
+ /* make sure certain values from h1 get updated in h2 */
+ c1->pgscan = 5555;
+ c1->mem_pressure_limit = 6789;
+ c1->last_hit_mem_pressure_limit = 42;
+ assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
+ assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0);
+ c1 = hashmap_get(h1, cgroup);
+ c2 = hashmap_get(h2, cgroup);
+ assert_se(c1);
+ assert_se(c2);
+ assert_se(c1 != c2);
+ assert_se(c2->last_pgscan == 5555);
+ assert_se(c2->mem_pressure_limit == 6789);
+ assert_se(c2->last_hit_mem_pressure_limit == 42);
+}
+
+static void test_oomd_system_context_acquire(void) {
+ _cleanup_(unlink_tempfilep) char path[] = "/oomdgetsysctxtestXXXXXX";
+ OomdSystemContext ctx;
+
+ if (geteuid() != 0)
+ return (void) log_tests_skipped("not root");
+
+ assert_se(mkstemp(path));
+
+ assert_se(oomd_system_context_acquire("/verylikelynonexistentpath", &ctx) == -ENOENT);
+
+ assert_se(oomd_system_context_acquire(path, &ctx) == 0);
+ assert_se(ctx.swap_total == 0);
+ assert_se(ctx.swap_used == 0);
+
+ assert_se(write_string_file(path, "some\nwords\nacross\nmultiple\nlines", WRITE_STRING_FILE_CREATE) == 0);
+ assert_se(oomd_system_context_acquire(path, &ctx) == 0);
+ assert_se(ctx.swap_total == 0);
+ assert_se(ctx.swap_used == 0);
+
+ assert_se(write_string_file(path, "Filename Type Size Used Priority\n"
+ "/swapvol/swapfile file 18971644 0 -3\n"
+ "/dev/vda2 partition 1999868 993780 -2", WRITE_STRING_FILE_CREATE) == 0);
+ assert_se(oomd_system_context_acquire(path, &ctx) == 0);
+ assert_se(ctx.swap_total == 21474828288);
+ assert_se(ctx.swap_used == 1017630720);
+}
+
+static void test_oomd_pressure_above(void) {
+ _cleanup_hashmap_free_ Hashmap *h1 = NULL, *h2 = NULL;
+ _cleanup_set_free_ Set *t1 = NULL, *t2 = NULL, *t3 = NULL;
+ OomdCGroupContext ctx[2], *c;
+ loadavg_t threshold;
+
+ assert_se(store_loadavg_fixed_point(80, 0, &threshold) == 0);
+
+ /* /herp.slice */
+ assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg10)) == 0);
+ assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg60)) == 0);
+ assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg300)) == 0);
+ ctx[0].mem_pressure_limit = threshold;
+
+ /* /derp.slice */
+ assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg10)) == 0);
+ assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg60)) == 0);
+ assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg300)) == 0);
+ ctx[1].mem_pressure_limit = threshold;
+
+
+ /* High memory pressure */
+ assert_se(h1 = hashmap_new(&string_hash_ops));
+ assert_se(hashmap_put(h1, "/herp.slice", &ctx[0]) >= 0);
+ assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1);
+ assert_se(set_contains(t1, &ctx[0]) == true);
+ assert_se(c = hashmap_get(h1, "/herp.slice"));
+ assert_se(c->last_hit_mem_pressure_limit > 0);
+
+ /* Low memory pressure */
+ assert_se(h2 = hashmap_new(&string_hash_ops));
+ assert_se(hashmap_put(h2, "/derp.slice", &ctx[1]) >= 0);
+ assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0);
+ assert_se(t2 == NULL);
+ assert_se(c = hashmap_get(h2, "/derp.slice"));
+ assert_se(c->last_hit_mem_pressure_limit == 0);
+
+ /* High memory pressure w/ multiple cgroups */
+ assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0);
+ assert_se(oomd_pressure_above(h1, 0 /* duration */, &t3) == 1);
+ assert_se(set_contains(t3, &ctx[0]) == true);
+ assert_se(set_size(t3) == 1);
+ assert_se(c = hashmap_get(h1, "/herp.slice"));
+ assert_se(c->last_hit_mem_pressure_limit > 0);
+ assert_se(c = hashmap_get(h1, "/derp.slice"));
+ assert_se(c->last_hit_mem_pressure_limit == 0);
+}
+
+static void test_oomd_memory_reclaim(void) {
+ _cleanup_hashmap_free_ Hashmap *h1 = NULL;
+ char **paths = STRV_MAKE("/0.slice",
+ "/1.slice",
+ "/2.slice",
+ "/3.slice",
+ "/4.slice");
+
+ OomdCGroupContext ctx[5] = {
+ { .path = paths[0],
+ .last_pgscan = 100,
+ .pgscan = 100 },
+ { .path = paths[1],
+ .last_pgscan = 100,
+ .pgscan = 100 },
+ { .path = paths[2],
+ .last_pgscan = 77,
+ .pgscan = 33 },
+ { .path = paths[3],
+ .last_pgscan = UINT64_MAX,
+ .pgscan = 100 },
+ { .path = paths[4],
+ .last_pgscan = 100,
+ .pgscan = UINT64_MAX },
+ };
+
+ assert_se(h1 = hashmap_new(&string_hash_ops));
+ assert_se(hashmap_put(h1, paths[0], &ctx[0]) >= 0);
+ assert_se(hashmap_put(h1, paths[1], &ctx[1]) >= 0);
+ assert_se(oomd_memory_reclaim(h1) == false);
+
+ assert_se(hashmap_put(h1, paths[2], &ctx[2]) >= 0);
+ assert_se(oomd_memory_reclaim(h1) == false);
+
+ assert_se(hashmap_put(h1, paths[4], &ctx[4]) >= 0);
+ assert_se(oomd_memory_reclaim(h1) == true);
+
+ assert_se(hashmap_put(h1, paths[3], &ctx[3]) >= 0);
+ assert_se(oomd_memory_reclaim(h1) == false);
+}
+
+static void test_oomd_swap_free_below(void) {
+ OomdSystemContext ctx = (OomdSystemContext) {
+ .swap_total = 20971512 * 1024U,
+ .swap_used = 20971440 * 1024U,
+ };
+ assert_se(oomd_swap_free_below(&ctx, 20) == true);
+
+ ctx = (OomdSystemContext) {
+ .swap_total = 20971512 * 1024U,
+ .swap_used = 3310136 * 1024U,
+ };
+ assert_se(oomd_swap_free_below(&ctx, 20) == false);
+}
+
+static void test_oomd_sort_cgroups(void) {
+ _cleanup_hashmap_free_ Hashmap *h = NULL;
+ _cleanup_free_ OomdCGroupContext **sorted_cgroups;
+ char **paths = STRV_MAKE("/herp.slice",
+ "/herp.slice/derp.scope",
+ "/herp.slice/derp.scope/sheep.service",
+ "/zupa.slice");
+
+ OomdCGroupContext ctx[4] = {
+ { .path = paths[0],
+ .swap_usage = 20,
+ .pgscan = 60 },
+ { .path = paths[1],
+ .swap_usage = 60,
+ .pgscan = 40 },
+ { .path = paths[2],
+ .swap_usage = 40,
+ .pgscan = 20 },
+ { .path = paths[3],
+ .swap_usage = 10,
+ .pgscan = 80 },
+ };
+
+ assert_se(h = hashmap_new(&string_hash_ops));
+
+ assert_se(hashmap_put(h, "/herp.slice", &ctx[0]) >= 0);
+ assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0);
+ assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0);
+ assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0);
+
+ assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 4);
+ assert_se(sorted_cgroups[0] == &ctx[1]);
+ assert_se(sorted_cgroups[1] == &ctx[2]);
+ assert_se(sorted_cgroups[2] == &ctx[0]);
+ assert_se(sorted_cgroups[3] == &ctx[3]);
+ sorted_cgroups = mfree(sorted_cgroups);
+
+ assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, NULL, &sorted_cgroups) == 4);
+ assert_se(sorted_cgroups[0] == &ctx[3]);
+ assert_se(sorted_cgroups[1] == &ctx[0]);
+ assert_se(sorted_cgroups[2] == &ctx[1]);
+ assert_se(sorted_cgroups[3] == &ctx[2]);
+ sorted_cgroups = mfree(sorted_cgroups);
+
+ assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
+ assert_se(sorted_cgroups[0] == &ctx[1]);
+ assert_se(sorted_cgroups[1] == &ctx[2]);
+ assert_se(sorted_cgroups[2] == 0);
+ assert_se(sorted_cgroups[3] == 0);
+ sorted_cgroups = mfree(sorted_cgroups);
+}
+
+int main(void) {
+ int r;
+
+ test_setup_logging(LOG_DEBUG);
+
+ test_oomd_system_context_acquire();
+ test_oomd_pressure_above();
+ test_oomd_memory_reclaim();
+ test_oomd_swap_free_below();
+ test_oomd_sort_cgroups();
+
+ /* The following tests operate on live cgroups */
+
+ r = enter_cgroup_root(NULL);
+ if (r < 0)
+ return log_tests_skipped_errno(r, "failed to enter a test cgroup scope");
+
+ test_oomd_cgroup_kill();
+ test_oomd_cgroup_context_acquire_and_insert();
+
+ return 0;
+}
diff --git a/src/shared/bus-get-properties.c b/src/shared/bus-get-properties.c
index 8ad4694046..5a123bb8f3 100644
--- a/src/shared/bus-get-properties.c
+++ b/src/shared/bus-get-properties.c
@@ -2,6 +2,7 @@
#include "bus-get-properties.h"
#include "rlimit-util.h"
+#include "stdio-util.h"
#include "string-util.h"
int bus_property_get_bool(
@@ -54,6 +55,23 @@ int bus_property_get_id128(
return sd_bus_message_append_array(reply, 'y', id->bytes, 16);
}
+int bus_property_get_percent(
+ sd_bus *bus,
+ const char *path,
+ const char *interface,
+ const char *property,
+ sd_bus_message *reply,
+ void *userdata,
+ sd_bus_error *error) {
+
+ char pstr[DECIMAL_STR_MAX(int) + 2];
+ int p = *(int*) userdata;
+
+ xsprintf(pstr, "%d%%", p);
+
+ return sd_bus_message_append_basic(reply, 's', pstr);
+}
+
#if __SIZEOF_SIZE_T__ != 8
int bus_property_get_size(
sd_bus *bus,
diff --git a/src/shared/bus-get-properties.h b/src/shared/bus-get-properties.h
index 81af74309d..f3934a86a2 100644
--- a/src/shared/bus-get-properties.h
+++ b/src/shared/bus-get-properties.h
@@ -8,6 +8,7 @@
int bus_property_get_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
int bus_property_set_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *error);
int bus_property_get_id128(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+int bus_property_get_percent(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
#define bus_property_get_usec ((sd_bus_property_get_t) NULL)
#define bus_property_set_usec ((sd_bus_property_set_t) NULL)
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index 6fe6131292..89e0c5bb95 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -432,7 +432,11 @@ static int bus_append_ip_address_access(sd_bus_message *m, int family, const uni
static int bus_append_cgroup_property(sd_bus_message *m, const char *field, const char *eq) {
int r;
- if (STR_IN_SET(field, "DevicePolicy", "Slice"))
+ if (STR_IN_SET(field, "DevicePolicy",
+ "Slice",
+ "ManagedOOMSwap",
+ "ManagedOOMMemoryPressure",
+ "ManagedOOMMemoryPressureLimitPercent"))
return bus_append_string(m, field, eq);
if (STR_IN_SET(field, "CPUAccounting",
diff --git a/src/shared/conf-parser.c b/src/shared/conf-parser.c
index 02a27e3a88..524f57ff80 100644
--- a/src/shared/conf-parser.c
+++ b/src/shared/conf-parser.c
@@ -1243,3 +1243,5 @@ int config_parse_vlanprotocol(const char* unit,
return 0;
}
+
+DEFINE_CONFIG_PARSE(config_parse_percent, parse_percent, "Failed to parse percent value");
diff --git a/src/shared/conf-parser.h b/src/shared/conf-parser.h
index 57787ea033..2514dcbf48 100644
--- a/src/shared/conf-parser.h
+++ b/src/shared/conf-parser.h
@@ -147,6 +147,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_ip_port);
CONFIG_PARSER_PROTOTYPE(config_parse_mtu);
CONFIG_PARSER_PROTOTYPE(config_parse_rlimit);
CONFIG_PARSER_PROTOTYPE(config_parse_vlanprotocol);
+CONFIG_PARSER_PROTOTYPE(config_parse_percent);
typedef enum Disabled {
DISABLED_CONFIGURATION,
diff --git a/src/shared/meson.build b/src/shared/meson.build
index 3f409584e6..0ed216f1aa 100644
--- a/src/shared/meson.build
+++ b/src/shared/meson.build
@@ -189,6 +189,8 @@ shared_sources = files('''
pkcs11-util.h
pretty-print.c
pretty-print.h
+ psi-util.c
+ psi-util.h
ptyfwd.c
ptyfwd.h
pwquality-util.c
diff --git a/src/shared/psi-util.c b/src/shared/psi-util.c
new file mode 100644
index 0000000000..21e965b04b
--- /dev/null
+++ b/src/shared/psi-util.c
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include
+#include
+
+#include "alloc-util.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "parse-util.h"
+#include "psi-util.h"
+#include "string-util.h"
+#include "stat-util.h"
+#include "strv.h"
+
+int read_resource_pressure(const char *path, PressureType type, ResourcePressure *ret) {
+ _cleanup_free_ char *line = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ unsigned field_filled = 0;
+ ResourcePressure rp = {};
+ const char *t, *cline;
+ char *word;
+ int r;
+
+ assert(path);
+ assert(IN_SET(type, PRESSURE_TYPE_SOME, PRESSURE_TYPE_FULL));
+ assert(ret);
+
+ if (type == PRESSURE_TYPE_SOME)
+ t = "some";
+ else if (type == PRESSURE_TYPE_FULL)
+ t = "full";
+ else
+ return -EINVAL;
+
+ r = fopen_unlocked(path, "re", &f);
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ _cleanup_free_ char *l = NULL;
+ char *w;
+
+ r = read_line(f, LONG_LINE_MAX, &l);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ w = first_word(l, t);
+ if (w) {
+ line = TAKE_PTR(l);
+ cline = w;
+ break;
+ }
+ }
+
+ if (!line)
+ return -ENODATA;
+
+ /* extracts either avgX=Y.Z or total=X */
+ while ((r = extract_first_word(&cline, &word, NULL, 0)) > 0) {
+ _cleanup_free_ char *w = word;
+ const char *v;
+
+ if ((v = startswith(w, "avg10="))) {
+ if (field_filled & (1U << 0))
+ return -EINVAL;
+
+ field_filled |= 1U << 0;
+ r = parse_loadavg_fixed_point(v, &rp.avg10);
+ } else if ((v = startswith(w, "avg60="))) {
+ if (field_filled & (1U << 1))
+ return -EINVAL;
+
+ field_filled |= 1U << 1;
+ r = parse_loadavg_fixed_point(v, &rp.avg60);
+ } else if ((v = startswith(w, "avg300="))) {
+ if (field_filled & (1U << 2))
+ return -EINVAL;
+
+ field_filled |= 1U << 2;
+ r = parse_loadavg_fixed_point(v, &rp.avg300);
+ } else if ((v = startswith(w, "total="))) {
+ if (field_filled & (1U << 3))
+ return -EINVAL;
+
+ field_filled |= 1U << 3;
+ r = safe_atou64(v, &rp.total);
+ } else
+ continue;
+
+ if (r < 0)
+ return r;
+ }
+
+ if (r < 0)
+ return r;
+
+ if (field_filled != 15U)
+ return -EINVAL;
+
+ *ret = rp;
+ return 0;
+}
+
+int is_pressure_supported(void) {
+ const char *p;
+
+ FOREACH_STRING(p, "/proc/pressure/cpu", "/proc/pressure/io", "/proc/pressure/memory")
+ if (access(p, F_OK) < 0) {
+ if (errno == ENOENT)
+ return 0;
+ return -errno;
+ }
+
+ return 1;
+}
diff --git a/src/shared/psi-util.h b/src/shared/psi-util.h
new file mode 100644
index 0000000000..9810dbec6e
--- /dev/null
+++ b/src/shared/psi-util.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+#pragma once
+
+#include
+
+#include "parse-util.h"
+#include "time-util.h"
+
+typedef enum PressureType {
+ PRESSURE_TYPE_SOME,
+ PRESSURE_TYPE_FULL,
+} PressureType;
+
+/* Averages are stored in fixed-point with 11 bit fractions */
+typedef struct ResourcePressure {
+ loadavg_t avg10;
+ loadavg_t avg60;
+ loadavg_t avg300;
+ usec_t total;
+} ResourcePressure;
+
+/** Upstream 4.20+ format
+ *
+ * some avg10=0.22 avg60=0.17 avg300=1.11 total=58761459
+ * full avg10=0.23 avg60=0.16 avg300=1.08 total=58464525
+ */
+int read_resource_pressure(const char *path, PressureType type, ResourcePressure *ret);
+
+/* Was the kernel compiled with CONFIG_PSI=y? 1 if yes, 0 if not, negative on error. */
+int is_pressure_supported(void);
diff --git a/src/shared/tests.c b/src/shared/tests.c
index a5cb486c99..fe6d9dfbd5 100644
--- a/src/shared/tests.c
+++ b/src/shared/tests.c
@@ -254,7 +254,7 @@ static int allocate_scope(void) {
return 0;
}
-int enter_cgroup_subroot(char **ret_cgroup) {
+static int enter_cgroup(char **ret_cgroup, bool enter_subroot) {
_cleanup_free_ char *cgroup_root = NULL, *cgroup_subroot = NULL;
CGroupMask supported;
int r;
@@ -268,7 +268,13 @@ int enter_cgroup_subroot(char **ret_cgroup) {
return log_warning_errno(r, "cg_pid_get_path(NULL, 0, ...) failed: %m");
assert(r >= 0);
- assert_se(asprintf(&cgroup_subroot, "%s/%" PRIx64, cgroup_root, random_u64()) >= 0);
+ if (enter_subroot)
+ assert_se(asprintf(&cgroup_subroot, "%s/%" PRIx64, cgroup_root, random_u64()) >= 0);
+ else {
+ cgroup_subroot = strdup(cgroup_root);
+ assert_se(cgroup_subroot != NULL);
+ }
+
assert_se(cg_mask_supported(&supported) >= 0);
/* If this fails, then we don't mind as the later cgroup operations will fail too, and it's fine if
@@ -287,3 +293,11 @@ int enter_cgroup_subroot(char **ret_cgroup) {
return 0;
}
+
+int enter_cgroup_subroot(char **ret_cgroup) {
+ return enter_cgroup(ret_cgroup, true);
+}
+
+int enter_cgroup_root(char **ret_cgroup) {
+ return enter_cgroup(ret_cgroup, false);
+}
diff --git a/src/shared/tests.h b/src/shared/tests.h
index 6817ef4860..505ca39775 100644
--- a/src/shared/tests.h
+++ b/src/shared/tests.h
@@ -20,6 +20,7 @@ static inline bool manager_errno_skip_test(int r) {
char* setup_fake_runtime_dir(void);
int enter_cgroup_subroot(char **ret_cgroup);
+int enter_cgroup_root(char **ret_cgroup);
int get_testdata_dir(const char *suffix, char **ret);
const char* get_catalog_dir(void);
bool slow_tests_enabled(void);
diff --git a/src/shared/varlink.c b/src/shared/varlink.c
index 86b5f08ae7..fabfe78280 100644
--- a/src/shared/varlink.c
+++ b/src/shared/varlink.c
@@ -418,6 +418,11 @@ static int varlink_test_disconnect(Varlink *v) {
if (IN_SET(v->state, VARLINK_IDLE_CLIENT) && (v->write_disconnected || v->got_pollhup))
goto disconnect;
+ /* The server is still expecting to write more, but its write end is disconnected and it got a POLLHUP
+ * (i.e. from a disconnected client), so disconnect. */
+ if (IN_SET(v->state, VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE) && v->write_disconnected && v->got_pollhup)
+ goto disconnect;
+
return 0;
disconnect:
diff --git a/src/shared/varlink.h b/src/shared/varlink.h
index 06a34b480d..030db39b2f 100644
--- a/src/shared/varlink.h
+++ b/src/shared/varlink.h
@@ -171,3 +171,4 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(VarlinkServer *, varlink_server_unref);
#define VARLINK_ERROR_METHOD_NOT_FOUND "org.varlink.service.MethodNotFound"
#define VARLINK_ERROR_METHOD_NOT_IMPLEMENTED "org.varlink.service.MethodNotImplemented"
#define VARLINK_ERROR_INVALID_PARAMETER "org.varlink.service.InvalidParameter"
+#define VARLINK_ERROR_SUBSCRIPTION_TAKEN "org.varlink.service.SubscriptionTaken"
diff --git a/src/systemd/sd-messages.h b/src/systemd/sd-messages.h
index 05f00ed577..eea8c2c900 100644
--- a/src/systemd/sd-messages.h
+++ b/src/systemd/sd-messages.h
@@ -127,6 +127,9 @@ _SD_BEGIN_DECLARATIONS;
#define SD_MESSAGE_OVERMOUNTING SD_ID128_MAKE(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7)
#define SD_MESSAGE_OVERMOUNTING_STR SD_ID128_MAKE_STR(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7)
+#define SD_MESSAGE_UNIT_OOMD_KILL SD_ID128_MAKE(d9,89,61,1b,15,e4,4c,9d,bf,31,e3,c8,12,56,e4,ed)
+#define SD_MESSAGE_UNIT_OOMD_KILL_STR SD_ID128_MAKE_STR(d9,89,61,1b,15,e4,4c,9d,bf,31,e3,c8,12,56,e4,ed)
+
#define SD_MESSAGE_UNIT_OUT_OF_MEMORY SD_ID128_MAKE(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef)
#define SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR SD_ID128_MAKE_STR(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef)
diff --git a/src/test/meson.build b/src/test/meson.build
index 9bb3499963..60dec9512c 100644
--- a/src/test/meson.build
+++ b/src/test/meson.build
@@ -804,6 +804,10 @@ tests += [
[['src/test/test-local-addresses.c'],
[],
[]],
+
+ [['src/test/test-psi-util.c'],
+ [],
+ []],
]
############################################################
diff --git a/src/test/test-parse-util.c b/src/test/test-parse-util.c
index 3806c3f8cf..d4f908f5d4 100644
--- a/src/test/test-parse-util.c
+++ b/src/test/test-parse-util.c
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: LGPL-2.1+ */
#include
+#include
#include
#include
#include
@@ -929,6 +930,42 @@ static void test_parse_mtu(void) {
assert_se(parse_mtu(AF_UNSPEC, "", &mtu) == -EINVAL);
}
+static void test_parse_loadavg_fixed_point(void) {
+ loadavg_t fp;
+
+ assert_se(parse_loadavg_fixed_point("1.23", &fp) == 0);
+ assert_se(LOAD_INT(fp) == 1);
+ assert_se(LOAD_FRAC(fp) == 23);
+
+ assert_se(parse_loadavg_fixed_point("1.80", &fp) == 0);
+ assert_se(LOAD_INT(fp) == 1);
+ assert_se(LOAD_FRAC(fp) == 80);
+
+ assert_se(parse_loadavg_fixed_point("0.07", &fp) == 0);
+ assert_se(LOAD_INT(fp) == 0);
+ assert_se(LOAD_FRAC(fp) == 7);
+
+ assert_se(parse_loadavg_fixed_point("0.00", &fp) == 0);
+ assert_se(LOAD_INT(fp) == 0);
+ assert_se(LOAD_FRAC(fp) == 0);
+
+ assert_se(parse_loadavg_fixed_point("4096.57", &fp) == 0);
+ assert_se(LOAD_INT(fp) == 4096);
+ assert_se(LOAD_FRAC(fp) == 57);
+
+ /* Caps out at 2 digit fracs */
+ assert_se(parse_loadavg_fixed_point("1.100", &fp) == -ERANGE);
+
+ assert_se(parse_loadavg_fixed_point("4096.4096", &fp) == -ERANGE);
+ assert_se(parse_loadavg_fixed_point("-4000.5", &fp) == -ERANGE);
+ assert_se(parse_loadavg_fixed_point("18446744073709551615.5", &fp) == -ERANGE);
+ assert_se(parse_loadavg_fixed_point("foobar", &fp) == -EINVAL);
+ assert_se(parse_loadavg_fixed_point("3333", &fp) == -EINVAL);
+ assert_se(parse_loadavg_fixed_point("1.2.3", &fp) == -EINVAL);
+ assert_se(parse_loadavg_fixed_point(".", &fp) == -EINVAL);
+ assert_se(parse_loadavg_fixed_point("", &fp) == -EINVAL);
+}
+
int main(int argc, char *argv[]) {
log_parse_environment();
log_open();
@@ -955,6 +992,7 @@ int main(int argc, char *argv[]) {
test_parse_errno();
test_parse_syscall_and_errno();
test_parse_mtu();
+ test_parse_loadavg_fixed_point();
return 0;
}
diff --git a/src/test/test-psi-util.c b/src/test/test-psi-util.c
new file mode 100644
index 0000000000..bde8ef80b1
--- /dev/null
+++ b/src/test/test-psi-util.c
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include
+
+#include "alloc-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "psi-util.h"
+#include "tests.h"
+
+static void test_read_mem_pressure(void) {
+ _cleanup_(unlink_tempfilep) char path[] = "/tmp/pressurereadtestXXXXXX";
+ ResourcePressure rp;
+
+ if (geteuid() != 0)
+ return (void) log_tests_skipped("not root");
+
+ assert_se(mkstemp(path));
+
+ assert_se(read_resource_pressure("/verylikelynonexistentpath", PRESSURE_TYPE_SOME, &rp) < 0);
+ assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0);
+
+ assert_se(write_string_file(path, "herpdederp\n", WRITE_STRING_FILE_CREATE) == 0);
+ assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0);
+
+ /* Pressure file with some invalid values*/
+ assert_se(write_string_file(path, "some avg10=0.22=55 avg60=0.17=8 avg300=1.11=00 total=58761459\n"
+ "full avg10=0.23=55 avg60=0.16=8 avg300=1.08=00 total=58464525", WRITE_STRING_FILE_CREATE) == 0);
+ assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0);
+
+ /* Same pressure valid values as below but with duplicate avg60 field */
+ assert_se(write_string_file(path, "some avg10=0.22 avg60=0.17 avg60=0.18 avg300=1.11 total=58761459\n"
+ "full avg10=0.23 avg60=0.16 avg300=1.08 total=58464525", WRITE_STRING_FILE_CREATE) == 0);
+ assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) < 0);
+
+ assert_se(write_string_file(path, "some avg10=0.22 avg60=0.17 avg300=1.11 total=58761459\n"
+ "full avg10=0.23 avg60=0.16 avg300=1.08 total=58464525", WRITE_STRING_FILE_CREATE) == 0);
+ assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) == 0);
+ assert_se(LOAD_INT(rp.avg10) == 0);
+ assert_se(LOAD_FRAC(rp.avg10) == 22);
+ assert_se(LOAD_INT(rp.avg60) == 0);
+ assert_se(LOAD_FRAC(rp.avg60) == 17);
+ assert_se(LOAD_INT(rp.avg300) == 1);
+ assert_se(LOAD_FRAC(rp.avg300) == 11);
+ assert_se(rp.total == 58761459);
+ assert(read_resource_pressure(path, PRESSURE_TYPE_FULL, &rp) == 0);
+ assert_se(LOAD_INT(rp.avg10) == 0);
+ assert_se(LOAD_FRAC(rp.avg10) == 23);
+ assert_se(LOAD_INT(rp.avg60) == 0);
+ assert_se(LOAD_FRAC(rp.avg60) == 16);
+ assert_se(LOAD_INT(rp.avg300) == 1);
+ assert_se(LOAD_FRAC(rp.avg300) == 8);
+ assert_se(rp.total == 58464525);
+
+ /* Pressure file with extra unsupported fields */
+ assert_se(write_string_file(path, "some avg5=0.55 avg10=0.22 avg60=0.17 avg300=1.11 total=58761459\n"
+ "full avg10=0.23 avg60=0.16 avg300=1.08 avg600=2.00 total=58464525", WRITE_STRING_FILE_CREATE) == 0);
+ assert_se(read_resource_pressure(path, PRESSURE_TYPE_SOME, &rp) == 0);
+ assert_se(LOAD_INT(rp.avg10) == 0);
+ assert_se(LOAD_FRAC(rp.avg10) == 22);
+ assert_se(LOAD_INT(rp.avg60) == 0);
+ assert_se(LOAD_FRAC(rp.avg60) == 17);
+ assert_se(LOAD_INT(rp.avg300) == 1);
+ assert_se(LOAD_FRAC(rp.avg300) == 11);
+ assert_se(rp.total == 58761459);
+ assert(read_resource_pressure(path, PRESSURE_TYPE_FULL, &rp) == 0);
+ assert_se(LOAD_INT(rp.avg10) == 0);
+ assert_se(LOAD_FRAC(rp.avg10) == 23);
+ assert_se(LOAD_INT(rp.avg60) == 0);
+ assert_se(LOAD_FRAC(rp.avg60) == 16);
+ assert_se(LOAD_INT(rp.avg300) == 1);
+ assert_se(LOAD_FRAC(rp.avg300) == 8);
+ assert_se(rp.total == 58464525);
+}
+
+int main(void) {
+ test_setup_logging(LOG_DEBUG);
+ test_read_mem_pressure();
+ return 0;
+}
diff --git a/src/test/test-tables.c b/src/test/test-tables.c
index 59f90b76ec..7273611143 100644
--- a/src/test/test-tables.c
+++ b/src/test/test-tables.c
@@ -3,6 +3,7 @@
#include "architecture.h"
#include "automount.h"
#include "cgroup.h"
+#include "cgroup-util.h"
#include "compress.h"
#include "condition.h"
#include "device-private.h"
@@ -71,6 +72,7 @@ int main(int argc, char **argv) {
test_table(locale_variable, VARIABLE_LC);
test_table(log_target, LOG_TARGET);
test_table(mac_address_policy, MAC_ADDRESS_POLICY);
+ test_table(managed_oom_mode, MANAGED_OOM_MODE);
test_table(manager_state, MANAGER_STATE);
test_table(manager_timestamp, MANAGER_TIMESTAMP);
test_table(mount_exec_command, MOUNT_EXEC_COMMAND);
diff --git a/sysusers.d/systemd.conf.m4 b/sysusers.d/systemd.conf.m4
index ef5a3cb619..fdfdcf553c 100644
--- a/sysusers.d/systemd.conf.m4
+++ b/sysusers.d/systemd.conf.m4
@@ -9,6 +9,9 @@ g systemd-journal - -
m4_ifdef(`ENABLE_NETWORKD',
u systemd-network - "systemd Network Management"
)m4_dnl
+m4_ifdef(`ENABLE_OOMD',
+u systemd-oom - "systemd Userspace OOM Killer"
+)m4_dnl
m4_ifdef(`ENABLE_RESOLVE',
u systemd-resolve - "systemd Resolver"
)m4_dnl
diff --git a/units/meson.build b/units/meson.build
index 08c39c99b3..e94e7f7efd 100644
--- a/units/meson.build
+++ b/units/meson.build
@@ -201,6 +201,7 @@ in_units = [
['systemd-networkd.service', 'ENABLE_NETWORKD'],
['systemd-networkd-wait-online.service', 'ENABLE_NETWORKD'],
['systemd-nspawn@.service', ''],
+ ['systemd-oomd.service', 'ENABLE_OOMD'],
['systemd-portabled.service', 'ENABLE_PORTABLED',
'dbus-org.freedesktop.portable1.service'],
['systemd-userdbd.service', 'ENABLE_USERDB'],
diff --git a/units/systemd-oomd.service.in b/units/systemd-oomd.service.in
new file mode 100644
index 0000000000..a270a0ed78
--- /dev/null
+++ b/units/systemd-oomd.service.in
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: LGPL-2.1+
+#
+# This file is part of systemd.
+#
+# systemd is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=Userspace Out-Of-Memory (OOM) Killer
+Documentation=man:systemd-oomd.service(8)
+ConditionCapability=CAP_KILL
+DefaultDependencies=no
+Before=multi-user.target shutdown.target
+Conflicts=shutdown.target
+
+[Service]
+AmbientCapabilities=CAP_KILL CAP_DAC_OVERRIDE
+BusName=org.freedesktop.oom1
+CapabilityBoundingSet=CAP_KILL CAP_DAC_OVERRIDE
+ExecStart=@rootlibexecdir@/systemd-oomd
+IPAddressDeny=any
+LockPersonality=yes
+MemoryDenyWriteExecute=yes
+# Reserve some minimum amount of memory so that systemd-oomd can continue to
+# run in resource starved scenarios.
+MemoryMin=64M
+MemoryLow=64M
+NoNewPrivileges=yes
+OOMScoreAdjust=-900
+PrivateDevices=yes
+PrivateTmp=yes
+ProtectClock=yes
+ProtectHome=yes
+ProtectHostname=yes
+ProtectKernelLogs=yes
+ProtectKernelModules=yes
+ProtectKernelTunables=yes
+ProtectSystem=strict
+Restart=on-failure
+RestrictAddressFamilies=AF_UNIX
+RestrictNamespaces=yes
+RestrictRealtime=yes
+RestrictSUIDSGID=yes
+SystemCallArchitectures=native
+SystemCallErrorNumber=EPERM
+SystemCallFilter=@system-service
+Type=notify
+User=systemd-oom
+@SERVICE_WATCHDOG@
+
+[Install]
+WantedBy=multi-user.target
+Alias=dbus-org.freedesktop.oom1.service