Merge pull request #12037 from poettering/oom-state

add cgroupv2 oom killer event handling to service management
2019-04-11 18:57:47 +02:00 · 2019-04-11 18:57:47 +02:00 · 54f802ff8a
parent 3b21fd06ed 7d1d177b8d
commit 54f802ff8a
25 changed files with 528 additions and 57 deletions
--- a/4
+++ b/4
@ -62,8 +62,6 @@ Features:

 * bootctl,sd-boot: actually honour the "architecture" key

-* set memory.oom.group in cgroup v2 for all leaf cgroups (kernel v4.19+)
-
 * add a new syscall group "@esoteric" for more esoteric stuff such as bpf() and
  usefaultd() and make systemd-analyze check for it.

@ -71,8 +69,6 @@ Features:
  first. i.e. look for all places we use string_erase()/string_free_erase() and
  augment them with mlock(). Also use MADV_DONTDUMP

-* whenever oom_kill memory.event event is triggered print a nice log message
-
 * Move RestrictAddressFamily= to the new cgroup create socket

 * support the bind/connect/sendmsg cgroup stuff for sandboxing, and possibly
--- a/catalog/systemd.catalog.in
+++ b/catalog/systemd.catalog.in
@ -391,3 +391,16 @@ The following "tags" are possible:
 - "overflowgid-not-65534" — the kernel group ID used for "unknown" users (with
  NFS or user namespaces) is not 65534
 Current system is tagged as @TAINT@.
+
+-- fe6faa94e7774663a0da52717891d8ef
+Subject: A process of @UNIT@ unit has been killed by the OOM killer.
+Defined-By: systemd
+Support: %SUPPORT_URL%
+
+A process of unit @UNIT@ has been killed by the Linux kernel out-of-memory (OOM)
+killer logic. This usually indicates that the system is low on memory and that
+memory needed to be freed. A process associated with @UNIT@ has been determined
+as the best process to terminate and has been forcibly terminated by the
+kernel.
+
+Note that the memory pressure might or might not have been caused by @UNIT@.
--- a/man/systemd-system.conf.xml
+++ b/man/systemd-system.conf.xml
@ -364,6 +364,17 @@
        limits are only defaults for units, they are not applied to PID 1
        itself.</para></listitem>
      </varlistentry>
+
+      <varlistentry>
+        <term><varname>DefaultOOMPolicy=</varname></term>
+
+        <listitem><para>Configure the default policy for reacting to processes being killed by the Linux
+        Out-Of-Memory (OOM) killer. This may be used to pick a global default for the per-unit
+        <varname>OOMPolicy=</varname> setting. See
+        <citerefentry><refentrytitle>systemd.service</refentrytitle><manvolnum>5</manvolnum></citerefentry>
+        for details. Note that this default is not used for services that have <varname>Delegate=</varname>
+        turned on.</para></listitem>
+      </varlistentry>
    </variablelist>
  </refsect1>

--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@ -651,11 +651,17 @@ CapabilityBoundingSet=~CAP_B CAP_C</programlisting>
      <varlistentry>
        <term><varname>OOMScoreAdjust=</varname></term>

-        <listitem><para>Sets the adjustment level for the Out-Of-Memory killer for executed processes. Takes an integer
-        between -1000 (to disable OOM killing for this process) and 1000 (to make killing of this process under memory
-        pressure very likely). See <ulink
-        url="https://www.kernel.org/doc/Documentation/filesystems/proc.txt">proc.txt</ulink> for
-        details.</para></listitem>
+        <listitem><para>Sets the adjustment value for the Linux kernel's Out-Of-Memory (OOM) killer score for
+        executed processes. Takes an integer between -1000 (to disable OOM killing of processes of this unit)
+        and 1000 (to make killing of processes of this unit under memory pressure very likely). See <ulink
+        url="https://www.kernel.org/doc/Documentation/filesystems/proc.txt">proc.txt</ulink> for details. If
+        not specified defaults to the OOM score adjustment level of the service manager itself, which is
+        normally at 0.</para>
+
+        <para>Use the <varname>OOMPolicy=</varname> setting of service units to configure how the service
+        manager shall react to the kernel OOM killer terminating a process of the service.  See
+        <citerefentry><refentrytitle>systemd.service</refentrytitle><manvolnum>5</manvolnum></citerefentry>
+        for details.</para></listitem>
      </varlistentry>

      <varlistentry>
--- a/man/systemd.service.xml
+++ b/man/systemd.service.xml
@ -963,6 +963,29 @@
        above.</para></listitem>
      </varlistentry>

+      <varlistentry>
+        <term><varname>OOMPolicy=</varname></term>
+
+        <listitem><para>Configure the Out-Of-Memory (OOM) killer policy. On Linux, when memory becomes scarce
+        the kernel might decide to kill a running process in order to free up memory and reduce memory
+        pressure. This setting takes one of <constant>continue</constant>, <constant>stop</constant> or
+        <constant>kill</constant>. If set to <constant>continue</constant> and a process of the service is
+        killed by the kernel's OOM killer this is logged but the service continues running. If set to
+        <constant>stop</constant> the event is logged but the service is terminated cleanly by the service
+        manager. If set to <constant>kill</constant> and one of the service's processes is killed by the OOM
+        killer the kernel is instructed to kill all remaining processes of the service, too. Defaults to the
+        value the <varname>DefaultOOMPolicy=</varname> setting in
+        <citerefentry><refentrytitle>systemd-system.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry> is
+        set to, except for services where <varname>Delegate=</varname> is turned on, where it defaults to
+        <constant>continue</constant>.</para>
+
+        <para>Use the <varname>OOMScoreAdjust=</varname> setting to configure whether processes of the unit
+        shall be considered preferred or less preferred candidates for process termination by the Linux OOM
+        killer logic. See
+        <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry> for
+        details.</para></listitem>
+      </varlistentry>
+
    </variablelist>

    <para>Check
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@ -3,6 +3,8 @@
 #include <fcntl.h>
 #include <fnmatch.h>

+#include "sd-messages.h"
+
 #include "alloc-util.h"
 #include "blockdev-util.h"
 #include "bpf-devices.h"
@ -1141,6 +1143,8 @@ static void cgroup_context_apply(
                        cgroup_apply_unified_memory_limit(u, "memory.max", max);
                        cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);

+                        (void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
+
                } else {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
                        uint64_t val;
@ -1568,15 +1572,14 @@ int unit_set_cgroup_path(Unit *u, const char *path) {

        assert(u);

+        if (streq_ptr(u->cgroup_path, path))
+                return 0;
+
        if (path) {
                p = strdup(path);
                if (!p)
                        return -ENOMEM;
-        } else
-                p = NULL;
-
-        if (streq_ptr(u->cgroup_path, p))
-                return 0;
+        }

        if (p) {
                r = hashmap_put(u->manager->cgroup_unit, p, u);
@ -1585,7 +1588,6 @@ int unit_set_cgroup_path(Unit *u, const char *path) {
        }

        unit_release_cgroup(u);
-
        u->cgroup_path = TAKE_PTR(p);

        return 1;
@ -1597,10 +1599,13 @@ int unit_watch_cgroup(Unit *u) {

        assert(u);

+        /* Watches the "cgroup.events" attribute of this unit's cgroup for "empty" events, but only if
+         * cgroupv2 is available. */
+
        if (!u->cgroup_path)
                return 0;

-        if (u->cgroup_inotify_wd >= 0)
+        if (u->cgroup_control_inotify_wd >= 0)
                return 0;

        /* Only applies to the unified hierarchy */
@ -1610,11 +1615,11 @@ int unit_watch_cgroup(Unit *u) {
        if (r == 0)
                return 0;

-        /* Don't watch the root slice, it's pointless. */
+        /* No point in watching the top-level slice, it's never going to run empty. */
        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
                return 0;

-        r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
+        r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

@ -1622,20 +1627,82 @@ int unit_watch_cgroup(Unit *u) {
        if (r < 0)
                return log_oom();

-        u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
-        if (u->cgroup_inotify_wd < 0) {
+        u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
+        if (u->cgroup_control_inotify_wd < 0) {

-                if (errno == ENOENT) /* If the directory is already
-                                      * gone we don't need to track
-                                      * it, so this is not an error */
+                if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
+                                      * is not an error */
                        return 0;

-                return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
+                return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", u->cgroup_path);
        }

-        r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
+        r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
        if (r < 0)
-                return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
+                return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor to hash map: %m");
+
+        return 0;
+}
+
+int unit_watch_cgroup_memory(Unit *u) {
+        _cleanup_free_ char *events = NULL;
+        CGroupContext *c;
+        int r;
+
+        assert(u);
+
+        /* Watches the "memory.events" attribute of this unit's cgroup for "oom_kill" events, but only if
+         * cgroupv2 is available. */
+
+        if (!u->cgroup_path)
+                return 0;
+
+        c = unit_get_cgroup_context(u);
+        if (!c)
+                return 0;
+
+        /* The "memory.events" attribute is only available if the memory controller is on. Let's hence tie
+         * this to memory accounting; in a way, watching for OOM kills is a form of memory accounting after
+         * all. */
+        if (!c->memory_accounting)
+                return 0;
+
+        /* Don't watch inner nodes, as the kernel doesn't report oom_kill events recursively currently, and
+         * we also don't want to generate a log message for each parent cgroup of a process. */
+        if (u->type == UNIT_SLICE)
+                return 0;
+
+        if (u->cgroup_memory_inotify_wd >= 0)
+                return 0;
+
+        /* Only applies to the unified hierarchy */
+        r = cg_all_unified();
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");
+        if (r == 0)
+                return 0;
+
+        r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
+        if (r < 0)
+                return log_oom();
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events);
+        if (r < 0)
+                return log_oom();
+
+        u->cgroup_memory_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
+        if (u->cgroup_memory_inotify_wd < 0) {
+
+                if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
+                                      * is not an error */
+                        return 0;
+
+                return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", u->cgroup_path);
+        }
+
+        r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor to hash map: %m");

        return 0;
 }
@ -1692,6 +1759,7 @@ static int unit_create_cgroup(

        /* Start watching it */
        (void) unit_watch_cgroup(u);
+        (void) unit_watch_cgroup_memory(u);

        /* Preserve enabled controllers in delegated units, adjust others. */
        if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
@ -2225,12 +2293,20 @@ void unit_release_cgroup(Unit *u) {
                u->cgroup_path = mfree(u->cgroup_path);
        }

-        if (u->cgroup_inotify_wd >= 0) {
-                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
-                        log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring: %m", u->cgroup_inotify_wd, u->id);
+        if (u->cgroup_control_inotify_wd >= 0) {
+                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_control_inotify_wd) < 0)
+                        log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", u->cgroup_control_inotify_wd, u->id);

-                (void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
-                u->cgroup_inotify_wd = -1;
+                (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
+                u->cgroup_control_inotify_wd = -1;
+        }
+
+        if (u->cgroup_memory_inotify_wd >= 0) {
+                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
+                        log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
+
+                (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
+                u->cgroup_memory_inotify_wd = -1;
        }
 }

@ -2479,6 +2555,106 @@ void unit_add_to_cgroup_empty_queue(Unit *u) {
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
 }

+static int unit_check_oom(Unit *u) {
+        _cleanup_free_ char *oom_kill = NULL;
+        bool increased;
+        uint64_t c;
+        int r;
+
+        if (!u->cgroup_path)
+                return 0;
+
+        r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &oom_kill);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");
+
+        r = safe_atou64(oom_kill, &c);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");
+
+        increased = c > u->oom_kill_last;
+        u->oom_kill_last = c;
+
+        if (!increased)
+                return 0;
+
+        log_struct(LOG_NOTICE,
+                   "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
+                   LOG_UNIT_ID(u),
+                   LOG_UNIT_INVOCATION_ID(u),
+                   LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));
+
+        if (UNIT_VTABLE(u)->notify_cgroup_oom)
+                UNIT_VTABLE(u)->notify_cgroup_oom(u);
+
+        return 1;
+}
+
+static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
+        Manager *m = userdata;
+        Unit *u;
+        int r;
+
+        assert(s);
+        assert(m);
+
+        u = m->cgroup_oom_queue;
+        if (!u)
+                return 0;
+
+        assert(u->in_cgroup_oom_queue);
+        u->in_cgroup_oom_queue = false;
+        LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, u);
+
+        if (m->cgroup_oom_queue) {
+                /* More stuff queued, let's make sure we remain enabled */
+                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
+        }
+
+        (void) unit_check_oom(u);
+        return 0;
+}
+
+static void unit_add_to_cgroup_oom_queue(Unit *u) {
+        int r;
+
+        assert(u);
+
+        if (u->in_cgroup_oom_queue)
+                return;
+        if (!u->cgroup_path)
+                return;
+
+        LIST_PREPEND(cgroup_oom_queue, u->manager->cgroup_oom_queue, u);
+        u->in_cgroup_oom_queue = true;
+
+        /* Trigger the defer event */
+        if (!u->manager->cgroup_oom_event_source) {
+                _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+
+                r = sd_event_add_defer(u->manager->event, &s, on_cgroup_oom_event, u->manager);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to create cgroup oom event source: %m");
+                        return;
+                }
+
+                r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_NORMAL-8);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
+                        return;
+                }
+
+                (void) sd_event_source_set_description(s, "cgroup-oom");
+                u->manager->cgroup_oom_event_source = TAKE_PTR(s);
+        }
+
+        r = sd_event_source_set_enabled(u->manager->cgroup_oom_event_source, SD_EVENT_ONESHOT);
+        if (r < 0)
+                log_error_errno(r, "Failed to enable cgroup oom event source: %m");
+}
+
 static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

@ -2510,15 +2686,16 @@ static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents,
                                /* The watch was just removed */
                                continue;

-                        u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
-                        if (!u) /* Not that inotify might deliver
-                                 * events for a watch even after it
-                                 * was removed, because it was queued
-                                 * before the removal. Let's ignore
-                                 * this here safely. */
-                                continue;
+                        /* Note that inotify might deliver events for a watch even after it was removed,
+                         * because it was queued before the removal. Let's ignore this here safely. */

-                        unit_add_to_cgroup_empty_queue(u);
+                        u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
+                        if (u)
+                                unit_add_to_cgroup_empty_queue(u);
+
+                        u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
+                        if (u)
+                                unit_add_to_cgroup_oom_queue(u);
                }
        }
 }
@ -2606,6 +2783,9 @@ int manager_setup_cgroup(Manager *m) {
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup empty event source: %m");

+        /* Schedule cgroup empty checks early, but after having processed service notification messages or
+         * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
+         * notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
        r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
        if (r < 0)
                return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
@ -2632,9 +2812,10 @@ int manager_setup_cgroup(Manager *m) {
                if (r < 0)
                        return log_error_errno(r, "Failed to watch control group inotify object: %m");

-                /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
-                 * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
-                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
+                /* Process cgroup empty notifications early. Note that when this event is dispatched it'll
+                 * just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
+                 * handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
+                r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of inotify event source: %m");

@ -2704,7 +2885,8 @@ void manager_shutdown_cgroup(Manager *m, bool delete) {

        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);

-        m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
+        m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
+        m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);

        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@ -79,6 +79,9 @@ struct CGroupContext {
        bool tasks_accounting;
        bool ip_accounting;

+        /* Configures the memory.oom.group attribute (on unified) */
+        bool memory_oom_group;
+
        bool delegate;
        CGroupMask delegate_controllers;
        CGroupMask disable_controllers;
@ -174,6 +177,7 @@ int unit_realize_cgroup(Unit *u);
 void unit_release_cgroup(Unit *u);
 void unit_prune_cgroup(Unit *u);
 int unit_watch_cgroup(Unit *u);
+int unit_watch_cgroup_memory(Unit *u);

 void unit_add_to_cgroup_empty_queue(Unit *u);

--- a/src/core/dbus-manager.c
+++ b/src/core/dbus-manager.c
@ -43,6 +43,8 @@ static UnitFileFlags unit_file_bools_to_flags(bool runtime, bool force) {
               (force   ? UNIT_FILE_FORCE   : 0);
 }

+BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_oom_policy, oom_policy, OOMPolicy);
+
 static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_version, "s", GIT_VERSION);
 static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_features, "s", SYSTEMD_FEATURES);
 static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_architecture, "s", architecture_to_string(uname_architecture()));
@ -2452,6 +2454,7 @@ const sd_bus_vtable bus_manager_vtable[] = {
        SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
        SD_BUS_PROPERTY("DefaultTasksMax", "t", NULL, offsetof(Manager, default_tasks_max), SD_BUS_VTABLE_PROPERTY_CONST),
        SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, default_oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),

        SD_BUS_METHOD("GetUnit", "s", "o", method_get_unit, SD_BUS_VTABLE_UNPRIVILEGED),
        SD_BUS_METHOD("GetUnitByPID", "u", "o", method_get_unit_by_pid, SD_BUS_VTABLE_UNPRIVILEGED),
--- a/src/core/dbus-manager.h
+++ b/src/core/dbus-manager.h
@ -12,3 +12,5 @@ void bus_manager_send_reloading(Manager *m, bool active);
 void bus_manager_send_change_signal(Manager *m);

 int verify_run_space_and_log(const char *message);
+
+int bus_property_get_oom_policy(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
--- a/src/core/dbus-service.c
+++ b/src/core/dbus-service.c
@ -10,6 +10,7 @@
 #include "dbus-cgroup.h"
 #include "dbus-execute.h"
 #include "dbus-kill.h"
+#include "dbus-manager.h"
 #include "dbus-service.h"
 #include "dbus-util.h"
 #include "exit-status.h"
@ -127,6 +128,7 @@ const sd_bus_vtable bus_service_vtable[] = {
        SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
        SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
        SD_BUS_PROPERTY("NRestarts", "u", bus_property_get_unsigned, offsetof(Service, n_restarts), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
+        SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Service, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),

        BUS_EXEC_STATUS_VTABLE("ExecMain", offsetof(Service, main_exec_status), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
        BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
@ -257,6 +259,7 @@ static int bus_set_transient_std_fd(
 static BUS_DEFINE_SET_TRANSIENT_PARSE(notify_access, NotifyAccess, notify_access_from_string);
 static BUS_DEFINE_SET_TRANSIENT_PARSE(service_type, ServiceType, service_type_from_string);
 static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart, ServiceRestart, service_restart_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string);
 static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(bus_name, service_name_is_valid);

 static int bus_service_set_transient_property(
@ -291,6 +294,9 @@ static int bus_service_set_transient_property(
        if (streq(name, "Type"))
                return bus_set_transient_service_type(u, name, &s->type, message, flags, error);

+        if (streq(name, "OOMPolicy"))
+                return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error);
+
        if (streq(name, "RestartUSec"))
                return bus_set_transient_usec(u, name, &s->restart_usec, message, flags, error);

--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@ -333,6 +333,7 @@ Service.Sockets,                 config_parse_service_sockets,       0,
 Service.BusPolicy,               config_parse_warn_compat,           DISABLED_LEGACY,               0
 Service.USBFunctionDescriptors,  config_parse_unit_path_printf,      0,                             offsetof(Service, usb_function_descriptors)
 Service.USBFunctionStrings,      config_parse_unit_path_printf,      0,                             offsetof(Service, usb_function_strings)
+Service.OOMPolicy,               config_parse_oom_policy,            0,                             offsetof(Service, oom_policy)
 EXEC_CONTEXT_CONFIG_ITEMS(Service)m4_dnl
 CGROUP_CONTEXT_CONFIG_ITEMS(Service)m4_dnl
 KILL_CONTEXT_CONFIG_ITEMS(Service)m4_dnl
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@ -86,6 +86,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_runtime_preserve_mode, exec_preserve_mode,
 DEFINE_CONFIG_PARSE_ENUM(config_parse_service_type, service_type, ServiceType, "Failed to parse service type");
 DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceRestart, "Failed to parse service restart specifier");
 DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy");
 DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value");
 DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight");
 DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight");
--- a/src/core/load-fragment.h
+++ b/src/core/load-fragment.h
@ -106,6 +106,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode);
 CONFIG_PARSER_PROTOTYPE(config_parse_pid_file);
 CONFIG_PARSER_PROTOTYPE(config_parse_exit_status);
 CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers);
+CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy);

 /* gperf prototypes */
 const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
--- a/src/core/main.c
+++ b/src/core/main.c
@ -135,6 +135,7 @@ static bool arg_default_tasks_accounting = true;
 static uint64_t arg_default_tasks_max = UINT64_MAX;
 static sd_id128_t arg_machine_id = {};
 static EmergencyAction arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
+static OOMPolicy arg_default_oom_policy = OOM_STOP;

 _noreturn_ static void freeze_or_exit_or_reboot(void) {

@ -725,6 +726,7 @@ static int parse_config_file(void) {
                { "Manager", "DefaultTasksAccounting",    config_parse_bool,             0, &arg_default_tasks_accounting          },
                { "Manager", "DefaultTasksMax",           config_parse_tasks_max,        0, &arg_default_tasks_max                 },
                { "Manager", "CtrlAltDelBurstAction",     config_parse_emergency_action, 0, &arg_cad_burst_action                  },
+                { "Manager", "DefaultOOMPolicy",          config_parse_oom_policy,       0, &arg_default_oom_policy                },
                {}
        };

@ -780,6 +782,7 @@ static void set_manager_defaults(Manager *m) {
        m->default_memory_accounting = arg_default_memory_accounting;
        m->default_tasks_accounting = arg_default_tasks_accounting;
        m->default_tasks_max = arg_default_tasks_max;
+        m->default_oom_policy = arg_default_oom_policy;

        (void) manager_set_default_rlimits(m, arg_default_rlimit);

--- a/src/core/manager.c
+++ b/src/core/manager.c
@ -764,6 +764,8 @@ int manager_new(UnitFileScope scope, ManagerTestRunFlags test_run_flags, Manager
                .have_ask_password = -EINVAL, /* we don't know */
                .first_boot = -1,
                .test_run_flags = test_run_flags,
+
+                .default_oom_policy = OOM_STOP,
        };

 #if ENABLE_EFI
@ -992,11 +994,10 @@ static int manager_setup_cgroups_agent(Manager *m) {
                if (r < 0)
                        return log_error_errno(r, "Failed to allocate cgroups agent event source: %m");

-                /* Process cgroups notifications early, but after having processed service notification messages or
-                 * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of notification,
-                 * and we collected the metadata the notification and SIGCHLD stuff offers first. Also see handling of
-                 * cgroup inotify for the unified cgroup stuff. */
-                r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-4);
+                /* Process cgroups notifications early. Note that when the agent notification is received
+                 * we'll just enqueue the unit in the cgroup empty queue, hence pick a higher priority than
+                 * that. Also see handling of cgroup inotify for the unified cgroup stuff. */
+                r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-9);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m");

@ -4715,3 +4716,11 @@ static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = {
 };

 DEFINE_STRING_TABLE_LOOKUP(manager_timestamp, ManagerTimestamp);
+
+/* Textual name of each OOMPolicy value, indexed by the enum value itself. */
+static const char* const oom_policy_table[_OOM_POLICY_MAX] = {
+        [OOM_CONTINUE] = "continue",
+        [OOM_STOP] = "stop",
+        [OOM_KILL] = "kill",
+};
+
+/* Presumably expands to the oom_policy_to_string()/oom_policy_from_string() pair declared in
+ * manager.h — the macro definition is not visible here, confirm there. */
+DEFINE_STRING_TABLE_LOOKUP(oom_policy, OOMPolicy);
--- a/src/core/manager.h
+++ b/src/core/manager.h
@ -56,6 +56,14 @@ typedef enum StatusType {
        STATUS_TYPE_EMERGENCY,
 } StatusType;

+/* What happens to a unit after the kernel OOM killer has killed one of its processes. */
+typedef enum OOMPolicy {
+        OOM_CONTINUE,          /* The kernel kills the process it wants to kill, and that's it */
+        OOM_STOP,              /* The kernel kills the process it wants to kill, and we stop the unit */
+        OOM_KILL,              /* The kernel kills the process it wants to kill, and all others in the unit, and we stop the unit */
+        _OOM_POLICY_MAX,       /* Number of valid policies; used to size the string table */
+        _OOM_POLICY_INVALID = -1 /* Negative sentinel; services start out with this to mean "not configured" */
+} OOMPolicy;
+
 /* Notes:
 * 1. TIMESTAMP_FIRMWARE, TIMESTAMP_LOADER, TIMESTAMP_KERNEL, TIMESTAMP_INITRD,
 *    TIMESTAMP_SECURITY_START, and TIMESTAMP_SECURITY_FINISH are set only when
@ -159,6 +167,9 @@ struct Manager {
        /* Units whose cgroup ran empty */
        LIST_HEAD(Unit, cgroup_empty_queue);

+        /* Units whose memory.event fired */
+        LIST_HEAD(Unit, cgroup_oom_queue);
+
        /* Target units whose default target dependencies haven't been set yet */
        LIST_HEAD(Unit, target_deps_queue);

@ -268,10 +279,15 @@ struct Manager {
        /* Notifications from cgroups, when the unified hierarchy is used is done via inotify. */
        int cgroup_inotify_fd;
        sd_event_source *cgroup_inotify_event_source;
-        Hashmap *cgroup_inotify_wd_unit;
+
+        /* Maps for finding the unit for each inotify watch descriptor for the cgroup.events and
+         * memory.events cgroupv2 attributes. */
+        Hashmap *cgroup_control_inotify_wd_unit;
+        Hashmap *cgroup_memory_inotify_wd_unit;

        /* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */
        sd_event_source *cgroup_empty_event_source;
+        sd_event_source *cgroup_oom_event_source;

        /* Make sure the user cannot accidentally unmount our cgroup
         * file system */
@ -328,6 +344,8 @@ struct Manager {
        uint64_t default_tasks_max;
        usec_t default_timer_accuracy_usec;

+        OOMPolicy default_oom_policy;
+
        int original_log_level;
        LogTarget original_log_target;
        bool log_level_overridden:1;
@ -519,3 +537,6 @@ void manager_disable_confirm_spawn(void);
 const char *manager_timestamp_to_string(ManagerTimestamp m) _const_;
 ManagerTimestamp manager_timestamp_from_string(const char *s) _pure_;
 ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s);
+
+const char* oom_policy_to_string(OOMPolicy i) _const_;
+OOMPolicy oom_policy_from_string(const char *s) _pure_;
--- a/src/core/service.c
+++ b/src/core/service.c
@ -112,6 +112,8 @@ static void service_init(Unit *u) {
                EXEC_KEYRING_PRIVATE : EXEC_KEYRING_INHERIT;

        s->watchdog_original_usec = USEC_INFINITY;
+
+        s->oom_policy = _OOM_POLICY_INVALID;
 }

 static void service_unwatch_control_pid(Service *s) {
@ -731,6 +733,15 @@ static int service_add_extras(Service *s) {
            (s->type == SERVICE_NOTIFY || s->watchdog_usec > 0 || s->n_fd_store_max > 0))
                s->notify_access = NOTIFY_MAIN;

+        /* If no OOM policy was explicitly set, then default to the configured default OOM policy. Except when
+         * delegation is on, in that case we assume the payload knows better what to do and can process
+         * things in a more focussed way. */
+        if (s->oom_policy < 0)
+                s->oom_policy = s->cgroup_context.delegate ? OOM_CONTINUE : UNIT(s)->manager->default_oom_policy;
+
+        /* Let the kernel do the killing if that's requested. */
+        s->cgroup_context.memory_oom_group = s->oom_policy == OOM_KILL;
+
        r = service_add_default_dependencies(s);
        if (r < 0)
                return r;
@ -799,7 +810,8 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) {
                "%sType: %s\n"
                "%sRestart: %s\n"
                "%sNotifyAccess: %s\n"
-                "%sNotifyState: %s\n",
+                "%sNotifyState: %s\n"
+                "%sOOMPolicy: %s\n",
                prefix, service_state_to_string(s->state),
                prefix, service_result_to_string(s->result),
                prefix, service_result_to_string(s->reload_result),
@ -810,7 +822,8 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) {
                prefix, service_type_to_string(s->type),
                prefix, service_restart_to_string(s->restart),
                prefix, notify_access_to_string(s->notify_access),
-                prefix, notify_state_to_string(s->notify_state));
+                prefix, notify_state_to_string(s->notify_state),
+                prefix, oom_policy_to_string(s->oom_policy));

        if (s->control_pid > 0)
                fprintf(f,
@ -3148,7 +3161,7 @@ static void service_notify_cgroup_empty_event(Unit *u) {

        assert(u);

-        log_unit_debug(u, "cgroup is empty");
+        log_unit_debug(u, "Control group is empty.");

        switch (s->state) {

@ -3211,6 +3224,57 @@ static void service_notify_cgroup_empty_event(Unit *u) {
        }
 }

+/* Service implementation of the notify_cgroup_oom unit hook: reacts to an OOM kill event seen on
+ * this service according to its oom_policy. With OOM_CONTINUE only the debug message is logged.
+ * Otherwise the service is pushed towards termination, with SERVICE_FAILURE_OOM_KILL passed along
+ * as the failure reason. */
+static void service_notify_cgroup_oom_event(Unit *u) {
+        Service *s = SERVICE(u);
+
+        log_unit_debug(u, "Process of control group was killed by the OOM killer.");
+
+        /* The kernel already did all that was asked for; leave the service state untouched. */
+        if (s->oom_policy == OOM_CONTINUE)
+                return;
+
+        switch (s->state) {
+
+        /* Still starting up, or in the regular stop phase: OOM_STOP requests the SIGTERM stage,
+         * OOM_KILL goes straight to the SIGKILL stage. */
+        case SERVICE_START_PRE:
+        case SERVICE_START:
+        case SERVICE_START_POST:
+        case SERVICE_STOP:
+                if (s->oom_policy == OOM_STOP)
+                        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_OOM_KILL);
+                else if (s->oom_policy == OOM_KILL)
+                        service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+
+                break;
+
+        /* Up and running (or exited but still active): OOM_STOP goes through service_enter_stop(),
+         * OOM_KILL again goes straight to the SIGKILL stage. */
+        case SERVICE_EXITED:
+        case SERVICE_RUNNING:
+                if (s->oom_policy == OOM_STOP)
+                        service_enter_stop(s, SERVICE_FAILURE_OOM_KILL);
+                else if (s->oom_policy == OOM_KILL)
+                        service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+
+                break;
+
+        /* Already in a softer termination stage: escalate to SIGKILL, for both OOM_STOP and OOM_KILL. */
+        case SERVICE_STOP_WATCHDOG:
+        case SERVICE_STOP_SIGTERM:
+                service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+                break;
+
+        /* Already in a SIGKILL stage: no state change, only note the OOM kill as the result, and
+         * only if no other failure has been recorded yet. */
+        case SERVICE_STOP_SIGKILL:
+        case SERVICE_FINAL_SIGKILL:
+                if (s->result == SERVICE_SUCCESS)
+                        s->result = SERVICE_FAILURE_OOM_KILL;
+                break;
+
+        /* Post-stop phase or final SIGTERM stage: escalate to the final SIGKILL stage. */
+        case SERVICE_STOP_POST:
+        case SERVICE_FINAL_SIGTERM:
+                service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_OOM_KILL);
+                break;
+
+        /* All remaining states: nothing to do. */
+        default:
+                ;
+        }
+}
+
 static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
        bool notify_dbus = true;
        Service *s = SERVICE(u);
@ -4116,6 +4180,7 @@ static const char* const service_result_table[_SERVICE_RESULT_MAX] = {
        [SERVICE_FAILURE_CORE_DUMP] = "core-dump",
        [SERVICE_FAILURE_WATCHDOG] = "watchdog",
        [SERVICE_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
+        [SERVICE_FAILURE_OOM_KILL] = "oom-kill",
 };

 DEFINE_STRING_TABLE_LOOKUP(service_result, ServiceResult);
@ -4169,6 +4234,7 @@ const UnitVTable service_vtable = {
        .reset_failed = service_reset_failed,

        .notify_cgroup_empty = service_notify_cgroup_empty_event,
+        .notify_cgroup_oom = service_notify_cgroup_oom_event,
        .notify_message = service_notify_message,

        .main_pid = service_main_pid,
--- a/src/core/service.h
+++ b/src/core/service.h
@ -67,6 +67,7 @@ typedef enum ServiceResult {
        SERVICE_FAILURE_CORE_DUMP,
        SERVICE_FAILURE_WATCHDOG,
        SERVICE_FAILURE_START_LIMIT_HIT,
+        SERVICE_FAILURE_OOM_KILL,
        _SERVICE_RESULT_MAX,
        _SERVICE_RESULT_INVALID = -1
 } ServiceResult;
@ -184,6 +185,8 @@ struct Service {

        unsigned n_restarts;
        bool flush_n_restarts;
+
+        OOMPolicy oom_policy;
 };

 extern const UnitVTable service_vtable;
--- a/src/core/unit.c
+++ b/src/core/unit.c
@ -91,7 +91,8 @@ Unit *unit_new(Manager *m, size_t size) {
        u->unit_file_state = _UNIT_FILE_STATE_INVALID;
        u->unit_file_preset = -1;
        u->on_failure_job_mode = JOB_REPLACE;
-        u->cgroup_inotify_wd = -1;
+        u->cgroup_control_inotify_wd = -1;
+        u->cgroup_memory_inotify_wd = -1;
        u->job_timeout = USEC_INFINITY;
        u->job_running_timeout = USEC_INFINITY;
        u->ref_uid = UID_INVALID;
@ -3245,6 +3246,9 @@ int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) {
        if (u->cpu_usage_last != NSEC_INFINITY)
                (void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last);

+        if (u->oom_kill_last > 0)
+                (void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last);
+
        if (u->cgroup_path)
                (void) serialize_item(f, "cgroup", u->cgroup_path);

@ -3478,6 +3482,14 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) {

                        continue;

+                } else if (streq(l, "oom-kill-last")) {
+
+                        r = safe_atou64(v, &u->oom_kill_last);
+                        if (r < 0)
+                                log_unit_debug(u, "Failed to read OOM kill last %s, ignoring.", v);
+
+                        continue;
+
                } else if (streq(l, "cgroup")) {

                        r = unit_set_cgroup_path(u, v);
@ -3485,6 +3497,7 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) {
                                log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v);

                        (void) unit_watch_cgroup(u);
+                        (void) unit_watch_cgroup_memory(u);

                        continue;
                } else if (streq(l, "cgroup-realized")) {
--- a/src/core/unit.h
+++ b/src/core/unit.h
@ -200,6 +200,9 @@ typedef struct Unit {
        /* cgroup empty queue */
        LIST_FIELDS(Unit, cgroup_empty_queue);

+        /* cgroup OOM queue */
+        LIST_FIELDS(Unit, cgroup_oom_queue);
+
        /* Target dependencies queue */
        LIST_FIELDS(Unit, target_deps_queue);

@ -246,13 +249,19 @@ typedef struct Unit {
        nsec_t cpu_usage_base;
        nsec_t cpu_usage_last; /* the most recently read value */

+        /* The current counter of the oom_kill field in the memory.events cgroup attribute */
+        uint64_t oom_kill_last;
+
        /* Counterparts in the cgroup filesystem */
        char *cgroup_path;
        CGroupMask cgroup_realized_mask;           /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */
        CGroupMask cgroup_enabled_mask;            /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */
        CGroupMask cgroup_invalidated_mask;        /* A mask specifiying controllers which shall be considered invalidated, and require re-realization */
        CGroupMask cgroup_members_mask;            /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */
-        int cgroup_inotify_wd;
+
+        /* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */
+        int cgroup_control_inotify_wd;
+        int cgroup_memory_inotify_wd;

        /* Device Controller BPF program */
        BPFProgram *bpf_device_control_installed;
@ -320,6 +329,7 @@ typedef struct Unit {
        bool in_gc_queue:1;
        bool in_cgroup_realize_queue:1;
        bool in_cgroup_empty_queue:1;
+        bool in_cgroup_oom_queue:1;
        bool in_target_deps_queue:1;
        bool in_stop_when_unneeded_queue:1;

@ -494,10 +504,12 @@ typedef struct UnitVTable {
        /* Reset failed state if we are in failed state */
        void (*reset_failed)(Unit *u);

-        /* Called whenever any of the cgroups this unit watches for
-         * ran empty */
+        /* Called whenever any of the cgroups this unit watches for ran empty */
        void (*notify_cgroup_empty)(Unit *u);

+        /* Called whenever an OOM kill event on this unit was seen */
+        void (*notify_cgroup_oom)(Unit *u);
+
        /* Called whenever a process of this unit sends us a message */
        void (*notify_message)(Unit *u, const struct ucred *ucred, char **tags, FDSet *fds);

--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@ -1314,7 +1314,7 @@ static int bus_append_service_property(sd_bus_message *m, const char *field, con

        if (STR_IN_SET(field,
                       "PIDFile", "Type", "Restart", "BusName", "NotifyAccess",
-                       "USBFunctionDescriptors", "USBFunctionStrings"))
+                       "USBFunctionDescriptors", "USBFunctionStrings", "OOMPolicy"))

                return bus_append_string(m, field, eq);

--- a/src/systemd/sd-messages.h
+++ b/src/systemd/sd-messages.h
@ -125,6 +125,9 @@ _SD_BEGIN_DECLARATIONS;
 #define SD_MESSAGE_OVERMOUNTING           SD_ID128_MAKE(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7)
 #define SD_MESSAGE_OVERMOUNTING_STR       SD_ID128_MAKE_STR(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7)

+#define SD_MESSAGE_UNIT_OUT_OF_MEMORY     SD_ID128_MAKE(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef)
+#define SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR SD_ID128_MAKE_STR(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef)
+
 #define SD_MESSAGE_LID_OPENED             SD_ID128_MAKE(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,6f)
 #define SD_MESSAGE_LID_OPENED_STR         SD_ID128_MAKE_STR(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,6f)
 #define SD_MESSAGE_LID_CLOSED             SD_ID128_MAKE(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,70)
--- a/test/TEST-31-OOMPOLICY/Makefile
+++ b/test/TEST-31-OOMPOLICY/Makefile
@ -0,0 +1 @@
+../TEST-01-BASIC/Makefile
--- a/test/TEST-31-OOMPOLICY/test.sh
+++ b/test/TEST-31-OOMPOLICY/test.sh
@ -0,0 +1,52 @@
+#!/bin/bash
+# -*- mode: shell-script; indent-tabs-mode: nil; sh-basic-offset: 4; -*-
+# ex: ts=8 sw=4 sts=4 et filetype=sh
+#
+# Driver for the OOM policy test: builds an image that runs testsuite.sh as a
+# oneshot service at boot. Relies on TEST_BASE_DIR being set by the caller and
+# on the helpers provided by the sourced test-functions file.
+set -e
+TEST_DESCRIPTION="test OOM killer logic"
+TEST_NO_NSPAWN=1
+
+. "$TEST_BASE_DIR/test-functions"
+
+UNIFIED_CGROUP_HIERARCHY=yes
+
+# Populate the test image: mount its first partition at $TESTDIR/root, install
+# the basic environment, mask units that are not wanted during the test, and
+# install testsuite.service together with the testsuite.sh it runs.
+# Returns 1 if any step of the image population fails.
+test_setup() {
+    create_empty_image
+    mkdir -p "$TESTDIR/root"
+    mount "${LOOPDEV}p1" "$TESTDIR/root"
+
+    # Subshell, so that the variables imported from udev and LOG_LEVEL do not
+    # leak into the rest of the test run.
+    (
+        LOG_LEVEL=5
+        # Quoted, so that the exported KEY='value' lines reach eval unmodified
+        # instead of being word-split and glob-expanded first.
+        eval "$(udevadm info --export --query=env --name="${LOOPDEV}p2")"
+
+        setup_basic_environment
+
+        # mask some services that we do not want to run in these tests
+        ln -fs /dev/null "$initdir/etc/systemd/system/systemd-hwdb-update.service"
+        ln -fs /dev/null "$initdir/etc/systemd/system/systemd-journal-catalog-update.service"
+        ln -fs /dev/null "$initdir/etc/systemd/system/systemd-networkd.service"
+        ln -fs /dev/null "$initdir/etc/systemd/system/systemd-networkd.socket"
+        ln -fs /dev/null "$initdir/etc/systemd/system/systemd-resolved.service"
+        ln -fs /dev/null "$initdir/etc/systemd/system/systemd-machined.service"
+
+        # setup the testsuite service
+        cat >"$initdir/etc/systemd/system/testsuite.service" <<EOF
+[Unit]
+Description=Testsuite service
+
+[Service]
+ExecStart=/testsuite.sh
+Type=oneshot
+StandardOutput=tty
+StandardError=tty
+MemoryAccounting=yes
+EOF
+        cp testsuite.sh "$initdir/"
+
+        setup_testsuite
+    ) || return 1
+
+    ddebug "umount $TESTDIR/root"
+    umount "$TESTDIR/root"
+}
+
+do_test "$@"
--- a/test/TEST-31-OOMPOLICY/testsuite.sh
+++ b/test/TEST-31-OOMPOLICY/testsuite.sh
@ -0,0 +1,39 @@
+#!/bin/bash
+# -*- mode: shell-script; indent-tabs-mode: nil; sh-basic-offset: 4; -*-
+# ex: ts=8 sw=4 sts=4 et filetype=sh
+set -ex
+set -o pipefail
+
+# Let's run this test only if the "memory.oom.group" cgroupfs attribute
+# exists. This check is a bit too strict, since the "memory.events"/"oom_kill"
+# logic has been around for longer than "memory.oom.group", but it's an easier
+# thing to test for, and also: let's not get confused by older kernels where
+# the concept was still new.
+
+if test -f /sys/fs/cgroup/system.slice/testsuite.service/memory.oom.group ; then
+
+    systemd-analyze log-level debug
+    systemd-analyze log-target console
+
+    # Run a service that is guaranteed to be the first candidate for OOM killing
+    systemd-run --unit=oomtest.service -p Type=exec -p OOMScoreAdjust=1000 -p OOMPolicy=stop -p MemoryAccounting=yes /bin/sleep infinity
+
+    # Trigger an OOM killer run
+    echo 1 > /proc/sys/kernel/sysrq
+    echo f > /proc/sysrq-trigger
+
+    # Wait for the unit to enter the "failed" state, polling twice a second.
+    # Give up after 240 polls (about two minutes), so that a missed OOM event
+    # makes the test fail instead of hanging forever.
+    STATE=
+    for ((i = 0; i < 240; i++)); do
+        STATE=$(systemctl show -p ActiveState --value oomtest.service)
+        [ "$STATE" = "failed" ] && break
+        sleep .5
+    done
+
+    # Fails the script (via set -e) if the loop above ran into the timeout.
+    test "$STATE" = "failed"
+
+    RESULT=$(systemctl show -p Result --value oomtest.service)
+    test "$RESULT" = "oom-kill"
+
+    systemd-analyze log-level info
+fi
+
+echo OK > /testok
+
+exit 0