Merge pull request #7464 from poettering/cgroup-control-fix

various cgroup fixes
2017-11-26 11:34:24 +01:00 · 2017-11-26 11:34:24 +01:00 · b45f436f61
parent dd5082a9de 48f1b5e51d
commit b45f436f61
14 changed files with 203 additions and 200 deletions
--- a/10
+++ b/10
@ -24,6 +24,12 @@ Janitorial Clean-ups:

 Features:

+* add some special mode to LogsDirectory=/StateDirectory=… that allows
+  declaring these directories without necessarily pulling in deps for them, or
+  creating them when starting up. That way, we could declare that
+  systemd-journald writes to /var/log/journal, which could be useful when we
+  are doing disk usage calculations and so on.
+
 * add a new RuntimeDirectoryPreserve= mode that defines a similar lifecycle for
  the runtime dir as we maintain for the fdstore: i.e. keep it around as long
  as the unit is running or has a job queued.
@ -45,8 +51,6 @@ Features:
 * add a way to lock down cgroup migration: a boolean, which when set for a unit
  makes sure the processes in it can never migrate out of it

-* complain if a unit starts up and there are already processes in its cgroup
-
 * blog about fd store and restartable services

 * document Environment=SYSTEMD_LOG_LEVEL=debug drop-in in debugging document
@ -303,8 +307,6 @@ Features:
  the specified range and generates sane error messages for incorrect
  specifications.

-* do something about "/control" subcgroups in the unified cgroup hierarchy
-
 * when we detect that there are waiting jobs but no running jobs, do something

 * push CPUAffinity= also into the "cpuset" cgroup controller (only after the cpuset controller got ported to the unified hierarchy)
--- a/mkosi.build
+++ b/mkosi.build
@ -28,7 +28,7 @@ export LC_CTYPE=en_US.UTF-8

 sysvinit_path=`realpath /etc/init.d`

-[ -f "$BUILDDIR"/build.ninja ] || meson "$BUILDDIR" -D "sysvinit-path=$sysvinit_path"
+[ -f "$BUILDDIR"/build.ninja ] || meson "$BUILDDIR" -D "sysvinit-path=$sysvinit_path" -D "default-hierarchy=unified"
 ninja -C "$BUILDDIR" all
 [ "$WITH_TESTS" = 0 ] || ninja -C "$BUILDDIR" test || ( RET="$?" ; cat "$BUILDDIR"/meson-logs/testlog.txt ; exit "$RET" )
 ninja -C "$BUILDDIR" install
--- a/src/basic/cgroup-util.c
+++ b/src/basic/cgroup-util.c
@ -876,115 +876,87 @@ int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
        return r;
 }

-int cg_set_group_access(
+int cg_set_access(
                const char *controller,
                const char *path,
-                mode_t mode,
                uid_t uid,
                gid_t gid) {

-        _cleanup_free_ char *fs = NULL;
-        int r;
+        struct Attribute {
+                const char *name;
+                bool fatal;
+        };

-        if (mode == MODE_INVALID && uid == UID_INVALID && gid == GID_INVALID)
+        /* cgroupsv1, aka legacy/non-unified */
+        static const struct Attribute legacy_attributes[] = {
+                { "cgroup.procs",           true  },
+                { "tasks",                  false },
+                { "cgroup.clone_children",  false },
+                {},
+        };
+
+        /* cgroupsv2, aka unified */
+        static const struct Attribute unified_attributes[] = {
+                { "cgroup.procs",           true  },
+                { "cgroup.subtree_control", true  },
+                { "cgroup.threads",         false },
+                {},
+        };
+
+        static const struct Attribute* const attributes[] = {
+                [false] = legacy_attributes,
+                [true]  = unified_attributes,
+        };
+
+        _cleanup_free_ char *fs = NULL;
+        const struct Attribute *i;
+        int r, unified;
+
+        assert(path);
+
+        if (uid == UID_INVALID && gid == GID_INVALID)
                return 0;

-        if (mode != MODE_INVALID)
-                mode &= 0777;
+        unified = cg_unified_controller(controller);
+        if (unified < 0)
+                return unified;

+        /* Configure access to the cgroup itself */
        r = cg_get_path(controller, path, NULL, &fs);
        if (r < 0)
                return r;

-        r = chmod_and_chown(fs, mode, uid, gid);
+        r = chmod_and_chown(fs, 0755, uid, gid);
        if (r < 0)
                return r;

-        r = cg_hybrid_unified();
-        if (r < 0)
-                return r;
-        if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
-                r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, mode, uid, gid);
-                if (r < 0)
-                        log_debug_errno(r, "Failed to set group access on compatibility systemd cgroup %s, ignoring: %m", path);
-        }
-
-        return 0;
-}
-
-int cg_set_task_access(
-                const char *controller,
-                const char *path,
-                mode_t mode,
-                uid_t uid,
-                gid_t gid) {
-
-        _cleanup_free_ char *fs = NULL;
-        int r;
-
-        assert(path);
-
-        if (mode == MODE_INVALID && uid == UID_INVALID && gid == GID_INVALID)
-                return 0;
-
-        if (mode != MODE_INVALID)
-                mode &= 0666;
-
-        /* For both the legacy and unified hierarchies, "cgroup.procs" is the main entry point for PIDs */
-        r = cg_get_path(controller, path, "cgroup.procs", &fs);
-        if (r < 0)
-                return r;
-
-        r = chmod_and_chown(fs, mode, uid, gid);
-        if (r < 0)
-                return r;
-
-        r = cg_unified_controller(controller);
-        if (r < 0)
-                return r;
-        if (r == 0) {
-                const char *fn;
-
-                /* Compatibility: on cgroupsv1 always keep values for the legacy files "tasks" and
-                 * "cgroup.clone_children" in sync with "cgroup.procs". Since this is legacy stuff, we don't care if
-                 * this fails. */
-
-                FOREACH_STRING(fn,
-                               "tasks",
-                               "cgroup.clone_children") {
-
-                        fs = mfree(fs);
-
-                        r = cg_get_path(controller, path, fn, &fs);
-                        if (r < 0)
-                                log_debug_errno(r, "Failed to get path for %s of %s, ignoring: %m", fn, path);
-
-                        r = chmod_and_chown(fs, mode, uid, gid);
-                        if (r < 0)
-                                log_debug_errno(r, "Failed to to change ownership/access mode for %s of %s, ignoring: %m", fn, path);
-                }
-        } else {
-                /* On the unified controller, we want to permit subtree controllers too. */
-
+        /* Configure access to the cgroup's attributes */
+        for (i = attributes[unified]; i->name; i++) {
                fs = mfree(fs);
-                r = cg_get_path(controller, path, "cgroup.subtree_control", &fs);
+
+                r = cg_get_path(controller, path, i->name, &fs);
                if (r < 0)
                        return r;

-                r = chmod_and_chown(fs, mode, uid, gid);
-                if (r < 0)
-                        return r;
+                r = chmod_and_chown(fs, 0644, uid, gid);
+                if (r < 0) {
+                        if (i->fatal)
+                                return r;
+
+                        log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
+                }
        }

-        r = cg_hybrid_unified();
-        if (r < 0)
-                return r;
-        if (r > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
-                /* Always propagate access mode from unified to legacy controller */
-
-                r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, mode, uid, gid);
+        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
+                r = cg_hybrid_unified();
                if (r < 0)
-                        log_debug_errno(r, "Failed to set task access on compatibility systemd cgroup %s, ignoring: %m", path);
+                        return r;
+                if (r > 0) {
+                        /* Always propagate access mode from unified to legacy controller */
+                        r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path);
+                }
        }

        return 0;
--- a/src/basic/cgroup-util.h
+++ b/src/basic/cgroup-util.h
@ -188,8 +188,7 @@ int cg_set_attribute(const char *controller, const char *path, const char *attri
 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret);
 int cg_get_keyed_attribute(const char *controller, const char *path, const char *attribute, const char **keys, char **values);

-int cg_set_group_access(const char *controller, const char *path, mode_t mode, uid_t uid, gid_t gid);
-int cg_set_task_access(const char *controller, const char *path, mode_t mode, uid_t uid, gid_t gid);
+int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid);

 int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags);
 int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size);
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@ -677,9 +677,11 @@ static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_
                              "Failed to set %s: %m", file);
 }

-static void cgroup_apply_firewall(Unit *u, CGroupContext *c) {
+static void cgroup_apply_firewall(Unit *u) {
        int r;

+        assert(u);
+
        if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
                                    * not recursive we don't ever touch the bpf on them */
                return;
@ -1031,7 +1033,7 @@ static void cgroup_context_apply(
        }

        if (apply_bpf)
-                cgroup_apply_firewall(u, c);
+                cgroup_apply_firewall(u);
 }

 CGroupMask cgroup_context_get_mask(CGroupContext *c) {
@ -1392,6 +1394,31 @@ int unit_watch_cgroup(Unit *u) {
        return 0;
 }

+int unit_pick_cgroup_path(Unit *u) {
+        _cleanup_free_ char *path = NULL;
+        int r;
+
+        assert(u);
+
+        if (u->cgroup_path)
+                return 0;
+
+        if (!UNIT_HAS_CGROUP_CONTEXT(u))
+                return -EINVAL;
+
+        path = unit_default_cgroup_path(u);
+        if (!path)
+                return log_oom();
+
+        r = unit_set_cgroup_path(u, path);
+        if (r == -EEXIST)
+                return log_unit_error_errno(u, r, "Control group %s exists already.", path);
+        if (r < 0)
+                return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
+
+        return 0;
+}
+
 static int unit_create_cgroup(
                Unit *u,
                CGroupMask target_mask,
@ -1407,19 +1434,10 @@ static int unit_create_cgroup(
        if (!c)
                return 0;

-        if (!u->cgroup_path) {
-                _cleanup_free_ char *path = NULL;
-
-                path = unit_default_cgroup_path(u);
-                if (!path)
-                        return log_oom();
-
-                r = unit_set_cgroup_path(u, path);
-                if (r == -EEXIST)
-                        return log_unit_error_errno(u, r, "Control group %s exists already.", path);
-                if (r < 0)
-                        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
-        }
+        /* Figure out our cgroup path */
+        r = unit_pick_cgroup_path(u);
+        if (r < 0)
+                return r;

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
@ -1503,6 +1521,27 @@ static bool unit_has_mask_realized(
                 (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
 }

+static void unit_add_to_cgroup_realize_queue(Unit *u) {
+        assert(u);
+
+        if (u->in_cgroup_realize_queue)
+                return;
+
+        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+        u->in_cgroup_realize_queue = true;
+}
+
+static void unit_remove_from_cgroup_realize_queue(Unit *u) {
+        assert(u);
+
+        if (!u->in_cgroup_realize_queue)
+                return;
+
+        LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+        u->in_cgroup_realize_queue = false;
+}
+
+
 /* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
@ -1516,10 +1555,7 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) {

        assert(u);

-        if (u->in_cgroup_realize_queue) {
-                LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
-                u->in_cgroup_realize_queue = false;
-        }
+        unit_remove_from_cgroup_realize_queue(u);

        target_mask = unit_get_target_mask(u);
        enable_mask = unit_get_enable_mask(u);
@ -1552,16 +1588,6 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        return 0;
 }

-static void unit_add_to_cgroup_realize_queue(Unit *u) {
-        assert(u);
-
-        if (u->in_cgroup_realize_queue)
-                return;
-
-        LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
-        u->in_cgroup_realize_queue = true;
-}
-
 unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        ManagerState state;
        unsigned n = 0;
@ -1575,6 +1601,12 @@ unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        while ((i = m->cgroup_realize_queue)) {
                assert(i->in_cgroup_realize_queue);

+                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(i))) {
+                        /* Maybe things changed, and the unit is not actually active anymore? */
+                        unit_remove_from_cgroup_realize_queue(i);
+                        continue;
+                }
+
                r = unit_realize_cgroup_now(i, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
@ -2351,7 +2383,6 @@ int unit_get_ip_accounting(
        fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;
-
        if (fd < 0)
                return -ENODATA;

@ -2421,7 +2452,7 @@ void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT))
                m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;

-        if ((u->cgroup_realized_mask & m) == 0)
+        if ((u->cgroup_realized_mask & m) == 0) /* NOP? */
                return;

        u->cgroup_realized_mask &= ~m;
@ -2434,7 +2465,7 @@ void unit_invalidate_cgroup_bpf(Unit *u) {
        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

-        if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED)
+        if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
                return;

        u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@ -169,6 +169,7 @@ void unit_update_cgroup_members_masks(Unit *u);

 char *unit_default_cgroup_path(Unit *u);
 int unit_set_cgroup_path(Unit *u, const char *path);
+int unit_pick_cgroup_path(Unit *u);

 int unit_realize_cgroup(Unit *u);
 void unit_release_cgroup(Unit *u);
--- a/src/core/execute.c
+++ b/src/core/execute.c
@ -3009,17 +3009,12 @@ static int exec_child(
                }
        }

-        /* If delegation is enabled we'll pass ownership of the cgroup
-         * (but only in systemd's own controller hierarchy!) to the
-         * user of the new process. */
+        /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
+         * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
+         * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
+         * touch a single hierarchy too. */
        if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
-                r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
-                if (r < 0) {
-                        *exit_status = EXIT_CGROUP;
-                        return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
-                }
-
-                r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0755, uid, gid);
+                r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
                if (r < 0) {
                        *exit_status = EXIT_CGROUP;
                        return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
--- a/src/core/mount.c
+++ b/src/core/mount.c
@ -938,9 +938,6 @@ static void mount_enter_mounting(Mount *m) {

        assert(m);

-        m->control_command_id = MOUNT_EXEC_MOUNT;
-        m->control_command = m->exec_command + MOUNT_EXEC_MOUNT;
-
        r = unit_fail_if_symlink(UNIT(m), m->where);
        if (r < 0)
                goto fail;
@ -949,6 +946,11 @@ static void mount_enter_mounting(Mount *m) {

        unit_warn_if_dir_nonempty(UNIT(m), m->where);

+        unit_warn_leftover_processes(UNIT(m));
+
+        m->control_command_id = MOUNT_EXEC_MOUNT;
+        m->control_command = m->exec_command + MOUNT_EXEC_MOUNT;
+
        /* Create the source directory for bind-mounts if needed */
        p = get_mount_parameters_fragment(m);
        if (p && mount_is_bind(p))
--- a/src/core/scope.c
+++ b/src/core/scope.c
@ -460,18 +460,6 @@ static int scope_deserialize_item(Unit *u, const char *key, const char *value, F
        return 0;
 }

-static bool scope_check_gc(Unit *u) {
-        assert(u);
-
-        /* Never clean up scopes that still have a process around,
-         * even if the scope is formally dead. */
-
-        if (!u->cgroup_path)
-                return false;
-
-        return cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path) <= 0;
-}
-
 static void scope_notify_cgroup_empty_event(Unit *u) {
        Scope *s = SCOPE(u);
        assert(u);
@ -639,8 +627,6 @@ const UnitVTable scope_vtable = {
        .active_state = scope_active_state,
        .sub_state_to_string = scope_sub_state_to_string,

-        .check_gc = scope_check_gc,
-
        .sigchld_event = scope_sigchld_event,

        .reset_failed = scope_reset_failed,
--- a/src/core/service.c
+++ b/src/core/service.c
@ -1343,11 +1343,6 @@ static int service_spawn(
        if (!final_env)
                return -ENOMEM;

-        if ((flags & EXEC_IS_CONTROL) && UNIT(s)->cgroup_path) {
-                exec_params.cgroup_path = strjoina(UNIT(s)->cgroup_path, "/control");
-                (void) cg_create(SYSTEMD_CGROUP_CONTROLLER, exec_params.cgroup_path);
-        }
-
        /* System services should get a new keyring by default. */
        SET_FLAG(exec_params.flags, EXEC_NEW_KEYRING, MANAGER_IS_SYSTEM(UNIT(s)->manager));

@ -1789,39 +1784,22 @@ fail:
        service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
 }

-static void service_kill_control_processes(Service *s) {
+static void service_kill_control_process(Service *s) {
        int r;

        assert(s);

-        if (s->control_pid > 0) {
-                r = kill_and_sigcont(s->control_pid, SIGKILL);
-                if (r < 0) {
-                        _cleanup_free_ char *comm = NULL;
+        if (s->control_pid <= 0)
+                return;

-                        (void) get_process_comm(s->control_pid, &comm);
+        r = kill_and_sigcont(s->control_pid, SIGKILL);
+        if (r < 0) {
+                _cleanup_free_ char *comm = NULL;

-                        log_unit_debug_errno(UNIT(s), r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m",
-                                             s->control_pid, strna(comm));
-                }
-        }
+                (void) get_process_comm(s->control_pid, &comm);

-        if (UNIT(s)->cgroup_path) {
-                _cleanup_set_free_ Set *pid_set = NULL;
-                char *p;
-
-                if (s->control_pid > 0) {
-                        r = set_make(&pid_set, PID_TO_PTR(s->control_pid), NULL);
-                        if (r < 0) {
-                                log_oom();
-                                return;
-                        }
-                }
-
-                p = strjoina(UNIT(s)->cgroup_path, "/control");
-                r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, p, SIGKILL, CGROUP_SIGCONT|CGROUP_IGNORE_SELF|CGROUP_REMOVE, pid_set, NULL, NULL);
-                if (r < 0)
-                        log_unit_debug_errno(UNIT(s), r, "Failed to send SIGKILL to processes of control group %s: %m", p);
+                log_unit_debug_errno(UNIT(s), r, "Failed to kill control process " PID_FMT " (%s), ignoring: %m",
+                                     s->control_pid, strna(comm));
        }
 }

@ -1836,10 +1814,7 @@ static void service_enter_start(Service *s) {
        service_unwatch_control_pid(s);
        service_unwatch_main_pid(s);

-        /* We want to ensure that nobody leaks processes from
-         * START_PRE here, so let's go on a killing spree, People
-         * should not spawn long running processes from START_PRE. */
-        service_kill_control_processes(s);
+        unit_warn_leftover_processes(UNIT(s));

        if (s->type == SERVICE_FORKING) {
                s->control_command_id = SERVICE_EXEC_START;
@ -1927,9 +1902,8 @@ static void service_enter_start_pre(Service *s) {

        s->control_command = s->exec_command[SERVICE_EXEC_START_PRE];
        if (s->control_command) {
-                /* Before we start anything, let's clear up what might
-                 * be left from previous runs. */
-                service_kill_control_processes(s);
+
+                unit_warn_leftover_processes(UNIT(s));

                s->control_command_id = SERVICE_EXEC_START_PRE;

@ -2746,10 +2720,11 @@ static bool service_check_gc(Unit *u) {

        assert(s);

-        /* Never clean up services that still have a process around,
-         * even if the service is formally dead. */
-        if (cgroup_good(s) > 0 ||
-            main_pid_good(s) > 0 ||
+        /* Never clean up services that still have a process around, even if the service is formally dead. Note that
+         * unit_check_gc() already checked our cgroup for us, we just check our two additional PIDs, too, in case they
+         * have moved outside of the cgroup. */
+
+        if (main_pid_good(s) > 0 ||
            control_pid_good(s) > 0)
                return true;

@ -3084,11 +3059,6 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
                if (s->result == SERVICE_SUCCESS)
                        s->result = f;

-                /* Immediately get rid of the cgroup, so that the
-                 * kernel doesn't delay the cgroup empty messages for
-                 * the service cgroup any longer than necessary */
-                service_kill_control_processes(s);
-
                if (s->control_command &&
                    s->control_command->command_next &&
                    f == SERVICE_SUCCESS) {
@ -3251,7 +3221,7 @@ static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *us

        case SERVICE_RELOAD:
                log_unit_warning(UNIT(s), "Reload operation timed out. Killing reload process.");
-                service_kill_control_processes(s);
+                service_kill_control_process(s);
                s->reload_result = SERVICE_FAILURE_TIMEOUT;
                service_enter_running(s, SERVICE_SUCCESS);
                break;
--- a/src/core/socket.c
+++ b/src/core/socket.c
@ -2187,6 +2187,9 @@ static void socket_enter_start_pre(Socket *s) {
        assert(s);

        socket_unwatch_control_pid(s);
+
+        unit_warn_leftover_processes(UNIT(s));
+
        s->control_command_id = SOCKET_EXEC_START_PRE;
        s->control_command = s->exec_command[SOCKET_EXEC_START_PRE];

--- a/src/core/swap.c
+++ b/src/core/swap.c
@ -734,6 +734,8 @@ static void swap_enter_activating(Swap *s) {

        assert(s);

+        unit_warn_leftover_processes(UNIT(s));
+
        s->control_command_id = SWAP_EXEC_ACTIVATE;
        s->control_command = s->exec_command + SWAP_EXEC_ACTIVATE;

--- a/src/core/unit.c
+++ b/src/core/unit.c
@ -108,6 +108,7 @@ Unit *unit_new(Manager *m, size_t size) {
        u->ref_uid = UID_INVALID;
        u->ref_gid = GID_INVALID;
        u->cpu_usage_last = NSEC_INFINITY;
+        u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;

        u->ip_accounting_ingress_map_fd = -1;
        u->ip_accounting_egress_map_fd = -1;
@ -333,6 +334,7 @@ int unit_set_description(Unit *u, const char *description) {

 bool unit_check_gc(Unit *u) {
        UnitActiveState state;
+        int r;

        assert(u);

@ -380,6 +382,17 @@ bool unit_check_gc(Unit *u) {
                assert_not_reached("Unknown garbage collection mode");
        }

+        if (u->cgroup_path) {
+                /* If the unit has a cgroup, then check whether there's anything in it. If so, we should stay
+                 * around. Units with active processes should never be collected. */
+
+                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+                if (r < 0)
+                        log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
+                if (r <= 0)
+                        return true;
+        }
+
        if (UNIT_VTABLE(u)->check_gc)
                if (UNIT_VTABLE(u)->check_gc(u))
                        return true;
@ -5183,6 +5196,31 @@ int unit_prepare_exec(Unit *u) {
        return 0;
 }

+static void log_leftover(pid_t pid, int sig, void *userdata) {
+        _cleanup_free_ char *comm = NULL;
+
+        (void) get_process_comm(pid, &comm);
+
+        if (comm && comm[0] == '(') /* Most likely our own helper process (PAM?), ignore */
+                return;
+
+        log_unit_warning(userdata,
+                         "Found left-over process " PID_FMT " (%s) in control group while starting unit. Ignoring.\n"
+                         "This usually indicates unclean termination of a previous run, or service implementation deficiencies.",
+                         pid, strna(comm));
+}
+
+void unit_warn_leftover_processes(Unit *u) {
+        assert(u);
+
+        (void) unit_pick_cgroup_path(u);
+
+        if (!u->cgroup_path)
+                return;
+
+        (void) cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, 0, 0, NULL, log_leftover, u);
+}
+
 static const char* const collect_mode_table[_COLLECT_MODE_MAX] = {
        [COLLECT_INACTIVE] = "inactive",
        [COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed",
--- a/src/core/unit.h
+++ b/src/core/unit.h
@ -768,6 +768,8 @@ void unit_unlink_state_files(Unit *u);

 int unit_prepare_exec(Unit *u);

+void unit_warn_leftover_processes(Unit *u);
+
 /* Macros which append UNIT= or USER_UNIT= to the message */

 #define log_unit_full(unit, level, error, ...)                          \