cgroup: after determining that a cgroup is empty, asynchronously dispatch this

This makes sure that if we learn via inotify or another event source
that a cgroup is empty, and we checked that this is indeed the case (as
we might get spurious notifications through inotify, as the inotify
logic through the "cgroups.event" is pretty unspecific and might be
trigger for a variety of reasons), then we'll enqueue a defer event for
it, at a priority lower than SIGCHLD handling, so that we know for sure
that if there's waitid() data for a process we used it before
considering the cgroup empty notification.

Fixes: #6608
This commit is contained in:
Lennart Poettering 2017-09-26 22:43:08 +02:00
parent 91a6073ef7
commit 09e2465407
7 changed files with 112 additions and 24 deletions

View file

@ -1794,17 +1794,28 @@ int unit_watch_all_pids(Unit *u) {
return unit_watch_pids_in_path(u, u->cgroup_path);
}
int unit_notify_cgroup_empty(Unit *u) {
static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
Manager *m = userdata;
Unit *u;
int r;
assert(u);
assert(s);
assert(m);
if (!u->cgroup_path)
u = m->cgroup_empty_queue;
if (!u)
return 0;
r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
if (r <= 0)
return r;
assert(u->in_cgroup_empty_queue);
u->in_cgroup_empty_queue = false;
LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
if (m->cgroup_empty_queue) {
/* More stuff queued, let's make sure we remain enabled */
r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
if (r < 0)
log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
}
unit_add_to_gc_queue(u);
@ -1814,6 +1825,51 @@ int unit_notify_cgroup_empty(Unit *u) {
return 0;
}
void unit_add_to_cgroup_empty_queue(Unit *u) {
int r;
assert(u);
/* Note that there are four different ways how cgroup empty events reach us:
*
* 1. On the unified hierarchy we get an inotify event on the cgroup
*
* 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
*
* 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
*
* 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
* soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
*
* Regardless which way we got the notification, we'll verify it here, and then add it to a separate
* queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
* SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
* (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
* case for scope units). */
if (u->in_cgroup_empty_queue)
return;
/* Let's verify that the cgroup is really empty */
if (!u->cgroup_path)
return;
r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
if (r < 0) {
log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
return;
}
if (r == 0)
return;
LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
u->in_cgroup_empty_queue = true;
/* Trigger the defer event */
r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
if (r < 0)
log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
Manager *m = userdata;
@ -1853,7 +1909,7 @@ static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents,
* this here safely. */
continue;
(void) unit_notify_cgroup_empty(u);
unit_add_to_cgroup_empty_queue(u);
}
}
}
@ -1918,11 +1974,26 @@ int manager_setup_cgroup(Manager *m) {
log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
}
/* 3. Install agent */
/* 3. Allocate cgroup empty defer event source */
m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
if (r < 0)
return log_error_errno(r, "Failed to create cgroup empty event source: %m");
r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
if (r < 0)
return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
if (r < 0)
return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
(void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
/* 4. Install notifier inotify object, or agent */
if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
/* In the unified hierarchy we can get
* cgroup empty notifications via inotify. */
/* In the unified hierarchy we can get cgroup empty notifications via inotify. */
m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
safe_close(m->cgroup_inotify_fd);
@ -1937,7 +2008,7 @@ int manager_setup_cgroup(Manager *m) {
/* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
* see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-5);
r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
if (r < 0)
return log_error_errno(r, "Failed to set priority of inotify event source: %m");
@ -1957,33 +2028,31 @@ int manager_setup_cgroup(Manager *m) {
log_debug("Release agent already installed.");
}
/* 4. Make sure we are in the special "init.scope" unit in the root slice. */
/* 5. Make sure we are in the special "init.scope" unit in the root slice. */
scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
if (r < 0)
return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
/* also, move all other userspace processes remaining
* in the root cgroup into that scope. */
/* Also, move all other userspace processes remaining in the root cgroup into that scope. */
r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
if (r < 0)
log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
/* 5. And pin it, so that it cannot be unmounted */
/* 6. And pin it, so that it cannot be unmounted */
safe_close(m->pin_cgroupfs_fd);
m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
if (m->pin_cgroupfs_fd < 0)
return log_error_errno(errno, "Failed to open pin file: %m");
/* 6. Always enable hierarchical support if it exists... */
/* 7. Always enable hierarchical support if it exists... */
if (!all_unified && m->test_run_flags == 0)
(void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
/* 7. Figure out which controllers are supported */
/* 8. Figure out which controllers are supported, and log about it */
r = cg_mask_supported(&m->cgroup_supported);
if (r < 0)
return log_error_errno(r, "Failed to determine supported controllers: %m");
for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
@ -1998,6 +2067,8 @@ void manager_shutdown_cgroup(Manager *m, bool delete) {
if (delete && m->cgroup_root)
(void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
@ -2079,13 +2150,17 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
assert(m);
assert(cgroup);
/* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
* or from the --system instance */
log_debug("Got cgroup empty notification for: %s", cgroup);
u = manager_get_unit_by_cgroup(m, cgroup);
if (!u)
return 0;
return unit_notify_cgroup_empty(u);
unit_add_to_cgroup_empty_queue(u);
return 1;
}
int unit_get_memory_current(Unit *u, uint64_t *ret) {

View file

@ -172,6 +172,8 @@ void unit_release_cgroup(Unit *u);
void unit_prune_cgroup(Unit *u);
int unit_watch_cgroup(Unit *u);
void unit_add_to_cgroup_empty_queue(Unit *u);
int unit_attach_pids_to_cgroup(Unit *u);
int manager_setup_cgroup(Manager *m);
@ -200,7 +202,6 @@ int unit_reset_ip_accounting(Unit *u);
cc ? cc->name : false; \
})
int unit_notify_cgroup_empty(Unit *u);
int manager_notify_cgroup_empty(Manager *m, const char *group);
void unit_invalidate_cgroup(Unit *u, CGroupMask m);

View file

@ -859,7 +859,7 @@ static int manager_setup_cgroups_agent(Manager *m) {
* SIGCHLD signals, so that a cgroup running empty is always just the last safety net of notification,
* and we collected the metadata the notification and SIGCHLD stuff offers first. Also see handling of
* cgroup inotify for the unified cgroup stuff. */
r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-5);
r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-4);
if (r < 0)
return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m");

View file

@ -121,6 +121,9 @@ struct Manager {
/* Units that should be realized */
LIST_HEAD(Unit, cgroup_realize_queue);
/* Units whose cgroup ran empty */
LIST_HEAD(Unit, cgroup_empty_queue);
sd_event *event;
/* We use two hash tables here, since the same PID might be
@ -229,12 +232,14 @@ struct Manager {
CGroupMask cgroup_supported;
char *cgroup_root;
/* Notifications from cgroups, when the unified hierarchy is
* used is done via inotify. */
/* Notifications from cgroups, when the unified hierarchy is used is done via inotify. */
int cgroup_inotify_fd;
sd_event_source *cgroup_inotify_event_source;
Hashmap *cgroup_inotify_wd_unit;
/* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */
sd_event_source *cgroup_empty_event_source;
/* Make sure the user cannot accidentally unmount our cgroup
* file system */
int pin_cgroupfs_fd;

View file

@ -3213,7 +3213,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
/* If the PID set is empty now, then let's finish this off
(On unified we use proper notifications) */
if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) == 0 && set_isempty(u->pids))
service_notify_cgroup_empty_event(u);
unit_add_to_cgroup_empty_queue(u);
}
static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {

View file

@ -591,6 +591,9 @@ void unit_free(Unit *u) {
if (u->in_cgroup_realize_queue)
LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
if (u->in_cgroup_empty_queue)
LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
unit_release_cgroup(u);
unit_unref_uid_gid(u, false);

View file

@ -168,6 +168,9 @@ struct Unit {
/* CGroup realize members queue */
LIST_FIELDS(Unit, cgroup_realize_queue);
/* cgroup empty queue */
LIST_FIELDS(Unit, cgroup_empty_queue);
/* PIDs we keep an eye on. Note that a unit might have many
* more, but these are the ones we care enough about to
* process SIGCHLD for */
@ -264,6 +267,7 @@ struct Unit {
bool in_cleanup_queue:1;
bool in_gc_queue:1;
bool in_cgroup_realize_queue:1;
bool in_cgroup_empty_queue:1;
bool sent_dbus_new_signal:1;