Merge pull request #6928 from poettering/cgroup-empty-race

rework cgroup empty notification handling (i.e. a fix for #6608)
This commit is contained in:
Zbigniew Jędrzejewski-Szmek 2017-09-28 08:48:21 +02:00 committed by GitHub
commit 9500b9209b
14 changed files with 168 additions and 65 deletions

View file

@ -325,6 +325,9 @@ static void automount_enter_dead(Automount *a, AutomountResult f) {
if (a->result == AUTOMOUNT_SUCCESS)
a->result = f;
if (a->result != AUTOMOUNT_SUCCESS)
log_unit_warning(UNIT(a), "Failed with result '%s'.", automount_result_to_string(a->result));
automount_set_state(a, a->result != AUTOMOUNT_SUCCESS ? AUTOMOUNT_FAILED : AUTOMOUNT_DEAD);
}

View file

@ -1496,9 +1496,9 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
assert(u);
if (u->in_cgroup_queue) {
LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
u->in_cgroup_queue = false;
if (u->in_cgroup_realize_queue) {
LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
u->in_cgroup_realize_queue = false;
}
target_mask = unit_get_target_mask(u);
@ -1532,26 +1532,28 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
return 0;
}
static void unit_add_to_cgroup_queue(Unit *u) {
static void unit_add_to_cgroup_realize_queue(Unit *u) {
assert(u);
if (u->in_cgroup_queue)
if (u->in_cgroup_realize_queue)
return;
LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
u->in_cgroup_queue = true;
LIST_PREPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
u->in_cgroup_realize_queue = true;
}
unsigned manager_dispatch_cgroup_queue(Manager *m) {
unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
ManagerState state;
unsigned n = 0;
Unit *i;
int r;
assert(m);
state = manager_state(m);
while ((i = m->cgroup_queue)) {
assert(i->in_cgroup_queue);
while ((i = m->cgroup_realize_queue)) {
assert(i->in_cgroup_realize_queue);
r = unit_realize_cgroup_now(i, state);
if (r < 0)
@ -1563,7 +1565,7 @@ unsigned manager_dispatch_cgroup_queue(Manager *m) {
return n;
}
static void unit_queue_siblings(Unit *u) {
static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
Unit *slice;
/* This adds the siblings of the specified unit and the
@ -1597,7 +1599,7 @@ static void unit_queue_siblings(Unit *u) {
unit_get_needs_bpf(m)))
continue;
unit_add_to_cgroup_queue(m);
unit_add_to_cgroup_realize_queue(m);
}
u = slice;
@ -1622,7 +1624,7 @@ int unit_realize_cgroup(Unit *u) {
* iteration. */
/* Add all sibling slices to the cgroup queue. */
unit_queue_siblings(u);
unit_add_siblings_to_cgroup_realize_queue(u);
/* And realize this one now (and apply the values) */
return unit_realize_cgroup_now(u, manager_state(u->manager));
@ -1792,17 +1794,28 @@ int unit_watch_all_pids(Unit *u) {
return unit_watch_pids_in_path(u, u->cgroup_path);
}
int unit_notify_cgroup_empty(Unit *u) {
static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
Manager *m = userdata;
Unit *u;
int r;
assert(u);
assert(s);
assert(m);
if (!u->cgroup_path)
u = m->cgroup_empty_queue;
if (!u)
return 0;
r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
if (r <= 0)
return r;
assert(u->in_cgroup_empty_queue);
u->in_cgroup_empty_queue = false;
LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
if (m->cgroup_empty_queue) {
/* More stuff queued, let's make sure we remain enabled */
r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
if (r < 0)
log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
}
unit_add_to_gc_queue(u);
@ -1812,6 +1825,51 @@ int unit_notify_cgroup_empty(Unit *u) {
return 0;
}
void unit_add_to_cgroup_empty_queue(Unit *u) {
int r;
assert(u);
/* Note that there are four different ways how cgroup empty events reach us:
*
* 1. On the unified hierarchy we get an inotify event on the cgroup
*
* 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
*
* 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
*
* 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
* soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
*
* Regardless which way we got the notification, we'll verify it here, and then add it to a separate
* queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
* SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
* (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
* case for scope units). */
if (u->in_cgroup_empty_queue)
return;
/* Let's verify that the cgroup is really empty */
if (!u->cgroup_path)
return;
r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
if (r < 0) {
log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
return;
}
if (r == 0)
return;
LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
u->in_cgroup_empty_queue = true;
/* Trigger the defer event */
r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
if (r < 0)
log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
Manager *m = userdata;
@ -1826,7 +1884,7 @@ static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents,
l = read(fd, &buffer, sizeof(buffer));
if (l < 0) {
if (errno == EINTR || errno == EAGAIN)
if (IN_SET(errno, EINTR, EAGAIN))
return 0;
return log_error_errno(errno, "Failed to read control group inotify events: %m");
@ -1851,7 +1909,7 @@ static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents,
* this here safely. */
continue;
(void) unit_notify_cgroup_empty(u);
unit_add_to_cgroup_empty_queue(u);
}
}
}
@ -1916,11 +1974,26 @@ int manager_setup_cgroup(Manager *m) {
log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
}
/* 3. Install agent */
/* 3. Allocate cgroup empty defer event source */
m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
if (r < 0)
return log_error_errno(r, "Failed to create cgroup empty event source: %m");
r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
if (r < 0)
return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
if (r < 0)
return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
(void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
/* 4. Install notifier inotify object, or agent */
if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
/* In the unified hierarchy we can get
* cgroup empty notifications via inotify. */
/* In the unified hierarchy we can get cgroup empty notifications via inotify. */
m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
safe_close(m->cgroup_inotify_fd);
@ -1935,7 +2008,7 @@ int manager_setup_cgroup(Manager *m) {
/* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
* see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-5);
r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
if (r < 0)
return log_error_errno(r, "Failed to set priority of inotify event source: %m");
@ -1955,33 +2028,31 @@ int manager_setup_cgroup(Manager *m) {
log_debug("Release agent already installed.");
}
/* 4. Make sure we are in the special "init.scope" unit in the root slice. */
/* 5. Make sure we are in the special "init.scope" unit in the root slice. */
scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
if (r < 0)
return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
/* also, move all other userspace processes remaining
* in the root cgroup into that scope. */
/* Also, move all other userspace processes remaining in the root cgroup into that scope. */
r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
if (r < 0)
log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
/* 5. And pin it, so that it cannot be unmounted */
/* 6. And pin it, so that it cannot be unmounted */
safe_close(m->pin_cgroupfs_fd);
m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
if (m->pin_cgroupfs_fd < 0)
return log_error_errno(errno, "Failed to open pin file: %m");
/* 6. Always enable hierarchical support if it exists... */
/* 7. Always enable hierarchical support if it exists... */
if (!all_unified && m->test_run_flags == 0)
(void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
/* 7. Figure out which controllers are supported */
/* 8. Figure out which controllers are supported, and log about it */
r = cg_mask_supported(&m->cgroup_supported);
if (r < 0)
return log_error_errno(r, "Failed to determine supported controllers: %m");
for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
@ -1996,6 +2067,8 @@ void manager_shutdown_cgroup(Manager *m, bool delete) {
if (delete && m->cgroup_root)
(void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
@ -2077,13 +2150,17 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
assert(m);
assert(cgroup);
/* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
* or from the --system instance */
log_debug("Got cgroup empty notification for: %s", cgroup);
u = manager_get_unit_by_cgroup(m, cgroup);
if (!u)
return 0;
return unit_notify_cgroup_empty(u);
unit_add_to_cgroup_empty_queue(u);
return 1;
}
int unit_get_memory_current(Unit *u, uint64_t *ret) {
@ -2329,7 +2406,7 @@ void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
return;
u->cgroup_realized_mask &= ~m;
unit_add_to_cgroup_queue(u);
unit_add_to_cgroup_realize_queue(u);
}
void unit_invalidate_cgroup_bpf(Unit *u) {
@ -2342,7 +2419,7 @@ void unit_invalidate_cgroup_bpf(Unit *u) {
return;
u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
unit_add_to_cgroup_queue(u);
unit_add_to_cgroup_realize_queue(u);
/* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
* list of our children includes our own. */

View file

@ -172,12 +172,14 @@ void unit_release_cgroup(Unit *u);
void unit_prune_cgroup(Unit *u);
int unit_watch_cgroup(Unit *u);
void unit_add_to_cgroup_empty_queue(Unit *u);
int unit_attach_pids_to_cgroup(Unit *u);
int manager_setup_cgroup(Manager *m);
void manager_shutdown_cgroup(Manager *m, bool delete);
unsigned manager_dispatch_cgroup_queue(Manager *m);
unsigned manager_dispatch_cgroup_realize_queue(Manager *m);
Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid);
@ -200,7 +202,6 @@ int unit_reset_ip_accounting(Unit *u);
cc ? cc->name : false; \
})
int unit_notify_cgroup_empty(Unit *u);
int manager_notify_cgroup_empty(Manager *m, const char *group);
void unit_invalidate_cgroup(Unit *u, CGroupMask m);

View file

@ -859,7 +859,7 @@ static int manager_setup_cgroups_agent(Manager *m) {
* SIGCHLD signals, so that a cgroup running empty is always just the last safety net of notification,
* and we collected the metadata the notification and SIGCHLD stuff offers first. Also see handling of
* cgroup inotify for the unified cgroup stuff. */
r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-5);
r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-4);
if (r < 0)
return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m");
@ -2367,7 +2367,7 @@ int manager_loop(Manager *m) {
if (manager_dispatch_cleanup_queue(m) > 0)
continue;
if (manager_dispatch_cgroup_queue(m) > 0)
if (manager_dispatch_cgroup_realize_queue(m) > 0)
continue;
if (manager_dispatch_dbus_queue(m) > 0)

View file

@ -119,7 +119,10 @@ struct Manager {
LIST_HEAD(Job, gc_job_queue);
/* Units that should be realized */
LIST_HEAD(Unit, cgroup_queue);
LIST_HEAD(Unit, cgroup_realize_queue);
/* Units whose cgroup ran empty */
LIST_HEAD(Unit, cgroup_empty_queue);
sd_event *event;
@ -229,12 +232,14 @@ struct Manager {
CGroupMask cgroup_supported;
char *cgroup_root;
/* Notifications from cgroups, when the unified hierarchy is
* used is done via inotify. */
/* Notifications from cgroups, when the unified hierarchy is used is done via inotify. */
int cgroup_inotify_fd;
sd_event_source *cgroup_inotify_event_source;
Hashmap *cgroup_inotify_wd_unit;
/* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */
sd_event_source *cgroup_empty_event_source;
/* Make sure the user cannot accidentally unmount our cgroup
* file system */
int pin_cgroupfs_fd;

View file

@ -796,6 +796,9 @@ static void mount_enter_dead(Mount *m, MountResult f) {
if (m->result == MOUNT_SUCCESS)
m->result = f;
if (m->result != MOUNT_SUCCESS)
log_unit_warning(UNIT(m), "Failed with result '%s'.", mount_result_to_string(m->result));
mount_set_state(m, m->result != MOUNT_SUCCESS ? MOUNT_FAILED : MOUNT_DEAD);
exec_runtime_destroy(m->exec_runtime);

View file

@ -457,6 +457,9 @@ static void path_enter_dead(Path *p, PathResult f) {
if (p->result == PATH_SUCCESS)
p->result = f;
if (p->result != PATH_SUCCESS)
log_unit_warning(UNIT(p), "Failed with result '%s'.", path_result_to_string(p->result));
path_set_state(p, p->result != PATH_SUCCESS ? PATH_FAILED : PATH_DEAD);
}

View file

@ -253,6 +253,9 @@ static void scope_enter_dead(Scope *s, ScopeResult f) {
if (s->result == SCOPE_SUCCESS)
s->result = f;
if (s->result != SCOPE_SUCCESS)
log_unit_warning(UNIT(s), "Failed with result '%s'.", scope_result_to_string(s->result));
scope_set_state(s, s->result != SCOPE_SUCCESS ? SCOPE_FAILED : SCOPE_DEAD);
}

View file

@ -1512,12 +1512,13 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart)
if (s->result == SERVICE_SUCCESS)
s->result = f;
if (s->result != SERVICE_SUCCESS)
log_unit_warning(UNIT(s), "Failed with result '%s'.", service_result_to_string(s->result));
service_set_state(s, s->result != SERVICE_SUCCESS ? SERVICE_FAILED : SERVICE_DEAD);
if (s->result != SERVICE_SUCCESS) {
log_unit_warning(UNIT(s), "Failed with result '%s'.", service_result_to_string(s->result));
if (s->result != SERVICE_SUCCESS)
emergency_action(UNIT(s)->manager, s->emergency_action, UNIT(s)->reboot_arg, "service failed");
}
if (allow_restart && service_shall_restart(s)) {
@ -3213,7 +3214,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
/* If the PID set is empty now, then let's finish this off
(On unified we use proper notifications) */
if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) == 0 && set_isempty(u->pids))
service_notify_cgroup_empty_event(u);
unit_add_to_cgroup_empty_queue(u);
}
static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {

View file

@ -2000,6 +2000,9 @@ static void socket_enter_dead(Socket *s, SocketResult f) {
if (s->result == SOCKET_SUCCESS)
s->result = f;
if (s->result != SOCKET_SUCCESS)
log_unit_warning(UNIT(s), "Failed with result '%s'.", socket_result_to_string(s->result));
socket_set_state(s, s->result != SOCKET_SUCCESS ? SOCKET_FAILED : SOCKET_DEAD);
exec_runtime_destroy(s->exec_runtime);

View file

@ -665,6 +665,9 @@ static void swap_enter_dead(Swap *s, SwapResult f) {
if (s->result == SWAP_SUCCESS)
s->result = f;
if (s->result != SWAP_SUCCESS)
log_unit_warning(UNIT(s), "Failed with result '%s'.", swap_result_to_string(s->result));
swap_set_state(s, s->result != SWAP_SUCCESS ? SWAP_FAILED : SWAP_DEAD);
exec_runtime_destroy(s->exec_runtime);

View file

@ -296,6 +296,9 @@ static void timer_enter_dead(Timer *t, TimerResult f) {
if (t->result == TIMER_SUCCESS)
t->result = f;
if (t->result != TIMER_SUCCESS)
log_unit_warning(UNIT(t), "Failed with result '%s'.", timer_result_to_string(t->result));
timer_set_state(t, t->result != TIMER_SUCCESS ? TIMER_FAILED : TIMER_DEAD);
}

View file

@ -314,22 +314,16 @@ int unit_choose_id(Unit *u, const char *name) {
}
int unit_set_description(Unit *u, const char *description) {
char *s;
int r;
assert(u);
if (isempty(description))
s = NULL;
else {
s = strdup(description);
if (!s)
return -ENOMEM;
}
r = free_and_strdup(&u->description, empty_to_null(description));
if (r < 0)
return r;
if (r > 0)
unit_add_to_dbus_queue(u);
free(u->description);
u->description = s;
unit_add_to_dbus_queue(u);
return 0;
}
@ -588,8 +582,11 @@ void unit_free(Unit *u) {
if (u->in_gc_queue)
LIST_REMOVE(gc_queue, u->manager->gc_unit_queue, u);
if (u->in_cgroup_queue)
LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
if (u->in_cgroup_realize_queue)
LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
if (u->in_cgroup_empty_queue)
LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
unit_release_cgroup(u);
@ -2275,7 +2272,7 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su
check_unneeded_dependencies(u);
if (ns != os && ns == UNIT_FAILED) {
log_unit_notice(u, "Unit entered failed state.");
log_unit_debug(u, "Unit entered failed state.");
unit_start_on_failure(u);
}
}

View file

@ -166,10 +166,10 @@ struct Unit {
LIST_FIELDS(Unit, gc_queue);
/* CGroup realize members queue */
LIST_FIELDS(Unit, cgroup_queue);
LIST_FIELDS(Unit, cgroup_realize_queue);
/* Units with the same CGroup netclass */
LIST_FIELDS(Unit, cgroup_netclass);
/* cgroup empty queue */
LIST_FIELDS(Unit, cgroup_empty_queue);
/* PIDs we keep an eye on. Note that a unit might have many
* more, but these are the ones we care enough about to
@ -266,7 +266,8 @@ struct Unit {
bool in_dbus_queue:1;
bool in_cleanup_queue:1;
bool in_gc_queue:1;
bool in_cgroup_queue:1;
bool in_cgroup_realize_queue:1;
bool in_cgroup_empty_queue:1;
bool sent_dbus_new_signal:1;