diff --git a/src/cgroups-agent/cgroups-agent.c b/src/cgroups-agent/cgroups-agent.c index aadfba0707..333ce110d3 100644 --- a/src/cgroups-agent/cgroups-agent.c +++ b/src/cgroups-agent/cgroups-agent.c @@ -18,15 +18,22 @@ ***/ #include +#include -#include "sd-bus.h" - -#include "bus-util.h" +#include "fd-util.h" #include "log.h" +#include "socket-util.h" int main(int argc, char *argv[]) { - _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; - int r; + + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/cgroups-agent", + }; + + _cleanup_close_ int fd = -1; + ssize_t n; + size_t l; if (argc != 2) { log_error("Incorrect number of arguments."); @@ -37,27 +44,22 @@ int main(int argc, char *argv[]) { log_parse_environment(); log_open(); - /* We send this event to the private D-Bus socket and then the - * system instance will forward this to the system bus. We do - * this to avoid an activation loop when we start dbus when we - * are called when the dbus service is shut down. */ - - r = bus_connect_system_systemd(&bus); - if (r < 0) { - /* If we couldn't connect we assume this was triggered - * while systemd got restarted/transitioned from - * initrd to the system, so let's ignore this */ - log_debug_errno(r, "Failed to get D-Bus connection: %m"); + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0); + if (fd < 0) { + log_debug_errno(errno, "Failed to allocate socket: %m"); return EXIT_FAILURE; } - r = sd_bus_emit_signal(bus, - "/org/freedesktop/systemd1/agent", - "org.freedesktop.systemd1.Agent", - "Released", - "s", argv[1]); - if (r < 0) { - log_debug_errno(r, "Failed to send signal message on private connection: %m"); + l = strlen(argv[1]); + + n = sendto(fd, argv[1], l, 0, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(sa.un.sun_path)); + if (n < 0) { + log_debug_errno(errno, "Failed to send cgroups agent message: %m"); + return EXIT_FAILURE; + } + + if ((size_t) n != l) { + log_debug("Datagram size mismatch"); return EXIT_FAILURE; } diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 25cc6962f9..1a94b188cb 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1312,7 +1312,9 @@ int manager_setup_cgroup(Manager *m) { if (r < 0) return log_error_errno(r, "Failed to watch control group inotify object: %m"); - r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5); + /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also + * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */ + r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-5); if (r < 0) return log_error_errno(r, "Failed to set priority of inotify event source: %m"); @@ -1458,6 +1460,8 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) { assert(m); assert(cgroup); + log_debug("Got cgroup empty notification for: %s", cgroup); + u = manager_get_unit_by_cgroup(m, cgroup); if (!u) return 0; diff --git a/src/core/dbus.c b/src/core/dbus.c index 263955d874..c8375a0475 100644 --- a/src/core/dbus.c +++ b/src/core/dbus.c @@ -71,28 +71,42 @@ int bus_send_queued_message(Manager *m) { return 0; } +int bus_forward_agent_released(Manager *m, const char *path) { + int r; + + assert(m); + assert(path); + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + if (!m->system_bus) + return 0; + + /* If we are running a system instance we forward the agent message on the system bus, so that the user + * instances get notified about this, too */ + + r = sd_bus_emit_signal(m->system_bus, + "/org/freedesktop/systemd1/agent", + "org.freedesktop.systemd1.Agent", + "Released", + "s", path); + if (r < 0) + return log_warning_errno(r, "Failed to propagate agent release message: %m"); + + return 1; +} + static int signal_agent_released(sd_bus_message *message, void *userdata, sd_bus_error *error) { _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; - const char *cgroup, *me; Manager *m = userdata; + const char *cgroup; uid_t sender_uid; - sd_bus *bus; int r; assert(message); assert(m); - /* ignore recursive events sent by us on the system/user bus */ - bus = sd_bus_message_get_bus(message); - if (!sd_bus_is_server(bus)) { - r = sd_bus_get_unique_name(bus, &me); - if (r < 0) - return r; - - if (streq_ptr(sd_bus_message_get_sender(message), me)) - return 0; - } - /* only accept org.freedesktop.systemd1.Agent from UID=0 */ r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); if (r < 0) @@ -110,16 +124,6 @@ static int signal_agent_released(sd_bus_message *message, void *userdata, sd_bus } manager_notify_cgroup_empty(m, cgroup); - - /* if running as system-instance, forward under our name */ - if (MANAGER_IS_SYSTEM(m) && m->system_bus) { - r = sd_bus_message_rewind(message, 1); - if (r >= 0) - r = sd_bus_send(m->system_bus, message, NULL); - if (r < 0) - log_warning_errno(r, "Failed to forward Released message: %m"); - } - return 0; } @@ -690,25 +694,6 @@ static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void return 0; } - if (MANAGER_IS_SYSTEM(m)) { - /* When we run as system instance we get the Released - * signal via a direct connection */ - - r = sd_bus_add_match( - bus, - NULL, - "type='signal'," - "interface='org.freedesktop.systemd1.Agent'," - "member='Released'," - "path='/org/freedesktop/systemd1/agent'", - signal_agent_released, m); - - if (r < 0) { - log_warning_errno(r, "Failed to register Released match on new connection bus: %m"); - return 0; - } - } - r = bus_setup_disconnected_match(m, bus); if (r < 0) return 0; @@ -906,8 +891,8 @@ static int bus_setup_system(Manager *m, sd_bus *bus) { assert(m); assert(bus); - /* On kdbus or if we are a user instance we get the Released message via the system bus */ - if (MANAGER_IS_USER(m) || m->kdbus_fd >= 0) { + /* if we are a user instance we get the Released message via the system bus */ + if (MANAGER_IS_USER(m)) { r = sd_bus_add_match( bus, NULL, diff --git a/src/core/dbus.h b/src/core/dbus.h index e16a84fbb8..6baaffbd75 100644 --- a/src/core/dbus.h +++ b/src/core/dbus.h @@ -40,3 +40,5 @@ int bus_verify_manage_units_async(Manager *m, sd_bus_message *call, sd_bus_error int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error); int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error); int bus_verify_set_environment_async(Manager *m, sd_bus_message *call, sd_bus_error *error); + +int bus_forward_agent_released(Manager *m, const char *path); diff --git a/src/core/manager.c b/src/core/manager.c index bd00c224f4..17b940c11a 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -87,6 +87,7 @@ #include "watchdog.h" #define NOTIFY_RCVBUF_SIZE (8*1024*1024) +#define CGROUPS_AGENT_RCVBUF_SIZE (8*1024*1024) /* Initial delay and the interval for printing status messages about running jobs */ #define JOBS_IN_PROGRESS_WAIT_USEC (5*USEC_PER_SEC) @@ -94,6 +95,7 @@ #define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3 static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); @@ -484,11 +486,11 @@ static int manager_setup_signals(Manager *m) { (void) sd_event_source_set_description(m->signal_event_source, "manager-signal"); - /* Process signals a bit earlier than the rest of things, but - * later than notify_fd processing, so that the notify - * processing can still figure out to which process/service a - * message belongs, before we reap the process. */ - r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-5); + /* Process signals a bit earlier than the rest of things, but later than notify_fd processing, so that the + * notify processing can still figure out to which process/service a message belongs, before we reap the + * process. Also, process this before handling cgroup notifications, so that we always collect child exit + * status information before detecting that there's no process in a cgroup. */ + r = sd_event_source_set_priority(m->signal_event_source, SD_EVENT_PRIORITY_NORMAL-6); if (r < 0) return r; @@ -581,12 +583,12 @@ int manager_new(UnitFileScope scope, bool test_run, Manager **_m) { m->idle_pipe[0] = m->idle_pipe[1] = m->idle_pipe[2] = m->idle_pipe[3] = -1; - m->pin_cgroupfs_fd = m->notify_fd = m->signal_fd = m->time_change_fd = - m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->cgroup_inotify_fd = -1; + m->pin_cgroupfs_fd = m->notify_fd = m->cgroups_agent_fd = m->signal_fd = m->time_change_fd = + m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->cgroup_inotify_fd = + m->ask_password_inotify_fd = -1; m->current_job_id = 1; /* start as id #1, so that we can leave #0 around as "null-like" value */ - m->ask_password_inotify_fd = -1; m->have_ask_password = -EINVAL; /* we don't know */ m->first_boot = -1; @@ -722,8 +724,8 @@ static int manager_setup_notify(Manager *m) { if (r < 0) return log_error_errno(r, "Failed to allocate notify event source: %m"); - /* Process signals a bit earlier than SIGCHLD, so that we can - * still identify to which service an exit message belongs */ + /* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which + * service an exit message belongs. */ r = sd_event_source_set_priority(m->notify_event_source, SD_EVENT_PRIORITY_NORMAL-7); if (r < 0) return log_error_errno(r, "Failed to set priority of notify event source: %m"); @@ -734,6 +736,79 @@ static int manager_setup_notify(Manager *m) { return 0; } +static int manager_setup_cgroups_agent(Manager *m) { + + static const union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + .un.sun_path = "/run/systemd/cgroups-agent", + }; + int r; + + /* This creates a listening socket we receive cgroups agent messages on. We do not use D-Bus for delivering + * these messages from the cgroups agent binary to PID 1, as the cgroups agent binary is very short-living, and + * each instance of it needs a new D-Bus connection. Since D-Bus connections are SOCK_STREAM/AF_UNIX, on + * overloaded systems the backlog of the D-Bus socket becomes relevant, as not more than the configured number + * of D-Bus connections may be queued until the kernel will start dropping further incoming connections, + * possibly resulting in lost cgroups agent messages. To avoid this, we'll use a private SOCK_DGRAM/AF_UNIX + * socket, where no backlog is relevant as communication may take place without an actual connect() cycle, and + * we thus won't lose messages. + * + * Note that PID 1 will forward the agent message to system bus, so that the user systemd instance may listen + * to it. The system instance hence listens on this special socket, but the user instances listen on the system + * bus for these messages. */ + + if (m->test_run) + return 0; + + if (!MANAGER_IS_SYSTEM(m)) + return 0; + + if (cg_unified() > 0) /* We don't need this anymore on the unified hierarchy */ + return 0; + + if (m->cgroups_agent_fd < 0) { + _cleanup_close_ int fd = -1; + + /* First free all secondary fields */ + m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate cgroups agent socket: %m"); + + fd_inc_rcvbuf(fd, CGROUPS_AGENT_RCVBUF_SIZE); + + (void) unlink(sa.un.sun_path); + + /* Only allow root to connect to this socket */ + RUN_WITH_UMASK(0077) + r = bind(fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(sa.un.sun_path)); + if (r < 0) + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + + m->cgroups_agent_fd = fd; + fd = -1; + } + + if (!m->cgroups_agent_event_source) { + r = sd_event_add_io(m->event, &m->cgroups_agent_event_source, m->cgroups_agent_fd, EPOLLIN, manager_dispatch_cgroups_agent_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate cgroups agent event source: %m"); + + /* Process cgroups notifications early, but after having processed service notification messages or + * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of notification, + * and we collected the metadata the notification and SIGCHLD stuff offers first. Also see handling of + * cgroup inotify for the unified cgroup stuff. */ + r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-5); + if (r < 0) + return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m"); + + (void) sd_event_source_set_description(m->cgroups_agent_event_source, "manager-cgroups-agent"); + } + + return 0; +} + static int manager_setup_kdbus(Manager *m) { _cleanup_free_ char *p = NULL; @@ -944,12 +1019,14 @@ Manager* manager_free(Manager *m) { sd_event_source_unref(m->signal_event_source); sd_event_source_unref(m->notify_event_source); + sd_event_source_unref(m->cgroups_agent_event_source); sd_event_source_unref(m->time_change_event_source); sd_event_source_unref(m->jobs_in_progress_event_source); sd_event_source_unref(m->run_queue_event_source); safe_close(m->signal_fd); safe_close(m->notify_fd); + safe_close(m->cgroups_agent_fd); safe_close(m->time_change_fd); safe_close(m->kdbus_fd); @@ -1142,6 +1219,10 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds) { if (q < 0 && r == 0) r = q; + q = manager_setup_cgroups_agent(m); + if (q < 0 && r == 0) + r = q; + /* We might have deserialized the kdbus control fd, but if we * didn't, then let's create the bus now. */ manager_setup_kdbus(m); @@ -1479,6 +1560,35 @@ static unsigned manager_dispatch_dbus_queue(Manager *m) { return n; } +static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = userdata; + char buf[PATH_MAX+1]; + ssize_t n; + + n = recv(fd, buf, sizeof(buf), 0); + if (n < 0) + return log_error_errno(errno, "Failed to read cgroups agent message: %m"); + if (n == 0) { + log_error("Got zero-length cgroups agent message, ignoring."); + return 0; + } + if ((size_t) n >= sizeof(buf)) { + log_error("Got overly long cgroups agent message, ignoring."); + return 0; + } + + if (memchr(buf, 0, n)) { + log_error("Got cgroups agent message with embedded NUL byte, ignoring."); + return 0; + } + buf[n] = 0; + + manager_notify_cgroup_empty(m, buf); + bus_forward_agent_released(m, buf); + + return 0; +} + static void manager_invoke_notify_message(Manager *m, Unit *u, pid_t pid, const char *buf, size_t n, FDSet *fds) { _cleanup_strv_free_ char **tags = NULL; @@ -2265,6 +2375,16 @@ int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root) { fprintf(f, "notify-socket=%s\n", m->notify_socket); } + if (m->cgroups_agent_fd >= 0) { + int copy; + + copy = fdset_put_dup(fds, m->cgroups_agent_fd); + if (copy < 0) + return copy; + + fprintf(f, "cgroups-agent-fd=%i\n", copy); + } + if (m->kdbus_fd >= 0) { int copy; @@ -2432,6 +2552,17 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { free(m->notify_socket); m->notify_socket = n; + } else if (startswith(l, "cgroups-agent-fd=")) { + int fd; + + if (safe_atoi(l + 17, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_debug("Failed to parse cgroups agent fd: %s", l + 10); + else { + m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); + safe_close(m->cgroups_agent_fd); + m->cgroups_agent_fd = fdset_remove(fds, fd); + } + } else if (startswith(l, "kdbus-fd=")) { int fd; @@ -2552,6 +2683,10 @@ int manager_reload(Manager *m) { if (q < 0 && r >= 0) r = q; + q = manager_setup_cgroups_agent(m); + if (q < 0 && r >= 0) + r = q; + /* Third, fire things up! */ manager_coldplug(m); diff --git a/src/core/manager.h b/src/core/manager.h index 17f84e6963..4bccca75cb 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -132,6 +132,9 @@ struct Manager { int notify_fd; sd_event_source *notify_event_source; + int cgroups_agent_fd; + sd_event_source *cgroups_agent_event_source; + int signal_fd; sd_event_source *signal_event_source;