From f8f3f9263e51db180bd78a5f3b152aefd25427ee Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 17:22:49 +0100 Subject: [PATCH] sd-event: add pidfd support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for watching for process exits via Linux new pidfd concept. This makes watching processes and killing them race-free if properly used, fixing a long-standing UNIX misdesign. This patch adds implicit and explicit pidfd support to sd-event: if a process shall be watched and is specified by PID we will now internally create a pidfd for it and use that, if available. Alternatively a new constructor for child process event sources is added that takes pidfds as input. Besides mere watching of child processes via pidfd two additional features are added: → sd_event_source_send_child_signal() allows sending a signal to the process being watched in the safest way possible (wrapping the new pidfd_send_signal() syscall). → sd_event_source_set_child_process_own() allows marking a process watched for destruction as soon as the event source is freed. This is currently implemented in userspace, but hopefully will become a kernel feature eventually. Altogether this means an sd_event_source object is now a safe and stable concept for referencing processes in race-free way, with automatic fallback to pre-pidfd kernels. Note that this patch adds support for this only to sd-event, not to PID 1. That's because PID 1 needs to use waitid(P_ALL) for reaping any process that might get reparented to it. This currently semantically conflicts with pidfd use for watching processes since we P_ALL is undirected and thus might reap process earlier than the pidfd notifies process end, which is hard to handle. The kernel will likely gain a concept for excluding specific pidfds from P_ALL watching, as soon as that is around we can start making use of this in PID 1 too. --- src/libsystemd/libsystemd.sym | 11 + src/libsystemd/sd-event/event-source.h | 8 +- src/libsystemd/sd-event/sd-event.c | 416 +++++++++++++++++++++++-- src/systemd/sd-event.h | 12 + 4 files changed, 425 insertions(+), 22 deletions(-) diff --git a/src/libsystemd/libsystemd.sym b/src/libsystemd/libsystemd.sym index 5ec42e0f1f..bed81bf173 100644 --- a/src/libsystemd/libsystemd.sym +++ b/src/libsystemd/libsystemd.sym @@ -682,3 +682,14 @@ global: sd_bus_object_vtable_format; sd_event_source_disable_unref; } LIBSYSTEMD_241; + +LIBSYSTEMD_245 { +global: + sd_event_add_child_pidfd; + sd_event_source_get_child_pidfd; + sd_event_source_get_child_pidfd_own; + sd_event_source_set_child_pidfd_own; + sd_event_source_get_child_process_own; + sd_event_source_set_child_process_own; + sd_event_source_send_child_signal; +} LIBSYSTEMD_243; diff --git a/src/libsystemd/sd-event/event-source.h b/src/libsystemd/sd-event/event-source.h index 99ab8fc169..08eb9b6a61 100644 --- a/src/libsystemd/sd-event/event-source.h +++ b/src/libsystemd/sd-event/event-source.h @@ -34,7 +34,7 @@ typedef enum EventSourceType { * we know how to dispatch it */ typedef enum WakeupType { WAKEUP_NONE, - WAKEUP_EVENT_SOURCE, + WAKEUP_EVENT_SOURCE, /* either I/O or pidfd wakeup */ WAKEUP_CLOCK_DATA, WAKEUP_SIGNAL_DATA, WAKEUP_INOTIFY_DATA, @@ -96,6 +96,12 @@ struct sd_event_source { siginfo_t siginfo; pid_t pid; int options; + int pidfd; + bool registered:1; /* whether the pidfd is registered in the epoll */ + bool pidfd_owned:1; /* close pidfd when event source is freed */ + bool process_owned:1; /* kill+reap process when event source is freed */ + bool exited:1; /* true if process exited (i.e. if there's value in SIGKILLing it if we want to get rid of it) */ + bool waited:1; /* true if process was waited for (i.e. if there's value in waitid(P_PID)'ing it if we want to get rid of it) */ } child; struct { sd_event_handler_t callback; diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index e348fb699d..4693049f4c 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -9,6 +9,7 @@ #include "sd-id128.h" #include "alloc-util.h" +#include "env-util.h" #include "event-source.h" #include "fd-util.h" #include "fs-util.h" @@ -28,6 +29,14 @@ #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC) +static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) { + /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */ + return s && + s->type == SOURCE_CHILD && + s->child.pidfd >= 0 && + s->child.options == WEXITED; +} + static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = { [SOURCE_IO] = "io", [SOURCE_TIME_REALTIME] = "realtime", @@ -401,6 +410,51 @@ static int source_io_register( return 0; } +static void source_child_pidfd_unregister(sd_event_source *s) { + assert(s); + assert(s->type == SOURCE_CHILD); + + if (event_pid_changed(s->event)) + return; + + if (!s->child.registered) + return; + + if (EVENT_SOURCE_WATCH_PIDFD(s)) + if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0) + log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m", + strna(s->description), event_source_type_to_string(s->type)); + + s->child.registered = false; +} + +static int source_child_pidfd_register(sd_event_source *s, int enabled) { + int r; + + assert(s); + assert(s->type == SOURCE_CHILD); + assert(enabled != SD_EVENT_OFF); + + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + struct epoll_event ev; + + ev = (struct epoll_event) { + .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0), + .data.ptr = s, + }; + + if (s->child.registered) + r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->child.pidfd, &ev); + else + r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->child.pidfd, &ev); + if (r < 0) + return -errno; + } + + s->child.registered = true; + return 0; +} + static clockid_t event_source_type_to_clock(EventSourceType t) { switch (t) { @@ -611,9 +665,8 @@ static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) assert(e); - /* Rechecks if the specified signal is still something we are - * interested in. If not, we'll unmask it, and possibly drop - * the signalfd for it. */ + /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it, + * and possibly drop the signalfd for it. */ if (sig == SIGCHLD && e->n_enabled_child_sources > 0) @@ -704,9 +757,13 @@ static void source_disconnect(sd_event_source *s) { } (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid)); - event_gc_signal_data(s->event, &s->priority, SIGCHLD); } + if (EVENT_SOURCE_WATCH_PIDFD(s)) + source_child_pidfd_unregister(s); + else + event_gc_signal_data(s->event, &s->priority, SIGCHLD); + break; case SOURCE_DEFER: @@ -787,6 +844,44 @@ static void source_free(sd_event_source *s) { if (s->type == SOURCE_IO && s->io.owned) s->io.fd = safe_close(s->io.fd); + if (s->type == SOURCE_CHILD) { + /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */ + + if (s->child.process_owned) { + + if (!s->child.exited) { + bool sent = false; + + if (s->child.pidfd >= 0) { + if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) { + if (errno == ESRCH) /* Already dead */ + sent = true; + else if (!ERRNO_IS_NOT_SUPPORTED(errno)) + log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m", + s->child.pid); + } else + sent = true; + } + + if (!sent) + if (kill(s->child.pid, SIGKILL) < 0) + if (errno != ESRCH) /* Already dead */ + log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m", + s->child.pid); + } + + if (!s->child.waited) { + siginfo_t si = {}; + + /* Reap the child if we can */ + (void) waitid(P_PID, s->child.pid, &si, WEXITED); + } + } + + if (s->child.pidfd_owned) + s->child.pidfd = safe_close(s->child.pidfd); + } + if (s->destroy_callback) s->destroy_callback(s->userdata); @@ -1121,6 +1216,11 @@ _public_ int sd_event_add_signal( return 0; } +static bool shall_use_pidfd(void) { + /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */ + return getenv_bool_secure("SYSTEMD_PIDFD") != 0; +} + _public_ int sd_event_add_child( sd_event *e, sd_event_source **ret, @@ -1152,30 +1252,136 @@ _public_ int sd_event_add_child( if (!s) return -ENOMEM; + s->wakeup = WAKEUP_EVENT_SOURCE; s->child.pid = pid; s->child.options = options; s->child.callback = callback; s->userdata = userdata; s->enabled = SD_EVENT_ONESHOT; + /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we + * pin the PID, and make regular waitid() handling race-free. */ + + if (shall_use_pidfd()) { + s->child.pidfd = pidfd_open(s->child.pid, 0); + if (s->child.pidfd < 0) { + /* Propagate errors unless the syscall is not supported or blocked */ + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return -errno; + } else + s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */ + } else + s->child.pidfd = -1; + + r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s); + if (r < 0) + return r; + + e->n_enabled_child_sources++; + + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + /* We have a pidfd and we only want to watch for exit */ + + r = source_child_pidfd_register(s, s->enabled); + if (r < 0) { + e->n_enabled_child_sources--; + return r; + } + } else { + /* We have no pidfd or we shall wait for some other event than WEXITED */ + + r = event_make_signal_data(e, SIGCHLD, NULL); + if (r < 0) { + e->n_enabled_child_sources--; + return r; + } + + e->need_process_child = true; + } + + if (ret) + *ret = s; + + TAKE_PTR(s); + return 0; +} + +_public_ int sd_event_add_child_pidfd( + sd_event *e, + sd_event_source **ret, + int pidfd, + int options, + sd_event_child_handler_t callback, + void *userdata) { + + + _cleanup_(source_freep) sd_event_source *s = NULL; + pid_t pid; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(pidfd >= 0, -EBADF); + assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL); + assert_return(options != 0, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(e), -ECHILD); + + r = hashmap_ensure_allocated(&e->child_sources, NULL); + if (r < 0) + return r; + + r = pidfd_get_pid(pidfd, &pid); + if (r < 0) + return r; + + if (hashmap_contains(e->child_sources, PID_TO_PTR(pid))) + return -EBUSY; + + s = source_new(e, !ret, SOURCE_CHILD); + if (!s) + return -ENOMEM; + + s->wakeup = WAKEUP_EVENT_SOURCE; + s->child.pidfd = pidfd; + s->child.pid = pid; + s->child.options = options; + s->child.callback = callback; + s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */ + s->userdata = userdata; + s->enabled = SD_EVENT_ONESHOT; + r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s); if (r < 0) return r; e->n_enabled_child_sources++; - r = event_make_signal_data(e, SIGCHLD, NULL); - if (r < 0) { - e->n_enabled_child_sources--; - return r; - } + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + /* We only want to watch for WEXITED */ - e->need_process_child = true; + r = source_child_pidfd_register(s, s->enabled); + if (r < 0) { + e->n_enabled_child_sources--; + return r; + } + } else { + /* We shall wait for some other event than WEXITED */ + + r = event_make_signal_data(e, SIGCHLD, NULL); + if (r < 0) { + e->n_enabled_child_sources--; + return r; + } + + e->need_process_child = true; + } if (ret) *ret = s; - TAKE_PTR(s); + TAKE_PTR(s); return 0; } @@ -2023,7 +2229,11 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) { assert(s->event->n_enabled_child_sources > 0); s->event->n_enabled_child_sources--; - event_gc_signal_data(s->event, &s->priority, SIGCHLD); + if (EVENT_SOURCE_WATCH_PIDFD(s)) + source_child_pidfd_unregister(s); + else + event_gc_signal_data(s->event, &s->priority, SIGCHLD); + break; case SOURCE_EXIT: @@ -2097,12 +2307,25 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) { s->enabled = m; - r = event_make_signal_data(s->event, SIGCHLD, NULL); - if (r < 0) { - s->enabled = SD_EVENT_OFF; - s->event->n_enabled_child_sources--; - event_gc_signal_data(s->event, &s->priority, SIGCHLD); - return r; + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + /* yes, we have pidfd */ + + r = source_child_pidfd_register(s, s->enabled); + if (r < 0) { + s->enabled = SD_EVENT_OFF; + s->event->n_enabled_child_sources--; + return r; + } + } else { + /* no pidfd, or something other to watch for than WEXITED */ + + r = event_make_signal_data(s->event, SIGCHLD, NULL); + if (r < 0) { + s->enabled = SD_EVENT_OFF; + s->event->n_enabled_child_sources--; + event_gc_signal_data(s->event, &s->priority, SIGCHLD); + return r; + } } break; @@ -2225,6 +2448,98 @@ _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) { return 0; } +_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_pid_changed(s->event), -ECHILD); + + if (s->child.pidfd < 0) + return -EOPNOTSUPP; + + return s->child.pidfd; +} + +_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_pid_changed(s->event), -ECHILD); + assert_return(SIGNAL_VALID(sig), -EINVAL); + + /* If we already have seen indication the process exited refuse sending a signal early. This way we + * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not + * available. */ + if (s->child.exited) + return -ESRCH; + + if (s->child.pidfd >= 0) { + siginfo_t copy; + + /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the + * structure here */ + if (si) + copy = *si; + + if (pidfd_send_signal(s->child.pidfd, sig, si ? © : NULL, 0) < 0) { + /* Let's propagate the error only if the system call is not implemented or prohibited */ + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return -errno; + } else + return 0; + } + + /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse + * this here. */ + if (flags != 0) + return -EOPNOTSUPP; + + if (si) { + /* We use rt_sigqueueinfo() only if siginfo_t is specified. */ + siginfo_t copy = *si; + + if (rt_sigqueueinfo(s->child.pid, sig, ©) < 0) + return -errno; + } else if (kill(s->child.pid, sig) < 0) + return -errno; + + return 0; +} + +_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + + if (s->child.pidfd < 0) + return -EOPNOTSUPP; + + return s->child.pidfd_owned; +} + +_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + + if (s->child.pidfd < 0) + return -EOPNOTSUPP; + + s->child.pidfd_owned = own; + return 0; +} + +_public_ int sd_event_source_get_child_process_own(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + + return s->child.process_owned; +} + +_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + + s->child.process_owned = own; + return 0; +} + _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) { assert_return(s, -EINVAL); assert_return(mask, -EINVAL); @@ -2535,6 +2850,12 @@ static int process_child(sd_event *e) { if (s->enabled == SD_EVENT_OFF) continue; + if (s->child.exited) + continue; + + if (EVENT_SOURCE_WATCH_PIDFD(s)) /* There's a usable pidfd known for this event source? then don't waitid() for it here */ + continue; + zero(s->child.siginfo); r = waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options); @@ -2544,6 +2865,9 @@ static int process_child(sd_event *e) { if (s->child.siginfo.si_pid != 0) { bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED); + if (zombie) + s->child.exited = true; + if (!zombie && (s->child.options & WEXITED)) { /* If the child isn't dead then let's * immediately remove the state change @@ -2563,6 +2887,33 @@ static int process_child(sd_event *e) { return 0; } +static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) { + assert(e); + assert(s); + assert(s->type == SOURCE_CHILD); + + if (s->pending) + return 0; + + if (s->enabled == SD_EVENT_OFF) + return 0; + + if (!EVENT_SOURCE_WATCH_PIDFD(s)) + return 0; + + zero(s->child.siginfo); + if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0) + return -errno; + + if (s->child.siginfo.si_pid == 0) + return 0; + + if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) + s->child.exited = true; + + return source_set_pending(s, true); +} + static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) { bool read_one = false; int r; @@ -2847,8 +3198,10 @@ static int source_dispatch(sd_event_source *s) { r = s->child.callback(s, &s->child.siginfo, s->userdata); /* Now, reap the PID for good. */ - if (zombie) + if (zombie) { (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED); + s->child.waited = true; + } break; } @@ -3144,12 +3497,33 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) { switch (*t) { - case WAKEUP_EVENT_SOURCE: - r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events); + case WAKEUP_EVENT_SOURCE: { + sd_event_source *s = ev_queue[i].data.ptr; + + assert(s); + + switch (s->type) { + + case SOURCE_IO: + r = process_io(e, s, ev_queue[i].events); + break; + + case SOURCE_CHILD: + r = process_pidfd(e, s, ev_queue[i].events); + break; + + default: + assert_not_reached("Unexpected event source type"); + } + break; + } case WAKEUP_CLOCK_DATA: { struct clock_data *d = ev_queue[i].data.ptr; + + assert(d); + r = flush_timer(e, d->fd, ev_queue[i].events, &d->next); break; } diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h index b14c92697b..2ec726a897 100644 --- a/src/systemd/sd-event.h +++ b/src/systemd/sd-event.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "_sd-common.h" @@ -89,6 +90,7 @@ int sd_event_add_io(sd_event *e, sd_event_source **s, int fd, uint32_t events, s int sd_event_add_time(sd_event *e, sd_event_source **s, clockid_t clock, uint64_t usec, uint64_t accuracy, sd_event_time_handler_t callback, void *userdata); int sd_event_add_signal(sd_event *e, sd_event_source **s, int sig, sd_event_signal_handler_t callback, void *userdata); int sd_event_add_child(sd_event *e, sd_event_source **s, pid_t pid, int options, sd_event_child_handler_t callback, void *userdata); +int sd_event_add_child_pidfd(sd_event *e, sd_event_source **s, int pidfd, int options, sd_event_child_handler_t callback, void *userdata); int sd_event_add_inotify(sd_event *e, sd_event_source **s, const char *path, uint32_t mask, sd_event_inotify_handler_t callback, void *userdata); int sd_event_add_defer(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata); int sd_event_add_post(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata); @@ -141,6 +143,16 @@ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec); int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock); int sd_event_source_get_signal(sd_event_source *s); int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid); +int sd_event_source_get_child_pidfd(sd_event_source *s); +int sd_event_source_get_child_pidfd_own(sd_event_source *s); +int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own); +int sd_event_source_get_child_process_own(sd_event_source *s); +int sd_event_source_set_child_process_own(sd_event_source *s, int own); +#if defined _GNU_SOURCE || (defined _POSIX_C_SOURCE && _POSIX_C_SOURCE >= 199309L) +int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags); +#else +int sd_event_source_send_child_signal(sd_event_source *s, int sig, const void *si, unsigned flags); +#endif int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret); int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback); int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret);