sd-event: add pidfd support

This adds support for watching for process exits via Linux new pidfd
concept. This makes watching processes and killing them race-free if
properly used, fixing a long-standing UNIX misdesign.

This patch adds implicit and explicit pidfd support to sd-event: if a
process shall be watched and is specified by PID we will now internally
create a pidfd for it and use that, if available. Alternatively a new
constructor for child process event sources is added that takes pidfds
as input.

Besides mere watching of child processes via pidfd two additional
features are added:

→ sd_event_source_send_child_signal() allows sending a signal to the
  process being watched in the safest way possible (wrapping
  the new pidfd_send_signal() syscall).

→ sd_event_source_set_child_process_own() allows marking a process
  watched for destruction as soon as the event source is freed. This is
  currently implemented in userspace, but hopefully will become a kernel
  feature eventually.

Altogether this means an sd_event_source object is now a safe and stable
concept for referencing processes in race-free way, with automatic
fallback to pre-pidfd kernels.

Note that this patch adds support for this only to sd-event, not to PID
1. That's because PID 1 needs to use waitid(P_ALL) for reaping any
process that might get reparented to it. This currently semantically
conflicts with pidfd use for watching processes since we P_ALL is
undirected and thus might reap process earlier than the pidfd notifies
process end, which is hard to handle. The kernel will likely gain a
concept for excluding specific pidfds from P_ALL watching, as soon as
that is around we can start making use of this in PID 1 too.
This commit is contained in:
Lennart Poettering 2019-10-30 17:22:49 +01:00
parent 298f466f15
commit f8f3f9263e
4 changed files with 425 additions and 22 deletions

View File

@ -682,3 +682,14 @@ global:
sd_bus_object_vtable_format;
sd_event_source_disable_unref;
} LIBSYSTEMD_241;
LIBSYSTEMD_245 {
global:
sd_event_add_child_pidfd;
sd_event_source_get_child_pidfd;
sd_event_source_get_child_pidfd_own;
sd_event_source_set_child_pidfd_own;
sd_event_source_get_child_process_own;
sd_event_source_set_child_process_own;
sd_event_source_send_child_signal;
} LIBSYSTEMD_243;

View File

@ -34,7 +34,7 @@ typedef enum EventSourceType {
* we know how to dispatch it */
typedef enum WakeupType {
WAKEUP_NONE,
WAKEUP_EVENT_SOURCE,
WAKEUP_EVENT_SOURCE, /* either I/O or pidfd wakeup */
WAKEUP_CLOCK_DATA,
WAKEUP_SIGNAL_DATA,
WAKEUP_INOTIFY_DATA,
@ -96,6 +96,12 @@ struct sd_event_source {
siginfo_t siginfo;
pid_t pid;
int options;
int pidfd;
bool registered:1; /* whether the pidfd is registered in the epoll */
bool pidfd_owned:1; /* close pidfd when event source is freed */
bool process_owned:1; /* kill+reap process when event source is freed */
bool exited:1; /* true if process exited (i.e. if there's value in SIGKILLing it if we want to get rid of it) */
bool waited:1; /* true if process was waited for (i.e. if there's value in waitid(P_PID)'ing it if we want to get rid of it) */
} child;
struct {
sd_event_handler_t callback;

View File

@ -9,6 +9,7 @@
#include "sd-id128.h"
#include "alloc-util.h"
#include "env-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "fs-util.h"
@ -28,6 +29,14 @@
#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
/* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
return s &&
s->type == SOURCE_CHILD &&
s->child.pidfd >= 0 &&
s->child.options == WEXITED;
}
static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
[SOURCE_IO] = "io",
[SOURCE_TIME_REALTIME] = "realtime",
@ -401,6 +410,51 @@ static int source_io_register(
return 0;
}
static void source_child_pidfd_unregister(sd_event_source *s) {
assert(s);
assert(s->type == SOURCE_CHILD);
if (event_pid_changed(s->event))
return;
if (!s->child.registered)
return;
if (EVENT_SOURCE_WATCH_PIDFD(s))
if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
strna(s->description), event_source_type_to_string(s->type));
s->child.registered = false;
}
static int source_child_pidfd_register(sd_event_source *s, int enabled) {
int r;
assert(s);
assert(s->type == SOURCE_CHILD);
assert(enabled != SD_EVENT_OFF);
if (EVENT_SOURCE_WATCH_PIDFD(s)) {
struct epoll_event ev;
ev = (struct epoll_event) {
.events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
.data.ptr = s,
};
if (s->child.registered)
r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->child.pidfd, &ev);
else
r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->child.pidfd, &ev);
if (r < 0)
return -errno;
}
s->child.registered = true;
return 0;
}
static clockid_t event_source_type_to_clock(EventSourceType t) {
switch (t) {
@ -611,9 +665,8 @@ static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig)
assert(e);
/* Rechecks if the specified signal is still something we are
* interested in. If not, we'll unmask it, and possibly drop
* the signalfd for it. */
/* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
* and possibly drop the signalfd for it. */
if (sig == SIGCHLD &&
e->n_enabled_child_sources > 0)
@ -704,9 +757,13 @@ static void source_disconnect(sd_event_source *s) {
}
(void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
event_gc_signal_data(s->event, &s->priority, SIGCHLD);
}
if (EVENT_SOURCE_WATCH_PIDFD(s))
source_child_pidfd_unregister(s);
else
event_gc_signal_data(s->event, &s->priority, SIGCHLD);
break;
case SOURCE_DEFER:
@ -787,6 +844,44 @@ static void source_free(sd_event_source *s) {
if (s->type == SOURCE_IO && s->io.owned)
s->io.fd = safe_close(s->io.fd);
if (s->type == SOURCE_CHILD) {
/* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
if (s->child.process_owned) {
if (!s->child.exited) {
bool sent = false;
if (s->child.pidfd >= 0) {
if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
if (errno == ESRCH) /* Already dead */
sent = true;
else if (!ERRNO_IS_NOT_SUPPORTED(errno))
log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
s->child.pid);
} else
sent = true;
}
if (!sent)
if (kill(s->child.pid, SIGKILL) < 0)
if (errno != ESRCH) /* Already dead */
log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
s->child.pid);
}
if (!s->child.waited) {
siginfo_t si = {};
/* Reap the child if we can */
(void) waitid(P_PID, s->child.pid, &si, WEXITED);
}
}
if (s->child.pidfd_owned)
s->child.pidfd = safe_close(s->child.pidfd);
}
if (s->destroy_callback)
s->destroy_callback(s->userdata);
@ -1121,6 +1216,11 @@ _public_ int sd_event_add_signal(
return 0;
}
static bool shall_use_pidfd(void) {
/* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
}
_public_ int sd_event_add_child(
sd_event *e,
sd_event_source **ret,
@ -1152,30 +1252,136 @@ _public_ int sd_event_add_child(
if (!s)
return -ENOMEM;
s->wakeup = WAKEUP_EVENT_SOURCE;
s->child.pid = pid;
s->child.options = options;
s->child.callback = callback;
s->userdata = userdata;
s->enabled = SD_EVENT_ONESHOT;
/* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
* pin the PID, and make regular waitid() handling race-free. */
if (shall_use_pidfd()) {
s->child.pidfd = pidfd_open(s->child.pid, 0);
if (s->child.pidfd < 0) {
/* Propagate errors unless the syscall is not supported or blocked */
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
return -errno;
} else
s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
} else
s->child.pidfd = -1;
r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
if (r < 0)
return r;
e->n_enabled_child_sources++;
if (EVENT_SOURCE_WATCH_PIDFD(s)) {
/* We have a pidfd and we only want to watch for exit */
r = source_child_pidfd_register(s, s->enabled);
if (r < 0) {
e->n_enabled_child_sources--;
return r;
}
} else {
/* We have no pidfd or we shall wait for some other event than WEXITED */
r = event_make_signal_data(e, SIGCHLD, NULL);
if (r < 0) {
e->n_enabled_child_sources--;
return r;
}
e->need_process_child = true;
}
if (ret)
*ret = s;
TAKE_PTR(s);
return 0;
}
_public_ int sd_event_add_child_pidfd(
sd_event *e,
sd_event_source **ret,
int pidfd,
int options,
sd_event_child_handler_t callback,
void *userdata) {
_cleanup_(source_freep) sd_event_source *s = NULL;
pid_t pid;
int r;
assert_return(e, -EINVAL);
assert_return(e = event_resolve(e), -ENOPKG);
assert_return(pidfd >= 0, -EBADF);
assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
assert_return(options != 0, -EINVAL);
assert_return(callback, -EINVAL);
assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
assert_return(!event_pid_changed(e), -ECHILD);
r = hashmap_ensure_allocated(&e->child_sources, NULL);
if (r < 0)
return r;
r = pidfd_get_pid(pidfd, &pid);
if (r < 0)
return r;
if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
return -EBUSY;
s = source_new(e, !ret, SOURCE_CHILD);
if (!s)
return -ENOMEM;
s->wakeup = WAKEUP_EVENT_SOURCE;
s->child.pidfd = pidfd;
s->child.pid = pid;
s->child.options = options;
s->child.callback = callback;
s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
s->userdata = userdata;
s->enabled = SD_EVENT_ONESHOT;
r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
if (r < 0)
return r;
e->n_enabled_child_sources++;
r = event_make_signal_data(e, SIGCHLD, NULL);
if (r < 0) {
e->n_enabled_child_sources--;
return r;
}
if (EVENT_SOURCE_WATCH_PIDFD(s)) {
/* We only want to watch for WEXITED */
e->need_process_child = true;
r = source_child_pidfd_register(s, s->enabled);
if (r < 0) {
e->n_enabled_child_sources--;
return r;
}
} else {
/* We shall wait for some other event than WEXITED */
r = event_make_signal_data(e, SIGCHLD, NULL);
if (r < 0) {
e->n_enabled_child_sources--;
return r;
}
e->need_process_child = true;
}
if (ret)
*ret = s;
TAKE_PTR(s);
TAKE_PTR(s);
return 0;
}
@ -2023,7 +2229,11 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
assert(s->event->n_enabled_child_sources > 0);
s->event->n_enabled_child_sources--;
event_gc_signal_data(s->event, &s->priority, SIGCHLD);
if (EVENT_SOURCE_WATCH_PIDFD(s))
source_child_pidfd_unregister(s);
else
event_gc_signal_data(s->event, &s->priority, SIGCHLD);
break;
case SOURCE_EXIT:
@ -2097,12 +2307,25 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
s->enabled = m;
r = event_make_signal_data(s->event, SIGCHLD, NULL);
if (r < 0) {
s->enabled = SD_EVENT_OFF;
s->event->n_enabled_child_sources--;
event_gc_signal_data(s->event, &s->priority, SIGCHLD);
return r;
if (EVENT_SOURCE_WATCH_PIDFD(s)) {
/* yes, we have pidfd */
r = source_child_pidfd_register(s, s->enabled);
if (r < 0) {
s->enabled = SD_EVENT_OFF;
s->event->n_enabled_child_sources--;
return r;
}
} else {
/* no pidfd, or something other to watch for than WEXITED */
r = event_make_signal_data(s->event, SIGCHLD, NULL);
if (r < 0) {
s->enabled = SD_EVENT_OFF;
s->event->n_enabled_child_sources--;
event_gc_signal_data(s->event, &s->priority, SIGCHLD);
return r;
}
}
break;
@ -2225,6 +2448,98 @@ _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
return 0;
}
_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
assert_return(s, -EINVAL);
assert_return(s->type == SOURCE_CHILD, -EDOM);
assert_return(!event_pid_changed(s->event), -ECHILD);
if (s->child.pidfd < 0)
return -EOPNOTSUPP;
return s->child.pidfd;
}
_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
assert_return(s, -EINVAL);
assert_return(s->type == SOURCE_CHILD, -EDOM);
assert_return(!event_pid_changed(s->event), -ECHILD);
assert_return(SIGNAL_VALID(sig), -EINVAL);
/* If we already have seen indication the process exited refuse sending a signal early. This way we
* can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
* available. */
if (s->child.exited)
return -ESRCH;
if (s->child.pidfd >= 0) {
siginfo_t copy;
/* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
* structure here */
if (si)
copy = *si;
if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
/* Let's propagate the error only if the system call is not implemented or prohibited */
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
return -errno;
} else
return 0;
}
/* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
* this here. */
if (flags != 0)
return -EOPNOTSUPP;
if (si) {
/* We use rt_sigqueueinfo() only if siginfo_t is specified. */
siginfo_t copy = *si;
if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
return -errno;
} else if (kill(s->child.pid, sig) < 0)
return -errno;
return 0;
}
_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
assert_return(s, -EINVAL);
assert_return(s->type == SOURCE_CHILD, -EDOM);
if (s->child.pidfd < 0)
return -EOPNOTSUPP;
return s->child.pidfd_owned;
}
_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
assert_return(s, -EINVAL);
assert_return(s->type == SOURCE_CHILD, -EDOM);
if (s->child.pidfd < 0)
return -EOPNOTSUPP;
s->child.pidfd_owned = own;
return 0;
}
_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
assert_return(s, -EINVAL);
assert_return(s->type == SOURCE_CHILD, -EDOM);
return s->child.process_owned;
}
_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
assert_return(s, -EINVAL);
assert_return(s->type == SOURCE_CHILD, -EDOM);
s->child.process_owned = own;
return 0;
}
_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
assert_return(s, -EINVAL);
assert_return(mask, -EINVAL);
@ -2535,6 +2850,12 @@ static int process_child(sd_event *e) {
if (s->enabled == SD_EVENT_OFF)
continue;
if (s->child.exited)
continue;
if (EVENT_SOURCE_WATCH_PIDFD(s)) /* There's a usable pidfd known for this event source? then don't waitid() for it here */
continue;
zero(s->child.siginfo);
r = waitid(P_PID, s->child.pid, &s->child.siginfo,
WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
@ -2544,6 +2865,9 @@ static int process_child(sd_event *e) {
if (s->child.siginfo.si_pid != 0) {
bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
if (zombie)
s->child.exited = true;
if (!zombie && (s->child.options & WEXITED)) {
/* If the child isn't dead then let's
* immediately remove the state change
@ -2563,6 +2887,33 @@ static int process_child(sd_event *e) {
return 0;
}
static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
assert(e);
assert(s);
assert(s->type == SOURCE_CHILD);
if (s->pending)
return 0;
if (s->enabled == SD_EVENT_OFF)
return 0;
if (!EVENT_SOURCE_WATCH_PIDFD(s))
return 0;
zero(s->child.siginfo);
if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
return -errno;
if (s->child.siginfo.si_pid == 0)
return 0;
if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
s->child.exited = true;
return source_set_pending(s, true);
}
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
bool read_one = false;
int r;
@ -2847,8 +3198,10 @@ static int source_dispatch(sd_event_source *s) {
r = s->child.callback(s, &s->child.siginfo, s->userdata);
/* Now, reap the PID for good. */
if (zombie)
if (zombie) {
(void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
s->child.waited = true;
}
break;
}
@ -3144,12 +3497,33 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
switch (*t) {
case WAKEUP_EVENT_SOURCE:
r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
case WAKEUP_EVENT_SOURCE: {
sd_event_source *s = ev_queue[i].data.ptr;
assert(s);
switch (s->type) {
case SOURCE_IO:
r = process_io(e, s, ev_queue[i].events);
break;
case SOURCE_CHILD:
r = process_pidfd(e, s, ev_queue[i].events);
break;
default:
assert_not_reached("Unexpected event source type");
}
break;
}
case WAKEUP_CLOCK_DATA: {
struct clock_data *d = ev_queue[i].data.ptr;
assert(d);
r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
break;
}

View File

@ -23,6 +23,7 @@
#include <sys/inotify.h>
#include <sys/signalfd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include "_sd-common.h"
@ -89,6 +90,7 @@ int sd_event_add_io(sd_event *e, sd_event_source **s, int fd, uint32_t events, s
int sd_event_add_time(sd_event *e, sd_event_source **s, clockid_t clock, uint64_t usec, uint64_t accuracy, sd_event_time_handler_t callback, void *userdata);
int sd_event_add_signal(sd_event *e, sd_event_source **s, int sig, sd_event_signal_handler_t callback, void *userdata);
int sd_event_add_child(sd_event *e, sd_event_source **s, pid_t pid, int options, sd_event_child_handler_t callback, void *userdata);
int sd_event_add_child_pidfd(sd_event *e, sd_event_source **s, int pidfd, int options, sd_event_child_handler_t callback, void *userdata);
int sd_event_add_inotify(sd_event *e, sd_event_source **s, const char *path, uint32_t mask, sd_event_inotify_handler_t callback, void *userdata);
int sd_event_add_defer(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
int sd_event_add_post(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
@ -141,6 +143,16 @@ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec);
int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock);
int sd_event_source_get_signal(sd_event_source *s);
int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid);
int sd_event_source_get_child_pidfd(sd_event_source *s);
int sd_event_source_get_child_pidfd_own(sd_event_source *s);
int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own);
int sd_event_source_get_child_process_own(sd_event_source *s);
int sd_event_source_set_child_process_own(sd_event_source *s, int own);
#if defined _GNU_SOURCE || (defined _POSIX_C_SOURCE && _POSIX_C_SOURCE >= 199309L)
int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags);
#else
int sd_event_source_send_child_signal(sd_event_source *s, int sig, const void *si, unsigned flags);
#endif
int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret);
int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback);
int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret);