From d1cf2023749296284362e4b48a877e552ab2fb08 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 12:00:49 +0100 Subject: [PATCH 01/14] sd-event: drop unnecessary local variable --- src/libsystemd/sd-event/sd-event.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index b76dbf3d23..4a70124cd7 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -356,8 +356,6 @@ static bool event_pid_changed(sd_event *e) { } static void source_io_unregister(sd_event_source *s) { - int r; - assert(s); assert(s->type == SOURCE_IO); @@ -367,8 +365,7 @@ static void source_io_unregister(sd_event_source *s) { if (!s->io.registered) return; - r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL); - if (r < 0) + if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0) log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m", strna(s->description), event_source_type_to_string(s->type)); From 5a795bff38402d1a7e82020888eb1da5d52ad4a7 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 12:03:13 +0100 Subject: [PATCH 02/14] sd-event: (void)ify some epoll_ctl() syscall invocations --- src/libsystemd/sd-event/sd-event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index 4a70124cd7..e348fb699d 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -1762,7 +1762,7 @@ _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) { return r; } - epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL); + (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL); } return 0; @@ -3473,7 +3473,7 @@ _public_ int sd_event_set_watchdog(sd_event *e, int b) { } else { if (e->watchdog_fd >= 0) { - epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL); + (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL); e->watchdog_fd = safe_close(e->watchdog_fd); } } From 5f152f43d04e5aad6a3f98f45f020a66e3aac717 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 25 Oct 2019 16:06:06 +0200 Subject: [PATCH 03/14] missing: define new pidfd syscalls --- meson.build | 8 ++++++++ src/basic/missing_syscall.h | 38 +++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/meson.build b/meson.build index 21d6968abd..edde42ea74 100644 --- a/meson.build +++ b/meson.build @@ -517,6 +517,14 @@ foreach ident : [ #include '''], ['get_mempolicy', '''#include #include '''], + ['pidfd_send_signal', '''#include + #include + #include + #include '''], + ['pidfd_open', '''#include + #include + #include + #include '''], ] have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE') diff --git a/src/basic/missing_syscall.h b/src/basic/missing_syscall.h index 1255d8b197..bea7be699d 100644 --- a/src/basic/missing_syscall.h +++ b/src/basic/missing_syscall.h @@ -5,8 +5,10 @@ #include #include +#include #include #include +#include #include #ifdef ARCH_MIPS @@ -524,3 +526,39 @@ static inline long missing_get_mempolicy(int *mode, unsigned long *nodemask, #define get_mempolicy missing_get_mempolicy #endif + +#if !HAVE_PIDFD_OPEN +/* may be (invalid) negative number due to libseccomp, see PR 13319 */ +# if ! (defined __NR_pidfd_open && __NR_pidfd_open > 0) +# if defined __NR_pidfd_open +# undef __NR_pidfd_open +# endif +# define __NR_pidfd_open 434 +#endif +static inline int pidfd_open(pid_t pid, unsigned flags) { +#ifdef __NR_pidfd_open + return syscall(__NR_pidfd_open, pid, flags); +#else + errno = ENOSYS; + return -1; +#endif +} +#endif + +#if !HAVE_PIDFD_SEND_SIGNAL +/* may be (invalid) negative number due to libseccomp, see PR 13319 */ +# if ! (defined __NR_pidfd_send_signal && __NR_pidfd_send_signal > 0) +# if defined __NR_pidfd_send_signal +# undef __NR_pidfd_send_signal +# endif +# define __NR_pidfd_send_signal 424 +#endif +static inline int pidfd_send_signal(int fd, int sig, siginfo_t *info, unsigned flags) { +#ifdef __NR_pidfd_open + return syscall(__NR_pidfd_send_signal, fd, sig, info, flags); +#else + errno = ENOSYS; + return -1; +#endif +} +#endif From 5ead4e85f6b58a099269b27def1cd03c4526ae19 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 16:29:42 +0100 Subject: [PATCH 04/14] missing: add rt_sigqueueinfo() syscall definition This is not a new system call at all (since kernel 2.2), however it's not exposed in glibc (a wrapper is exposed however in sigqueue(), but it substantially simplifies the system call). Since we want a nice fallback for sending signals on non-pidfd systems for pidfd_send_signal() let's wrap rt_sigqueueinfo() since it takes the same siginfo_t parameter. --- meson.build | 4 ++++ src/basic/missing_syscall.h | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/meson.build b/meson.build index edde42ea74..3085af49fd 100644 --- a/meson.build +++ b/meson.build @@ -525,6 +525,10 @@ foreach ident : [ #include #include #include '''], + ['rt_sigqueueinfo', '''#include + #include + #include + #include '''], ] have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE') diff --git a/src/basic/missing_syscall.h b/src/basic/missing_syscall.h index bea7be699d..cbda3f7c60 100644 --- a/src/basic/missing_syscall.h +++ b/src/basic/missing_syscall.h @@ -562,3 +562,9 @@ static inline int pidfd_send_signal(int fd, int sig, siginfo_t *info, unsigned f #endif } #endif + +#if !HAVE_RT_SIGQUEUEINFO +static inline int rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *info) { + return syscall(__NR_rt_sigqueueinfo, tgid, sig, info); +} +#endif From 298f466f159e1c1203d03c16c1ab9280b844bad5 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 16:35:48 +0100 Subject: [PATCH 05/14] process-util: add helper pidfd_get_pid() It returns the pid_t a pidfd refers to. --- src/basic/process-util.c | 33 +++++++++++++++++++++++++++++++++ src/basic/process-util.h | 2 ++ 2 files changed, 35 insertions(+) diff --git a/src/basic/process-util.c b/src/basic/process-util.c index 9b6c4c31f7..718a237954 100644 --- a/src/basic/process-util.c +++ b/src/basic/process-util.c @@ -40,6 +40,7 @@ #include "rlimit-util.h" #include "signal-util.h" #include "stat-util.h" +#include "stdio-util.h" #include "string-table.h" #include "string-util.h" #include "terminal-util.h" @@ -1488,6 +1489,38 @@ int set_oom_score_adjust(int value) { WRITE_STRING_FILE_VERIFY_ON_FAILURE|WRITE_STRING_FILE_DISABLE_BUFFER); } +int pidfd_get_pid(int fd, pid_t *ret) { + char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)]; + _cleanup_free_ char *fdinfo = NULL; + char *p; + int r; + + if (fd < 0) + return -EBADF; + + xsprintf(path, "/proc/self/fdinfo/%i", fd); + + r = read_full_file(path, &fdinfo, NULL); + if (r == -ENOENT) /* if fdinfo doesn't exist we assume the process does not exist */ + return -ESRCH; + if (r < 0) + return r; + + p = startswith(fdinfo, "Pid:"); + if (!p) { + p = strstr(fdinfo, "\nPid:"); + if (!p) + return -ENOTTY; /* not a pidfd? */ + + p += 5; + } + + p += strspn(p, WHITESPACE); + p[strcspn(p, WHITESPACE)] = 0; + + return parse_pid(p, ret); +} + static const char *const ioprio_class_table[] = { [IOPRIO_CLASS_NONE] = "none", [IOPRIO_CLASS_RT] = "realtime", diff --git a/src/basic/process-util.h b/src/basic/process-util.h index 5f4e174f04..33dc871229 100644 --- a/src/basic/process-util.h +++ b/src/basic/process-util.h @@ -197,3 +197,5 @@ assert_cc(TASKS_MAX <= (unsigned long) PID_T_MAX); (pid) = 0; \ _pid_; \ }) + +int pidfd_get_pid(int fd, pid_t *ret); From f8f3f9263e51db180bd78a5f3b152aefd25427ee Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 17:22:49 +0100 Subject: [PATCH 06/14] sd-event: add pidfd support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for watching for process exits via Linux new pidfd concept. This makes watching processes and killing them race-free if properly used, fixing a long-standing UNIX misdesign. This patch adds implicit and explicit pidfd support to sd-event: if a process shall be watched and is specified by PID we will now internally create a pidfd for it and use that, if available. Alternatively a new constructor for child process event sources is added that takes pidfds as input. Besides mere watching of child processes via pidfd two additional features are added: → sd_event_source_send_child_signal() allows sending a signal to the process being watched in the safest way possible (wrapping the new pidfd_send_signal() syscall). → sd_event_source_set_child_process_own() allows marking a process watched for destruction as soon as the event source is freed. This is currently implemented in userspace, but hopefully will become a kernel feature eventually. Altogether this means an sd_event_source object is now a safe and stable concept for referencing processes in race-free way, with automatic fallback to pre-pidfd kernels. Note that this patch adds support for this only to sd-event, not to PID 1. That's because PID 1 needs to use waitid(P_ALL) for reaping any process that might get reparented to it. This currently semantically conflicts with pidfd use for watching processes since we P_ALL is undirected and thus might reap process earlier than the pidfd notifies process end, which is hard to handle. The kernel will likely gain a concept for excluding specific pidfds from P_ALL watching, as soon as that is around we can start making use of this in PID 1 too. --- src/libsystemd/libsystemd.sym | 11 + src/libsystemd/sd-event/event-source.h | 8 +- src/libsystemd/sd-event/sd-event.c | 416 +++++++++++++++++++++++-- src/systemd/sd-event.h | 12 + 4 files changed, 425 insertions(+), 22 deletions(-) diff --git a/src/libsystemd/libsystemd.sym b/src/libsystemd/libsystemd.sym index 5ec42e0f1f..bed81bf173 100644 --- a/src/libsystemd/libsystemd.sym +++ b/src/libsystemd/libsystemd.sym @@ -682,3 +682,14 @@ global: sd_bus_object_vtable_format; sd_event_source_disable_unref; } LIBSYSTEMD_241; + +LIBSYSTEMD_245 { +global: + sd_event_add_child_pidfd; + sd_event_source_get_child_pidfd; + sd_event_source_get_child_pidfd_own; + sd_event_source_set_child_pidfd_own; + sd_event_source_get_child_process_own; + sd_event_source_set_child_process_own; + sd_event_source_send_child_signal; +} LIBSYSTEMD_243; diff --git a/src/libsystemd/sd-event/event-source.h b/src/libsystemd/sd-event/event-source.h index 99ab8fc169..08eb9b6a61 100644 --- a/src/libsystemd/sd-event/event-source.h +++ b/src/libsystemd/sd-event/event-source.h @@ -34,7 +34,7 @@ typedef enum EventSourceType { * we know how to dispatch it */ typedef enum WakeupType { WAKEUP_NONE, - WAKEUP_EVENT_SOURCE, + WAKEUP_EVENT_SOURCE, /* either I/O or pidfd wakeup */ WAKEUP_CLOCK_DATA, WAKEUP_SIGNAL_DATA, WAKEUP_INOTIFY_DATA, @@ -96,6 +96,12 @@ struct sd_event_source { siginfo_t siginfo; pid_t pid; int options; + int pidfd; + bool registered:1; /* whether the pidfd is registered in the epoll */ + bool pidfd_owned:1; /* close pidfd when event source is freed */ + bool process_owned:1; /* kill+reap process when event source is freed */ + bool exited:1; /* true if process exited (i.e. if there's value in SIGKILLing it if we want to get rid of it) */ + bool waited:1; /* true if process was waited for (i.e. if there's value in waitid(P_PID)'ing it if we want to get rid of it) */ } child; struct { sd_event_handler_t callback; diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index e348fb699d..4693049f4c 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -9,6 +9,7 @@ #include "sd-id128.h" #include "alloc-util.h" +#include "env-util.h" #include "event-source.h" #include "fd-util.h" #include "fs-util.h" @@ -28,6 +29,14 @@ #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC) +static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) { + /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */ + return s && + s->type == SOURCE_CHILD && + s->child.pidfd >= 0 && + s->child.options == WEXITED; +} + static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = { [SOURCE_IO] = "io", [SOURCE_TIME_REALTIME] = "realtime", @@ -401,6 +410,51 @@ static int source_io_register( return 0; } +static void source_child_pidfd_unregister(sd_event_source *s) { + assert(s); + assert(s->type == SOURCE_CHILD); + + if (event_pid_changed(s->event)) + return; + + if (!s->child.registered) + return; + + if (EVENT_SOURCE_WATCH_PIDFD(s)) + if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0) + log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m", + strna(s->description), event_source_type_to_string(s->type)); + + s->child.registered = false; +} + +static int source_child_pidfd_register(sd_event_source *s, int enabled) { + int r; + + assert(s); + assert(s->type == SOURCE_CHILD); + assert(enabled != SD_EVENT_OFF); + + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + struct epoll_event ev; + + ev = (struct epoll_event) { + .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0), + .data.ptr = s, + }; + + if (s->child.registered) + r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->child.pidfd, &ev); + else + r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->child.pidfd, &ev); + if (r < 0) + return -errno; + } + + s->child.registered = true; + return 0; +} + static clockid_t event_source_type_to_clock(EventSourceType t) { switch (t) { @@ -611,9 +665,8 @@ static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) assert(e); - /* Rechecks if the specified signal is still something we are - * interested in. If not, we'll unmask it, and possibly drop - * the signalfd for it. */ + /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it, + * and possibly drop the signalfd for it. */ if (sig == SIGCHLD && e->n_enabled_child_sources > 0) @@ -704,9 +757,13 @@ static void source_disconnect(sd_event_source *s) { } (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid)); - event_gc_signal_data(s->event, &s->priority, SIGCHLD); } + if (EVENT_SOURCE_WATCH_PIDFD(s)) + source_child_pidfd_unregister(s); + else + event_gc_signal_data(s->event, &s->priority, SIGCHLD); + break; case SOURCE_DEFER: @@ -787,6 +844,44 @@ static void source_free(sd_event_source *s) { if (s->type == SOURCE_IO && s->io.owned) s->io.fd = safe_close(s->io.fd); + if (s->type == SOURCE_CHILD) { + /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */ + + if (s->child.process_owned) { + + if (!s->child.exited) { + bool sent = false; + + if (s->child.pidfd >= 0) { + if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) { + if (errno == ESRCH) /* Already dead */ + sent = true; + else if (!ERRNO_IS_NOT_SUPPORTED(errno)) + log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m", + s->child.pid); + } else + sent = true; + } + + if (!sent) + if (kill(s->child.pid, SIGKILL) < 0) + if (errno != ESRCH) /* Already dead */ + log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m", + s->child.pid); + } + + if (!s->child.waited) { + siginfo_t si = {}; + + /* Reap the child if we can */ + (void) waitid(P_PID, s->child.pid, &si, WEXITED); + } + } + + if (s->child.pidfd_owned) + s->child.pidfd = safe_close(s->child.pidfd); + } + if (s->destroy_callback) s->destroy_callback(s->userdata); @@ -1121,6 +1216,11 @@ _public_ int sd_event_add_signal( return 0; } +static bool shall_use_pidfd(void) { + /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */ + return getenv_bool_secure("SYSTEMD_PIDFD") != 0; +} + _public_ int sd_event_add_child( sd_event *e, sd_event_source **ret, @@ -1152,30 +1252,136 @@ _public_ int sd_event_add_child( if (!s) return -ENOMEM; + s->wakeup = WAKEUP_EVENT_SOURCE; s->child.pid = pid; s->child.options = options; s->child.callback = callback; s->userdata = userdata; s->enabled = SD_EVENT_ONESHOT; + /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we + * pin the PID, and make regular waitid() handling race-free. */ + + if (shall_use_pidfd()) { + s->child.pidfd = pidfd_open(s->child.pid, 0); + if (s->child.pidfd < 0) { + /* Propagate errors unless the syscall is not supported or blocked */ + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return -errno; + } else + s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */ + } else + s->child.pidfd = -1; + + r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s); + if (r < 0) + return r; + + e->n_enabled_child_sources++; + + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + /* We have a pidfd and we only want to watch for exit */ + + r = source_child_pidfd_register(s, s->enabled); + if (r < 0) { + e->n_enabled_child_sources--; + return r; + } + } else { + /* We have no pidfd or we shall wait for some other event than WEXITED */ + + r = event_make_signal_data(e, SIGCHLD, NULL); + if (r < 0) { + e->n_enabled_child_sources--; + return r; + } + + e->need_process_child = true; + } + + if (ret) + *ret = s; + + TAKE_PTR(s); + return 0; +} + +_public_ int sd_event_add_child_pidfd( + sd_event *e, + sd_event_source **ret, + int pidfd, + int options, + sd_event_child_handler_t callback, + void *userdata) { + + + _cleanup_(source_freep) sd_event_source *s = NULL; + pid_t pid; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(pidfd >= 0, -EBADF); + assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL); + assert_return(options != 0, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(e), -ECHILD); + + r = hashmap_ensure_allocated(&e->child_sources, NULL); + if (r < 0) + return r; + + r = pidfd_get_pid(pidfd, &pid); + if (r < 0) + return r; + + if (hashmap_contains(e->child_sources, PID_TO_PTR(pid))) + return -EBUSY; + + s = source_new(e, !ret, SOURCE_CHILD); + if (!s) + return -ENOMEM; + + s->wakeup = WAKEUP_EVENT_SOURCE; + s->child.pidfd = pidfd; + s->child.pid = pid; + s->child.options = options; + s->child.callback = callback; + s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */ + s->userdata = userdata; + s->enabled = SD_EVENT_ONESHOT; + r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s); if (r < 0) return r; e->n_enabled_child_sources++; - r = event_make_signal_data(e, SIGCHLD, NULL); - if (r < 0) { - e->n_enabled_child_sources--; - return r; - } + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + /* We only want to watch for WEXITED */ - e->need_process_child = true; + r = source_child_pidfd_register(s, s->enabled); + if (r < 0) { + e->n_enabled_child_sources--; + return r; + } + } else { + /* We shall wait for some other event than WEXITED */ + + r = event_make_signal_data(e, SIGCHLD, NULL); + if (r < 0) { + e->n_enabled_child_sources--; + return r; + } + + e->need_process_child = true; + } if (ret) *ret = s; - TAKE_PTR(s); + TAKE_PTR(s); return 0; } @@ -2023,7 +2229,11 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) { assert(s->event->n_enabled_child_sources > 0); s->event->n_enabled_child_sources--; - event_gc_signal_data(s->event, &s->priority, SIGCHLD); + if (EVENT_SOURCE_WATCH_PIDFD(s)) + source_child_pidfd_unregister(s); + else + event_gc_signal_data(s->event, &s->priority, SIGCHLD); + break; case SOURCE_EXIT: @@ -2097,12 +2307,25 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) { s->enabled = m; - r = event_make_signal_data(s->event, SIGCHLD, NULL); - if (r < 0) { - s->enabled = SD_EVENT_OFF; - s->event->n_enabled_child_sources--; - event_gc_signal_data(s->event, &s->priority, SIGCHLD); - return r; + if (EVENT_SOURCE_WATCH_PIDFD(s)) { + /* yes, we have pidfd */ + + r = source_child_pidfd_register(s, s->enabled); + if (r < 0) { + s->enabled = SD_EVENT_OFF; + s->event->n_enabled_child_sources--; + return r; + } + } else { + /* no pidfd, or something other to watch for than WEXITED */ + + r = event_make_signal_data(s->event, SIGCHLD, NULL); + if (r < 0) { + s->enabled = SD_EVENT_OFF; + s->event->n_enabled_child_sources--; + event_gc_signal_data(s->event, &s->priority, SIGCHLD); + return r; + } } break; @@ -2225,6 +2448,98 @@ _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) { return 0; } +_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_pid_changed(s->event), -ECHILD); + + if (s->child.pidfd < 0) + return -EOPNOTSUPP; + + return s->child.pidfd; +} + +_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_pid_changed(s->event), -ECHILD); + assert_return(SIGNAL_VALID(sig), -EINVAL); + + /* If we already have seen indication the process exited refuse sending a signal early. This way we + * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not + * available. */ + if (s->child.exited) + return -ESRCH; + + if (s->child.pidfd >= 0) { + siginfo_t copy; + + /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the + * structure here */ + if (si) + copy = *si; + + if (pidfd_send_signal(s->child.pidfd, sig, si ? © : NULL, 0) < 0) { + /* Let's propagate the error only if the system call is not implemented or prohibited */ + if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) + return -errno; + } else + return 0; + } + + /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse + * this here. */ + if (flags != 0) + return -EOPNOTSUPP; + + if (si) { + /* We use rt_sigqueueinfo() only if siginfo_t is specified. */ + siginfo_t copy = *si; + + if (rt_sigqueueinfo(s->child.pid, sig, ©) < 0) + return -errno; + } else if (kill(s->child.pid, sig) < 0) + return -errno; + + return 0; +} + +_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + + if (s->child.pidfd < 0) + return -EOPNOTSUPP; + + return s->child.pidfd_owned; +} + +_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + + if (s->child.pidfd < 0) + return -EOPNOTSUPP; + + s->child.pidfd_owned = own; + return 0; +} + +_public_ int sd_event_source_get_child_process_own(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + + return s->child.process_owned; +} + +_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + + s->child.process_owned = own; + return 0; +} + _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) { assert_return(s, -EINVAL); assert_return(mask, -EINVAL); @@ -2535,6 +2850,12 @@ static int process_child(sd_event *e) { if (s->enabled == SD_EVENT_OFF) continue; + if (s->child.exited) + continue; + + if (EVENT_SOURCE_WATCH_PIDFD(s)) /* There's a usable pidfd known for this event source? then don't waitid() for it here */ + continue; + zero(s->child.siginfo); r = waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options); @@ -2544,6 +2865,9 @@ static int process_child(sd_event *e) { if (s->child.siginfo.si_pid != 0) { bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED); + if (zombie) + s->child.exited = true; + if (!zombie && (s->child.options & WEXITED)) { /* If the child isn't dead then let's * immediately remove the state change @@ -2563,6 +2887,33 @@ static int process_child(sd_event *e) { return 0; } +static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) { + assert(e); + assert(s); + assert(s->type == SOURCE_CHILD); + + if (s->pending) + return 0; + + if (s->enabled == SD_EVENT_OFF) + return 0; + + if (!EVENT_SOURCE_WATCH_PIDFD(s)) + return 0; + + zero(s->child.siginfo); + if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0) + return -errno; + + if (s->child.siginfo.si_pid == 0) + return 0; + + if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) + s->child.exited = true; + + return source_set_pending(s, true); +} + static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) { bool read_one = false; int r; @@ -2847,8 +3198,10 @@ static int source_dispatch(sd_event_source *s) { r = s->child.callback(s, &s->child.siginfo, s->userdata); /* Now, reap the PID for good. */ - if (zombie) + if (zombie) { (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED); + s->child.waited = true; + } break; } @@ -3144,12 +3497,33 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) { switch (*t) { - case WAKEUP_EVENT_SOURCE: - r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events); + case WAKEUP_EVENT_SOURCE: { + sd_event_source *s = ev_queue[i].data.ptr; + + assert(s); + + switch (s->type) { + + case SOURCE_IO: + r = process_io(e, s, ev_queue[i].events); + break; + + case SOURCE_CHILD: + r = process_pidfd(e, s, ev_queue[i].events); + break; + + default: + assert_not_reached("Unexpected event source type"); + } + break; + } case WAKEUP_CLOCK_DATA: { struct clock_data *d = ev_queue[i].data.ptr; + + assert(d); + r = flush_timer(e, d->fd, ev_queue[i].events, &d->next); break; } diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h index b14c92697b..2ec726a897 100644 --- a/src/systemd/sd-event.h +++ b/src/systemd/sd-event.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "_sd-common.h" @@ -89,6 +90,7 @@ int sd_event_add_io(sd_event *e, sd_event_source **s, int fd, uint32_t events, s int sd_event_add_time(sd_event *e, sd_event_source **s, clockid_t clock, uint64_t usec, uint64_t accuracy, sd_event_time_handler_t callback, void *userdata); int sd_event_add_signal(sd_event *e, sd_event_source **s, int sig, sd_event_signal_handler_t callback, void *userdata); int sd_event_add_child(sd_event *e, sd_event_source **s, pid_t pid, int options, sd_event_child_handler_t callback, void *userdata); +int sd_event_add_child_pidfd(sd_event *e, sd_event_source **s, int pidfd, int options, sd_event_child_handler_t callback, void *userdata); int sd_event_add_inotify(sd_event *e, sd_event_source **s, const char *path, uint32_t mask, sd_event_inotify_handler_t callback, void *userdata); int sd_event_add_defer(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata); int sd_event_add_post(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata); @@ -141,6 +143,16 @@ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec); int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock); int sd_event_source_get_signal(sd_event_source *s); int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid); +int sd_event_source_get_child_pidfd(sd_event_source *s); +int sd_event_source_get_child_pidfd_own(sd_event_source *s); +int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own); +int sd_event_source_get_child_process_own(sd_event_source *s); +int sd_event_source_set_child_process_own(sd_event_source *s, int own); +#if defined _GNU_SOURCE || (defined _POSIX_C_SOURCE && _POSIX_C_SOURCE >= 199309L) +int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags); +#else +int sd_event_source_send_child_signal(sd_event_source *s, int sig, const void *si, unsigned flags); +#endif int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret); int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback); int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret); From 90b15e18eef423b930d6d4694e67c37c82037d25 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 17:37:00 +0100 Subject: [PATCH 07/14] signal-util: add new helper signal_is_blocked() --- src/basic/signal-util.c | 15 +++++++++++++++ src/basic/signal-util.h | 2 ++ 2 files changed, 17 insertions(+) diff --git a/src/basic/signal-util.c b/src/basic/signal-util.c index bfb83419c9..cb59f6ca0f 100644 --- a/src/basic/signal-util.c +++ b/src/basic/signal-util.c @@ -287,3 +287,18 @@ int signal_from_string(const char *s) { void nop_signal_handler(int sig) { /* nothing here */ } + +int signal_is_blocked(int sig) { + sigset_t ss; + int r; + + r = pthread_sigmask(SIG_SETMASK, NULL, &ss); + if (r != 0) + return -r; + + r = sigismember(&ss, sig); + if (r < 0) + return -errno; + + return r; +} diff --git a/src/basic/signal-util.h b/src/basic/signal-util.h index 92f2804cd2..3909ee341d 100644 --- a/src/basic/signal-util.h +++ b/src/basic/signal-util.h @@ -41,3 +41,5 @@ static inline const char* signal_to_string_with_check(int n) { return signal_to_string(n); } + +int signal_is_blocked(int sig); From d1b75241baa3e64acbc9f826e0bc44a198715fa7 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 17:37:12 +0100 Subject: [PATCH 08/14] sd-event: make use of new signal_is_blocked() helper --- src/libsystemd/sd-event/sd-event.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index 4693049f4c..bb3c542b7c 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -1165,7 +1165,6 @@ _public_ int sd_event_add_signal( _cleanup_(source_freep) sd_event_source *s = NULL; struct signal_data *d; - sigset_t ss; int r; assert_return(e, -EINVAL); @@ -1177,11 +1176,10 @@ _public_ int sd_event_add_signal( if (!callback) callback = signal_exit_callback; - r = pthread_sigmask(SIG_SETMASK, NULL, &ss); - if (r != 0) - return -r; - - if (!sigismember(&ss, sig)) + r = signal_is_blocked(sig); + if (r < 0) + return r; + if (r == 0) return -EBUSY; if (!e->signal_sources) { From ee880b37c1522346a6a641bb5d532e94511df86f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 17:41:15 +0100 Subject: [PATCH 09/14] sd-event: refuse sd_event_add_child() if SIGCHLD is not blocked We already refuse sd_event_add_signal() if the specified signal is not blocked, let's do this also for sd_event_add_child(), since we might need signalfd() to implement this, and this means the signal needs to be blocked. --- src/libsystemd/sd-event/sd-event.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index bb3c542b7c..8d4a20e420 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -1239,6 +1239,20 @@ _public_ int sd_event_add_child( assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); assert_return(!event_pid_changed(e), -ECHILD); + if (e->n_enabled_child_sources == 0) { + /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available, + * for compatibility with pre-pidfd and because we don't want the reap the child processes + * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to + * take effect. + * + * (As an optimization we only do this check on the first child event source created.) */ + r = signal_is_blocked(SIGCHLD); + if (r < 0) + return r; + if (r == 0) + return -EBUSY; + } + r = hashmap_ensure_allocated(&e->child_sources, NULL); if (r < 0) return r; @@ -1326,6 +1340,14 @@ _public_ int sd_event_add_child_pidfd( assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); assert_return(!event_pid_changed(e), -ECHILD); + if (e->n_enabled_child_sources == 0) { + r = signal_is_blocked(SIGCHLD); + if (r < 0) + return r; + if (r == 0) + return -EBUSY; + } + r = hashmap_ensure_allocated(&e->child_sources, NULL); if (r < 0) return r; From 3ecb3bdc938494e6e96a87d118e68d0d2eb68663 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 17:42:31 +0100 Subject: [PATCH 10/14] test: add test for pidfd support in sd-event --- src/libsystemd/sd-event/test-event.c | 124 +++++++++++++++++++++++++-- 1 file changed, 118 insertions(+), 6 deletions(-) diff --git a/src/libsystemd/sd-event/test-event.c b/src/libsystemd/sd-event/test-event.c index 954b93ada0..54d293ca46 100644 --- a/src/libsystemd/sd-event/test-event.c +++ b/src/libsystemd/sd-event/test-event.c @@ -9,6 +9,7 @@ #include "fs-util.h" #include "log.h" #include "macro.h" +#include "missing_syscall.h" #include "parse-util.h" #include "path-util.h" #include "process-util.h" @@ -62,6 +63,11 @@ static int child_handler(sd_event_source *s, const siginfo_t *si, void *userdata assert_se(s); assert_se(si); + assert_se(si->si_uid == getuid()); + assert_se(si->si_signo == SIGCHLD); + assert_se(si->si_code == CLD_EXITED); + assert_se(si->si_status == 78); + log_info("got child on %c", PTR_TO_INT(userdata)); assert_se(userdata == INT_TO_PTR('f')); @@ -75,6 +81,7 @@ static int child_handler(sd_event_source *s, const siginfo_t *si, void *userdata static int signal_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { sd_event_source *p = NULL; pid_t pid; + siginfo_t plain_si; assert_se(s); assert_se(si); @@ -83,16 +90,41 @@ static int signal_handler(sd_event_source *s, const struct signalfd_siginfo *si, assert_se(userdata == INT_TO_PTR('e')); - assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGUSR2, -1) >= 0); pid = fork(); assert_se(pid >= 0); - if (pid == 0) - _exit(EXIT_SUCCESS); + if (pid == 0) { + sigset_t ss; + + assert_se(sigemptyset(&ss) >= 0); + assert_se(sigaddset(&ss, SIGUSR2) >= 0); + + zero(plain_si); + assert_se(sigwaitinfo(&ss, &plain_si) >= 0); + + assert_se(plain_si.si_signo == SIGUSR2); + assert_se(plain_si.si_value.sival_int == 4711); + + _exit(78); + } assert_se(sd_event_add_child(sd_event_source_get_event(s), &p, pid, WEXITED, child_handler, INT_TO_PTR('f')) >= 0); assert_se(sd_event_source_set_enabled(p, SD_EVENT_ONESHOT) >= 0); + assert_se(sd_event_source_set_child_process_own(p, true) >= 0); + + /* We can't use structured initialization here, since the structure contains various unions and these + * fields lie in overlapping (carefully aligned) unions that LLVM is allergic to allow assignments + * to */ + zero(plain_si); + plain_si.si_signo = SIGUSR2; + plain_si.si_code = SI_QUEUE; + plain_si.si_pid = getpid(); + plain_si.si_uid = getuid(); + plain_si.si_value.sival_int = 4711; + + assert_se(sd_event_source_send_child_signal(p, SIGUSR2, &plain_si, 0) >= 0); sd_event_source_unref(s); @@ -119,7 +151,7 @@ static int defer_handler(sd_event_source *s, void *userdata) { return 1; } -static bool do_quit = false; +static bool do_quit; static int time_handler(sd_event_source *s, uint64_t usec, void *userdata) { log_info("got timer on %c", PTR_TO_INT(userdata)); @@ -161,7 +193,7 @@ static int post_handler(sd_event_source *s, void *userdata) { return 2; } -static void test_basic(void) { +static void test_basic(bool with_pidfd) { sd_event *e = NULL; sd_event_source *w = NULL, *x = NULL, *y = NULL, *z = NULL, *q = NULL, *t = NULL; static const char ch = 'x'; @@ -169,6 +201,8 @@ static void test_basic(void) { uint64_t event_now; int64_t priority; + assert_se(setenv("SYSTEMD_PIDFD", yes_no(with_pidfd), 1) >= 0); + assert_se(pipe(a) >= 0); assert_se(pipe(b) >= 0); assert_se(pipe(d) >= 0); @@ -201,6 +235,8 @@ static void test_basic(void) { assert_se(sd_event_add_io(e, &x, a[0], EPOLLIN, io_handler, INT_TO_PTR('a')) >= 0); assert_se(sd_event_add_io(e, &y, b[0], EPOLLIN, io_handler, INT_TO_PTR('b')) >= 0); + + do_quit = false; assert_se(sd_event_add_time(e, &z, CLOCK_MONOTONIC, 0, 0, time_handler, INT_TO_PTR('c')) >= 0); assert_se(sd_event_add_exit(e, &q, exit_handler, INT_TO_PTR('g')) >= 0); @@ -258,6 +294,8 @@ static void test_basic(void) { safe_close_pair(b); safe_close_pair(d); safe_close_pair(k); + + assert_se(unsetenv("SYSTEMD_PIDFD") >= 0); } static void test_sd_event_now(void) { @@ -482,15 +520,89 @@ static void test_inotify(unsigned n_create_events) { sd_event_unref(e); } +static int pidfd_handler(sd_event_source *s, const siginfo_t *si, void *userdata) { + assert_se(s); + assert_se(si); + + assert_se(si->si_uid == getuid()); + assert_se(si->si_signo == SIGCHLD); + assert_se(si->si_code == CLD_EXITED); + assert_se(si->si_status == 66); + + log_info("got pidfd on %c", PTR_TO_INT(userdata)); + + assert_se(userdata == INT_TO_PTR('p')); + + assert_se(sd_event_exit(sd_event_source_get_event(s), 0) >= 0); + sd_event_source_unref(s); + + return 0; +} + +static void test_pidfd(void) { + sd_event_source *s = NULL, *t = NULL; + sd_event *e = NULL; + int pidfd; + pid_t pid, pid2; + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); + + pid = fork(); + if (pid == 0) { + /* child */ + _exit(66); + } + + assert_se(pid > 1); + + pidfd = pidfd_open(pid, 0); + if (pidfd < 0) { + /* No pidfd_open() supported or blocked? */ + assert_se(ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)); + (void) wait_for_terminate(pid, NULL); + return; + } + + pid2 = fork(); + if (pid2 == 0) + freeze(); + + assert_se(pid > 2); + + assert_se(sd_event_default(&e) >= 0); + assert_se(sd_event_add_child_pidfd(e, &s, pidfd, WEXITED, pidfd_handler, INT_TO_PTR('p')) >= 0); + assert_se(sd_event_source_set_child_pidfd_own(s, true) >= 0); + + /* This one should never trigger, since our second child lives forever */ + assert_se(sd_event_add_child(e, &t, pid2, WEXITED, pidfd_handler, INT_TO_PTR('q')) >= 0); + assert_se(sd_event_source_set_child_process_own(t, true) >= 0); + + assert_se(sd_event_loop(e) >= 0); + + /* Child should still be alive */ + assert_se(kill(pid2, 0) >= 0); + + t = sd_event_source_unref(t); + + /* Child should now be dead, since we dropped the ref */ + assert_se(kill(pid2, 0) < 0 && errno == ESRCH); + + sd_event_unref(e); +} + int main(int argc, char *argv[]) { test_setup_logging(LOG_INFO); - test_basic(); + test_basic(true); /* test with pidfd */ + test_basic(false); /* test without pidfd */ + test_sd_event_now(); test_rtqueue(); test_inotify(100); /* should work without overflow */ test_inotify(33000); /* should trigger a q overflow */ + test_pidfd(); + return 0; } From 68765d94fec8a053f6036d67fd9ce84497410755 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 18:56:03 +0100 Subject: [PATCH 11/14] man: don't claim we'd unblock the specified signal in sd_event_add_signal() We don't, the signal remains blocked. We use signalfd() to be able to read the signal events without unblocking the signal. While we are at it, mention that pthread_sigmask() is fine too. --- man/sd_event_add_signal.xml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/man/sd_event_add_signal.xml b/man/sd_event_add_signal.xml index a7148ca8dd..43794bd7ce 100644 --- a/man/sd_event_add_signal.xml +++ b/man/sd_event_add_signal.xml @@ -75,14 +75,13 @@ project='man-pages'>signalfd2 for further information. - Only a single handler may be installed for a specific - signal. The signal will be unblocked by this call, and must be - blocked before this function is called in all threads (using + Only a single handler may be installed for a specific signal. The signal must be blocked in all + threads before this function is called (using sigprocmask2 or sigprocmask2). If - the handler is not specified (handler is - NULL), a default handler which causes the - program to exit cleanly will be used. + project='man-pages'>pthread_sigmask3). If + the handler is not specified (handler is NULL), a default + handler which causes the program to exit cleanly will be used. By default, the event source is enabled permanently (SD_EVENT_ON), but this may be changed with @@ -189,7 +188,9 @@ sd_event_source_set_userdata3, sd_event_source_set_floating3, signal7, - signalfd2 + signalfd2, + sigprocmask2, + pthread_sigmask3 From b3508072002cad74113934b3f0b8c21599cebfdd Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 18:59:44 +0100 Subject: [PATCH 12/14] man: mention that SIGCHLD has to be blocked before using sd_event_add_child() --- man/sd_event_add_child.xml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/man/sd_event_add_child.xml b/man/sd_event_add_child.xml index 509803d5c1..c146e3121b 100644 --- a/man/sd_event_add_child.xml +++ b/man/sd_event_add_child.xml @@ -100,6 +100,12 @@ SD_EVENT_OFF with sd_event_source_set_enabled3. + The SIGCHLD signal must be blocked in all threads before this function is + called (using sigprocmask2 or + pthread_sigmask3). + If the second parameter of sd_event_add_child() is passed as NULL no reference to the event source object is returned. In this case the @@ -165,8 +171,8 @@ -EBUSY - A handler is already installed for this - child process. + A handler is already installed for this child process, or + SIGCHLD is not blocked. @@ -214,7 +220,9 @@ sd_event_source_set_userdata3, sd_event_source_set_description3, sd_event_source_set_floating3, - waitid2 + waitid2, + sigprocmask2, + pthread_sigmask3 From 8089643328b20326e9b1c2684dc85a17d962b7f1 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 19:00:12 +0100 Subject: [PATCH 13/14] man: document the new sd-event pidfd magic --- man/rules/meson.build | 10 ++- man/sd_event_add_child.xml | 150 +++++++++++++++++++++++++++++++------ 2 files changed, 136 insertions(+), 24 deletions(-) diff --git a/man/rules/meson.build b/man/rules/meson.build index 20c3d09da0..0f6897a080 100644 --- a/man/rules/meson.build +++ b/man/rules/meson.build @@ -371,7 +371,15 @@ manpages = [ ['sd_bus_wait', '3', [], ''], ['sd_event_add_child', '3', - ['sd_event_child_handler_t', 'sd_event_source_get_child_pid'], + ['sd_event_add_child_pidfd', + 'sd_event_child_handler_t', + 'sd_event_source_get_child_pid', + 'sd_event_source_get_child_pidfd', + 'sd_event_source_get_child_pidfd_own', + 'sd_event_source_get_child_process_own', + 'sd_event_source_send_child_signal', + 'sd_event_source_set_child_pidfd_own', + 'sd_event_source_set_child_process_own'], ''], ['sd_event_add_defer', '3', diff --git a/man/sd_event_add_child.xml b/man/sd_event_add_child.xml index c146e3121b..1db536ff2a 100644 --- a/man/sd_event_add_child.xml +++ b/man/sd_event_add_child.xml @@ -17,7 +17,14 @@ sd_event_add_child + sd_event_add_child_pidfd sd_event_source_get_child_pid + sd_event_source_get_child_pidfd + sd_event_source_get_child_pidfd_own + sd_event_source_set_child_pidfd_own + sd_event_source_get_child_process_own + sd_event_source_set_child_process_own + sd_event_source_send_child_signal sd_event_child_handler_t Add a child process state change event source to an event loop @@ -46,40 +53,77 @@ void *userdata + + int sd_event_add_child_pidfd + sd_event *event + sd_event_source **source + int pidfd + int options + sd_event_child_handler_t handler + void *userdata + + int sd_event_source_get_child_pid sd_event_source *source pid_t *pid + + int sd_event_source_get_child_pidfd + sd_event_source *source + + + + int sd_event_source_get_child_pidfd_own + sd_event_source *source + + + + int sd_event_source_set_child_pidfd_own + sd_event_source *source + int own + + + + int sd_event_source_get_child_process_own + sd_event_source *source + + + + int sd_event_source_set_child_process_own + sd_event_source *source + int own + + + + int sd_event_source_send_child_signal + sd_event_source *source + int sig + const siginfo_t *info + unsigned flags + + Description - sd_event_add_child() adds a new child - process state change event source to an event loop. The event loop - object is specified in the event parameter, - the event source object is returned in the - source parameter. The - pid parameter specifies the PID of the - process to watch. The handler must - reference a function to call when the process changes state. The - handler function will be passed the - userdata pointer, which may be chosen - freely by the caller. The handler also receives a pointer to a - siginfo_t structure containing - information about the child process event. The - options parameter determines which state - changes will be watched for. It must contain an OR-ed mask of - WEXITED (watch for the child process - terminating), WSTOPPED (watch for the child - process being stopped by a signal), and - WCONTINUED (watch for the child process being - resumed by a signal). See waitid2 - for further information. + sd_event_add_child() adds a new child process state change event source to an + event loop. The event loop object is specified in the event parameter, the event + source object is returned in the source parameter. The pid + parameter specifies the PID of the process to watch, which must be a direct child process of the invoking + process. The handler must reference a function to call when the process changes + state. The handler function will be passed the userdata pointer, which may be + chosen freely by the caller. The handler also receives a pointer to a siginfo_t + structure containing information about the child process event. The options + parameter determines which state changes will be watched for. It must contain an OR-ed mask of + WEXITED (watch for the child process terminating), WSTOPPED + (watch for the child process being stopped by a signal), and WCONTINUED (watch for + the child process being resumed by a signal). See waitid2 for + further information. Only a single handler may be installed for a specific child process. The handler is enabled for a single event @@ -127,6 +171,17 @@ processed first, it should leave the child processes for which child process state change event sources are installed unreaped. + sd_event_add_child_pidfd() is similar to + sd_event_add_child() but takes a file descriptor referencing the process ("pidfd") + instead of the numeric PID. A suitable file descriptor may be acquired via pidfd_open2 and + related calls. The passed file descriptor is not closed when the event source is freed again, unless + sd_event_source_set_child_pidfd_own() is used to turn this behaviour on. Note that + regardless which of sd_event_add_child() and + sd_event_add_child_pidfd() is used for allocating an event source, the watched + process has to be a direct child process of the invoking process. Also in both cases + SIGCHLD has to be blocked in the invoking process. + sd_event_source_get_child_pid() retrieves the configured PID of a child process state change event source created previously with @@ -135,6 +190,45 @@ pointer to a pid_t variable to return the process ID in. + + sd_event_source_get_child_pidfd() retrieves the file descriptor referencing + the watched process ("pidfd") if this functionality is available. On kernels that support the concept the + event loop will make use of pidfds to watch child processes, regardless if the individual event sources + are allocated via sd_event_add_child() or + sd_event_add_child_pidfd(). If the latter call was used to allocate the event + source, this function returns the file descriptor used for allocation. On kernels that do not support the + pidfd concept this function will fail with EOPNOTSUPP. This call takes the event + source object as the source parameter and returns the numeric file descriptor. + + + sd_event_source_get_child_pidfd_own() may be used to query whether the pidfd + the event source encapsulates shall be closed when the event source is freed. This function returns zero + if the pidfd shall be left open, and positive if it shall be closed automatically. By default this + setting defaults to on if the event source was allocated via sd_event_add_child() + and off if it was allocated via sd_event_add_child_pidfd(). The + sd_event_source_set_child_pidfd_own() function may be used to change the setting and + takes a boolean parameter with the new setting. + + sd_event_source_get_child_process_own() may be used to query whether the + process the event source watches shall be killed (with SIGKILL) and reaped when the + event source is freed. This function returns zero if the process shell be left running, and positive if + it shall be killed and reaped automatically. By default this setting defaults to off. The + sd_event_source_set_child_process_own() function may be used to change the setting + and takes a boolean parameter with the new setting. Note that currently if the calling process is + terminated abnormally the watched process might survive even thought the event source ceases to + exist. This behaviour might change eventually. + + sd_event_source_send_child_signal() may be used to send a UNIX signal to the + watched process. If the pidfd concept is supported in the kernel, this is implemented via pidfd_send_signal2 + and otherwise via rt_sigqueueinfo2 + (or via kill2 in case + info is NULL). The specified parameters match those of these + underlying system calls, except that the info is never modified (and is thus + declared constant). Like for the underlying system calls, the flags parameter + currently must be zero. @@ -196,6 +290,12 @@ The passed event source is not a child process event source. + + -EOPNOTSUPP + + A pidfd was requested but the kernel does not support this concept. + + @@ -222,7 +322,11 @@ sd_event_source_set_floating3, waitid2, sigprocmask2, - pthread_sigmask3 + pthread_sigmask3, + pidfd_open2, + pidfd_send_signal2, + rt_sigqueueinfo2, + kill2 From e544601536ac13a288d7476f4400c7b0f22b7ea1 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Oct 2019 20:26:50 +0100 Subject: [PATCH 14/14] sd-event: refuse running default event loops in any other thread than the one they are default for --- TODO | 1 - src/libsystemd/sd-event/sd-event.c | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/TODO b/TODO index 88fd9d7f10..07f65ec80e 100644 --- a/TODO +++ b/TODO @@ -701,7 +701,6 @@ Features: - allow multiple signal handlers per signal? - document chaining of signal handler for SIGCHLD and child handlers - define more intervals where we will shift wakeup intervals around in, 1h, 6h, 24h, ... - - generate a failure of a default event loop is executed out-of-thread * investigate endianness issues of UUID vs. GUID diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index 8d4a20e420..4940345791 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -3422,6 +3422,11 @@ _public_ int sd_event_prepare(sd_event *e) { assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); assert_return(e->state == SD_EVENT_INITIAL, -EBUSY); + /* Let's check that if we are a default event loop we are executed in the correct thread. We only do + * this check here once, since gettid() is typically not cached, and thus want to minimize + * syscalls */ + assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO); + if (e->exit_requested) goto pending;