Systemd/src/libsystemd/sd-event/event-source.h

214 lines
7.3 KiB
C
Raw Normal View History

#pragma once
/* SPDX-License-Identifier: LGPL-2.1+ */
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sys/wait.h>
#include "sd-event.h"
#include "fs-util.h"
#include "hashmap.h"
#include "list.h"
#include "prioq.h"
typedef enum EventSourceType {
SOURCE_IO,
SOURCE_TIME_REALTIME,
SOURCE_TIME_BOOTTIME,
SOURCE_TIME_MONOTONIC,
SOURCE_TIME_REALTIME_ALARM,
SOURCE_TIME_BOOTTIME_ALARM,
SOURCE_SIGNAL,
SOURCE_CHILD,
SOURCE_DEFER,
SOURCE_POST,
SOURCE_EXIT,
SOURCE_WATCHDOG,
SOURCE_INOTIFY,
_SOURCE_EVENT_SOURCE_TYPE_MAX,
_SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
/* All objects we use in epoll events start with this value, so that
* we know how to dispatch it */
typedef enum WakeupType {
WAKEUP_NONE,
sd-event: add pidfd support This adds support for watching for process exits via Linux new pidfd concept. This makes watching processes and killing them race-free if properly used, fixing a long-standing UNIX misdesign. This patch adds implicit and explicit pidfd support to sd-event: if a process shall be watched and is specified by PID we will now internally create a pidfd for it and use that, if available. Alternatively a new constructor for child process event sources is added that takes pidfds as input. Besides mere watching of child processes via pidfd two additional features are added: → sd_event_source_send_child_signal() allows sending a signal to the process being watched in the safest way possible (wrapping the new pidfd_send_signal() syscall). → sd_event_source_set_child_process_own() allows marking a process watched for destruction as soon as the event source is freed. This is currently implemented in userspace, but hopefully will become a kernel feature eventually. Altogether this means an sd_event_source object is now a safe and stable concept for referencing processes in race-free way, with automatic fallback to pre-pidfd kernels. Note that this patch adds support for this only to sd-event, not to PID 1. That's because PID 1 needs to use waitid(P_ALL) for reaping any process that might get reparented to it. This currently semantically conflicts with pidfd use for watching processes since we P_ALL is undirected and thus might reap process earlier than the pidfd notifies process end, which is hard to handle. The kernel will likely gain a concept for excluding specific pidfds from P_ALL watching, as soon as that is around we can start making use of this in PID 1 too.
2019-10-30 17:22:49 +01:00
WAKEUP_EVENT_SOURCE, /* either I/O or pidfd wakeup */
WAKEUP_CLOCK_DATA,
WAKEUP_SIGNAL_DATA,
WAKEUP_INOTIFY_DATA,
_WAKEUP_TYPE_MAX,
_WAKEUP_TYPE_INVALID = -1,
} WakeupType;
struct inode_data;
struct sd_event_source {
WakeupType wakeup;
unsigned n_ref;
sd_event *event;
void *userdata;
sd_event_handler_t prepare;
char *description;
EventSourceType type:5;
signed int enabled:3;
bool pending:1;
bool dispatching:1;
bool floating:1;
bool exit_on_failure:1;
int64_t priority;
unsigned pending_index;
unsigned prepare_index;
uint64_t pending_iteration;
uint64_t prepare_iteration;
sd_event_destroy_t destroy_callback;
LIST_FIELDS(sd_event_source, sources);
union {
struct {
sd_event_io_handler_t callback;
int fd;
uint32_t events;
uint32_t revents;
bool registered:1;
bool owned:1;
} io;
struct {
sd_event_time_handler_t callback;
usec_t next, accuracy;
unsigned earliest_index;
unsigned latest_index;
} time;
struct {
sd_event_signal_handler_t callback;
struct signalfd_siginfo siginfo;
int sig;
} signal;
struct {
sd_event_child_handler_t callback;
siginfo_t siginfo;
pid_t pid;
int options;
sd-event: add pidfd support This adds support for watching for process exits via Linux new pidfd concept. This makes watching processes and killing them race-free if properly used, fixing a long-standing UNIX misdesign. This patch adds implicit and explicit pidfd support to sd-event: if a process shall be watched and is specified by PID we will now internally create a pidfd for it and use that, if available. Alternatively a new constructor for child process event sources is added that takes pidfds as input. Besides mere watching of child processes via pidfd two additional features are added: → sd_event_source_send_child_signal() allows sending a signal to the process being watched in the safest way possible (wrapping the new pidfd_send_signal() syscall). → sd_event_source_set_child_process_own() allows marking a process watched for destruction as soon as the event source is freed. This is currently implemented in userspace, but hopefully will become a kernel feature eventually. Altogether this means an sd_event_source object is now a safe and stable concept for referencing processes in race-free way, with automatic fallback to pre-pidfd kernels. Note that this patch adds support for this only to sd-event, not to PID 1. That's because PID 1 needs to use waitid(P_ALL) for reaping any process that might get reparented to it. This currently semantically conflicts with pidfd use for watching processes since we P_ALL is undirected and thus might reap process earlier than the pidfd notifies process end, which is hard to handle. The kernel will likely gain a concept for excluding specific pidfds from P_ALL watching, as soon as that is around we can start making use of this in PID 1 too.
2019-10-30 17:22:49 +01:00
int pidfd;
bool registered:1; /* whether the pidfd is registered in the epoll */
bool pidfd_owned:1; /* close pidfd when event source is freed */
bool process_owned:1; /* kill+reap process when event source is freed */
bool exited:1; /* true if process exited (i.e. if there's value in SIGKILLing it if we want to get rid of it) */
bool waited:1; /* true if process was waited for (i.e. if there's value in waitid(P_PID)'ing it if we want to get rid of it) */
} child;
struct {
sd_event_handler_t callback;
} defer;
struct {
sd_event_handler_t callback;
} post;
struct {
sd_event_handler_t callback;
unsigned prioq_index;
} exit;
struct {
sd_event_inotify_handler_t callback;
uint32_t mask;
struct inode_data *inode_data;
LIST_FIELDS(sd_event_source, by_inode_data);
} inotify;
};
};
struct clock_data {
WakeupType wakeup;
int fd;
/* For all clocks we maintain two priority queues each, one
* ordered for the earliest times the events may be
* dispatched, and one ordered by the latest times they must
* have been dispatched. The range between the top entries in
* the two prioqs is the time window we can freely schedule
* wakeups in */
Prioq *earliest;
Prioq *latest;
usec_t next;
bool needs_rearm:1;
};
struct signal_data {
WakeupType wakeup;
/* For each priority we maintain one signal fd, so that we
* only have to dequeue a single event per priority at a
* time. */
int fd;
int64_t priority;
sigset_t sigset;
sd_event_source *current;
};
/* A structure listing all event sources currently watching a specific inode */
struct inode_data {
/* The identifier for the inode, the combination of the .st_dev + .st_ino fields of the file */
ino_t ino;
dev_t dev;
/* An fd of the inode to watch. The fd is kept open until the next iteration of the loop, so that we can
* rearrange the priority still until then, as we need the original inode to change the priority as we need to
* add a watch descriptor to the right inotify for the priority which we can only do if we have a handle to the
* original inode. We keep a list of all inode_data objects with an open fd in the to_close list (see below) of
* the sd-event object, so that it is efficient to close everything, before entering the next event loop
* iteration. */
int fd;
/* The inotify "watch descriptor" */
int wd;
/* The combination of the mask of all inotify watches on this inode we manage. This is also the mask that has
* most recently been set on the watch descriptor. */
uint32_t combined_mask;
/* All event sources subscribed to this inode */
LIST_HEAD(sd_event_source, event_sources);
/* The inotify object we watch this inode with */
struct inotify_data *inotify_data;
/* A linked list of all inode data objects with fds to close (see above) */
LIST_FIELDS(struct inode_data, to_close);
};
/* A structure encapsulating an inotify fd */
struct inotify_data {
WakeupType wakeup;
/* For each priority we maintain one inotify fd, so that we only have to dequeue a single event per priority at
* a time */
int fd;
int64_t priority;
Hashmap *inodes; /* The inode_data structures keyed by dev+ino */
Hashmap *wd; /* The inode_data structures keyed by the watch descriptor for each */
/* The buffer we read inotify events into */
union inotify_event_buffer buffer;
size_t buffer_filled; /* fill level of the buffer */
/* How many event sources are currently marked pending for this inotify. We won't read new events off the
* inotify fd as long as there are still pending events on the inotify (because we have no strategy of queuing
* the events locally if they can't be coalesced). */
unsigned n_pending;
/* A linked list of all inotify objects with data already read, that still need processing. We keep this list
* to make it efficient to figure out what inotify objects to process data on next. */
LIST_FIELDS(struct inotify_data, buffered);
};