From bfde9421af1458e18999d787b1ab46a6a33e8bb6 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 6 Nov 2019 12:24:41 +0100 Subject: [PATCH 1/2] udevd: wait for workers to finish when exiting On some systems with lots of devices, device probing for certain drivers can take a very long time. If systemd-udevd detects a timeout and kills the worker running modprobe using SIGKILL, some devices will not be probed, or end up in unusable state. The --event-timeout option can be used to modify the maximum time spent in an uevent handler. But if systemd-udevd exits, it uses a different timeout, hard-coded to 30s, and exits when this timeout expires, causing all workers to be KILLed by systemd afterwards. In practice, this may lead to workers being killed after significantly less time than specified with the event-timeout. This is particularly significant during initrd processing: systemd-udevd will be stopped by systemd when initrd-switch-root.target is about to be isolated, which usually happens quickly after finding and mounting the root FS. If systemd-udevd is started by PID 1 (i.e. basically always), systemd will kill both udevd and the workers after expiry of TimeoutStopSec. This is actually better than the built-in udevd timeout, because it's more transparent and configurable for users. This way users can avoid the mentioned boot problem by simply increasing StopTimeoutSec= in systemd-udevd.service. If udevd is not started by systemd (standalone), this is still an improvement. udevd will kill hanging workers when the event timeout is reached, which is configurable via the udev.event_timeout= kernel command line parameter. Before this patch, udevd would simply exit with workers still running, which would then become zombie processes. With the timeout removed, the sd_event_now() assertion in manager_exit() can be dropped. --- NEWS | 11 +++++++++++ src/udev/udevd.c | 21 --------------------- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/NEWS b/NEWS index 56d05ac7ef..15a5ceb051 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,17 @@ systemd System and Service Manager CHANGES WITH 244 in spe: + * systemd-udevd: removed the 30s timeout for killing stale workers on + exit. systemd-udevd now waits for workers to finish. The hard-coded + exit timeout of 30s was too short for some large installations, where + driver initialization could be prematurely interrupted during initrd + processing if the root file system had been mounted and init was + preparing to switch root. If udevd is run without systemd and workers + are hanging while udevd receives an exit signal, udevd will now exit + when udev.event_timeout is reached for the last hanging worker. With + systemd, the exit timeout can additionally be configured using + TimeoutStopSec= in systemd-udevd.service. + * Support for the cpuset cgroups v2 controller has been added. Processes may be restricted to specific CPUs using the new AllowedCPUs= setting, and to specific memory NUMA nodes using the new diff --git a/src/udev/udevd.c b/src/udev/udevd.c index 0ae61c1e8b..144a20ec63 100644 --- a/src/udev/udevd.c +++ b/src/udev/udevd.c @@ -774,21 +774,7 @@ set_delaying_seqnum: return true; } -static int on_exit_timeout(sd_event_source *s, uint64_t usec, void *userdata) { - Manager *manager = userdata; - - assert(manager); - - log_error("Giving up waiting for workers to finish."); - sd_event_exit(manager->event, -ETIMEDOUT); - - return 1; -} - static void manager_exit(Manager *manager) { - uint64_t usec; - int r; - assert(manager); manager->exit = true; @@ -808,13 +794,6 @@ static void manager_exit(Manager *manager) { /* discard queued events and kill workers */ event_queue_cleanup(manager, EVENT_QUEUED); manager_kill_workers(manager); - - assert_se(sd_event_now(manager->event, CLOCK_MONOTONIC, &usec) >= 0); - - r = sd_event_add_time(manager->event, NULL, CLOCK_MONOTONIC, - usec + 30 * USEC_PER_SEC, USEC_PER_SEC, on_exit_timeout, manager); - if (r < 0) - return; } /* reload requested, HUP signal received, rules changed, builtin changed */ From 7b6596d7489421842af854ed16333ea747879732 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Tue, 12 Nov 2019 16:43:42 +0100 Subject: [PATCH 2/2] udevd: fix crash when workers time out after exit is signal caught If udevd receives an exit signal, it releases its reference on the udev monitor in manager_exit(). If at this time a worker is hanging, and if the event timeout for this worker expires before udevd exits, udevd crashes in on_sigchld()->udev_monitor_send_device(), because the monitor has already been freed. Fix this by releasing the main process's monitor ref later, in manager_free(). --- src/udev/udevd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/udev/udevd.c b/src/udev/udevd.c index 144a20ec63..2bb322796b 100644 --- a/src/udev/udevd.c +++ b/src/udev/udevd.c @@ -293,6 +293,8 @@ static void manager_free(Manager *manager) { if (!manager) return; + manager->monitor = sd_device_monitor_unref(manager->monitor); + udev_builtin_exit(); if (manager->pid == getpid_cached()) @@ -789,8 +791,6 @@ static void manager_exit(Manager *manager) { manager->inotify_event = sd_event_source_unref(manager->inotify_event); manager->fd_inotify = safe_close(manager->fd_inotify); - manager->monitor = sd_device_monitor_unref(manager->monitor); - /* discard queued events and kill workers */ event_queue_cleanup(manager, EVENT_QUEUED); manager_kill_workers(manager);