diff --git a/meson.build b/meson.build index 307d1bd5f7..9c915323c4 100644 --- a/meson.build +++ b/meson.build @@ -3338,6 +3338,7 @@ foreach tuple : tests type = tuple.length() >= 5 ? tuple[4] : '' defs = tuple.length() >= 6 ? tuple[5] : [] incs = tuple.length() >= 7 ? tuple[6] : includes + parallel = tuple.length() >= 8 ? tuple[7] : true timeout = 30 name = sources[0].split('/')[-1].split('.')[0] diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index cbd3357219..1820b81e11 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -109,31 +109,6 @@ not_found: } #if HAVE_BLKID -/* Detect RPMB and Boot partitions, which are not listed by blkid. - * See https://github.com/systemd/systemd/issues/5806. */ -static bool device_is_mmc_special_partition(sd_device *d) { - const char *sysname; - - assert(d); - - if (sd_device_get_sysname(d, &sysname) < 0) - return false; - - return startswith(sysname, "mmcblk") && - (endswith(sysname, "rpmb") || endswith(sysname, "boot0") || endswith(sysname, "boot1")); -} - -static bool device_is_block(sd_device *d) { - const char *ss; - - assert(d); - - if (sd_device_get_subsystem(d, &ss) < 0) - return false; - - return streq(ss, "block"); -} - static int enumerator_for_parent(sd_device *d, sd_device_enumerator **ret) { _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; int r; @@ -157,124 +132,217 @@ static int enumerator_for_parent(sd_device *d, sd_device_enumerator **ret) { return 0; } -static int wait_for_partitions_to_appear( - int fd, - sd_device *d, - unsigned num_partitions, - DissectImageFlags flags, - sd_device_enumerator **ret_enumerator) { +static int device_is_partition(sd_device *d, blkid_partition pp) { + blkid_loff_t bsize, bstart; + uint64_t size, start; + int partno, bpartno, r; + const char *ss, *v; - _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; - sd_device *q; - unsigned n; - int r; - - assert(fd >= 0); assert(d); - assert(ret_enumerator); + assert(pp); - r = enumerator_for_parent(d, &e); + r = sd_device_get_subsystem(d, &ss); + if (r < 0) + return r; + if (!streq(ss, "block")) + return false; + + r = sd_device_get_sysattr_value(d, "partition", &v); + if (r == -ENOENT) /* Not a partition device */ + return false; + if (r < 0) + return r; + r = safe_atoi(v, &partno); if (r < 0) return r; - /* Count the partitions enumerated by the kernel */ - n = 0; - FOREACH_DEVICE(e, q) { - if (sd_device_get_devnum(q, NULL) < 0) - continue; - if (!device_is_block(q)) - continue; - if (device_is_mmc_special_partition(q)) - continue; + errno = 0; + bpartno = blkid_partition_get_partno(pp); + if (bpartno < 0) + return errno_or_else(EIO); - if (!FLAGS_SET(flags, DISSECT_IMAGE_NO_UDEV)) { - r = device_wait_for_initialization(q, "block", USEC_INFINITY, NULL); - if (r < 0) - return r; - } + if (partno != bpartno) + return false; - n++; - } + r = sd_device_get_sysattr_value(d, "start", &v); + if (r < 0) + return r; + r = safe_atou64(v, &start); + if (r < 0) + return r; - if (n == num_partitions + 1) { - *ret_enumerator = TAKE_PTR(e); - return 0; /* success! */ - } - if (n > num_partitions + 1) - return log_debug_errno(SYNTHETIC_ERRNO(EIO), - "blkid and kernel partition lists do not match."); + errno = 0; + bstart = blkid_partition_get_start(pp); + if (bstart < 0) + return errno_or_else(EIO); - /* The kernel has probed fewer partitions than blkid? Maybe the kernel prober is still running or it - * got EBUSY because udev already opened the device. Let's reprobe the device, which is a synchronous - * call that waits until probing is complete. */ + if (start != (uint64_t) bstart) + return false; - for (unsigned j = 0; ; j++) { - if (j++ > 20) - return -EBUSY; + r = sd_device_get_sysattr_value(d, "size", &v); + if (r < 0) + return r; + r = safe_atou64(v, &size); + if (r < 0) + return r; - if (ioctl(fd, BLKRRPART, 0) >= 0) - break; - r = -errno; - if (r == -EINVAL) { - /* If we are running on a block device that has partition scanning off, return an - * explicit recognizable error about this, so that callers can generate a proper - * message explaining the situation. */ + errno = 0; + bsize = blkid_partition_get_size(pp); + if (bsize < 0) + return errno_or_else(EIO); - r = blockdev_partscan_enabled(fd); - if (r < 0) - return r; - if (r == 0) - return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), - "Device is a loop device and partition scanning is off!"); + if (size != (uint64_t) bsize) + return false; - return -EINVAL; /* original error */ - } - if (r != -EBUSY) - return r; - - /* If something else has the device open, such as an udev rule, the ioctl will return - * EBUSY. Since there's no way to wait until it isn't busy anymore, let's just wait a bit, - * and try again. - * - * This is really something they should fix in the kernel! */ - (void) usleep(50 * USEC_PER_MSEC); - - } - - return -EAGAIN; /* no success yet, try again */ + return true; } -static int loop_wait_for_partitions_to_appear( - int fd, - sd_device *d, - unsigned num_partitions, - DissectImageFlags flags, - sd_device_enumerator **ret_enumerator) { - _cleanup_(sd_device_unrefp) sd_device *device = NULL; +static int find_partition( + sd_device *parent, + blkid_partition pp, + sd_device **ret) { + + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + sd_device *q; int r; - assert(fd >= 0); - assert(d); - assert(ret_enumerator); + assert(parent); + assert(pp); + assert(ret); - log_debug("Waiting for device (parent + %d partitions) to appear...", num_partitions); + r = enumerator_for_parent(parent, &e); + if (r < 0) + return r; - if (!FLAGS_SET(flags, DISSECT_IMAGE_NO_UDEV)) { - r = device_wait_for_initialization(d, "block", USEC_INFINITY, &device); + FOREACH_DEVICE(e, q) { + r = device_is_partition(q, pp); if (r < 0) return r; - } else - device = sd_device_ref(d); + if (r > 0) { + *ret = sd_device_ref(q); + return 0; + } + } - for (unsigned i = 0; i < N_DEVICE_NODE_LIST_ATTEMPTS; i++) { - r = wait_for_partitions_to_appear(fd, device, num_partitions, flags, ret_enumerator); - if (r != -EAGAIN) + return -ENXIO; +} + +struct wait_data { + sd_device *parent_device; + blkid_partition blkidp; + sd_device *found; +}; + +static inline void wait_data_done(struct wait_data *d) { + sd_device_unref(d->found); +} + +static int device_monitor_handler(sd_device_monitor *monitor, sd_device *device, void *userdata) { + const char *parent1_path, *parent2_path; + struct wait_data *w = userdata; + sd_device *pp; + int r; + + assert(w); + + if (device_for_action(device, DEVICE_ACTION_REMOVE)) + return 0; + + r = sd_device_get_parent(device, &pp); + if (r < 0) + return 0; /* Doesn't have a parent? No relevant to us */ + + r = sd_device_get_syspath(pp, &parent1_path); /* Check parent of device of this action */ + if (r < 0) + goto finish; + + r = sd_device_get_syspath(w->parent_device, &parent2_path); /* Check parent of device we are looking for */ + if (r < 0) + goto finish; + + if (!path_equal(parent1_path, parent2_path)) + return 0; /* Has a different parent than what we need, not interesting to us */ + + r = device_is_partition(device, w->blkidp); + if (r < 0) + goto finish; + if (r == 0) /* Not the one we need */ + return 0; + + /* It's the one we need! Yay! */ + assert(!w->found); + w->found = sd_device_ref(device); + r = 0; + +finish: + return sd_event_exit(sd_device_monitor_get_event(monitor), r); +} + +static int wait_for_partition_device( + sd_device *parent, + blkid_partition pp, + usec_t deadline, + sd_device **ret) { + + _cleanup_(sd_event_source_unrefp) sd_event_source *timeout_source = NULL; + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + int r; + + assert(parent); + assert(pp); + assert(ret); + + r = find_partition(parent, pp, ret); + if (r != -ENXIO) + return r; + + r = sd_event_new(&event); + if (r < 0) + return r; + + r = sd_device_monitor_new(&monitor); + if (r < 0) + return r; + + r = sd_device_monitor_filter_add_match_subsystem_devtype(monitor, "block", "partition"); + if (r < 0) + return r; + + r = sd_device_monitor_attach_event(monitor, event); + if (r < 0) + return r; + + _cleanup_(wait_data_done) struct wait_data w = { + .parent_device = parent, + .blkidp = pp, + }; + + r = sd_device_monitor_start(monitor, device_monitor_handler, &w); + if (r < 0) + return r; + + /* Check again, the partition might have appeared in the meantime */ + r = find_partition(parent, pp, ret); + if (r != -ENXIO) + return r; + + if (deadline != USEC_INFINITY) { + r = sd_event_add_time( + event, &timeout_source, + CLOCK_MONOTONIC, deadline, 0, + NULL, INT_TO_PTR(-ETIMEDOUT)); + if (r < 0) return r; } - return log_debug_errno(SYNTHETIC_ERRNO(ENXIO), - "Kernel partitions dit not appear within %d attempts", - N_DEVICE_NODE_LIST_ATTEMPTS); + r = sd_event_loop(event); + if (r < 0) + return r; + + assert(w.found); + *ret = TAKE_PTR(w.found); + return 0; } static void check_partition_flags( @@ -300,8 +368,90 @@ static void check_partition_flags( } } +static int device_wait_for_initialization_harder( + sd_device *device, + const char *subsystem, + usec_t deadline, + sd_device **ret) { + + _cleanup_free_ char *uevent = NULL; + usec_t start, left, retrigger_timeout; + int r; + + start = now(CLOCK_MONOTONIC); + left = usec_sub_unsigned(deadline, start); + + if (DEBUG_LOGGING) { + char buf[FORMAT_TIMESPAN_MAX]; + const char *sn = NULL; + + (void) sd_device_get_sysname(device, &sn); + log_debug("Waiting for device '%s' to initialize for %s.", strna(sn), format_timespan(buf, sizeof(buf), left, 0)); + } + + if (left != USEC_INFINITY) + retrigger_timeout = CLAMP(left / 4, 1 * USEC_PER_SEC, 5 * USEC_PER_SEC); /* A fourth of the total timeout, but let's clamp to 1s…5s range */ + else + retrigger_timeout = 2 * USEC_PER_SEC; + + for (;;) { + usec_t local_deadline, n; + bool last_try; + + n = now(CLOCK_MONOTONIC); + assert(n >= start); + + /* Find next deadline, when we'll retrigger */ + local_deadline = start + + DIV_ROUND_UP(n - start, retrigger_timeout) * retrigger_timeout; + + if (deadline != USEC_INFINITY && deadline <= local_deadline) { + local_deadline = deadline; + last_try = true; + } else + last_try = false; + + r = device_wait_for_initialization(device, subsystem, local_deadline, ret); + if (r >= 0 && DEBUG_LOGGING) { + char buf[FORMAT_TIMESPAN_MAX]; + const char *sn = NULL; + + (void) sd_device_get_sysname(device, &sn); + log_debug("Successfully waited for device '%s' to initialize for %s.", strna(sn), format_timespan(buf, sizeof(buf), usec_sub_unsigned(now(CLOCK_MONOTONIC), start), 0)); + + } + if (r != -ETIMEDOUT || last_try) + return r; + + if (!uevent) { + const char *syspath; + + r = sd_device_get_syspath(device, &syspath); + if (r < 0) + return r; + + uevent = path_join(syspath, "uevent"); + if (!uevent) + return -ENOMEM; + } + + if (DEBUG_LOGGING) { + char buf[FORMAT_TIMESPAN_MAX]; + + log_debug("Device didn't initialize within %s, assuming lost event. Retriggering device through %s.", + format_timespan(buf, sizeof(buf), usec_sub_unsigned(now(CLOCK_MONOTONIC), start), 0), + uevent); + } + + r = write_string_file(uevent, "change", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return r; + } +} #endif +#define DEVICE_TIMEOUT_USEC (45 * USEC_PER_SEC) + int dissect_image( int fd, const VeritySettings *verity, @@ -312,7 +462,6 @@ int dissect_image( #if HAVE_BLKID sd_id128_t root_uuid = SD_ID128_NULL, root_verity_uuid = SD_ID128_NULL, usr_uuid = SD_ID128_NULL, usr_verity_uuid = SD_ID128_NULL; - _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; bool is_gpt, is_mbr, generic_rw, multiple_generic = false; _cleanup_(sd_device_unrefp) sd_device *d = NULL; _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL; @@ -321,9 +470,9 @@ int dissect_image( sd_id128_t generic_uuid = SD_ID128_NULL; const char *pttype = NULL; blkid_partlist pl; - int r, generic_nr; + int r, generic_nr, n_partitions; struct stat st; - sd_device *q; + usec_t deadline; assert(fd >= 0); assert(ret); @@ -370,6 +519,27 @@ int dissect_image( if (!S_ISBLK(st.st_mode)) return -ENOTBLK; + r = sd_device_new_from_devnum(&d, 'b', st.st_rdev); + if (r < 0) + return r; + + if (!FLAGS_SET(flags, DISSECT_IMAGE_NO_UDEV)) { + _cleanup_(sd_device_unrefp) sd_device *initialized = NULL; + + /* If udev support is enabled, then let's wait for the device to be initialized before we doing anything. */ + + r = device_wait_for_initialization_harder( + d, + "block", + usec_add(now(CLOCK_MONOTONIC), DEVICE_TIMEOUT_USEC), + &initialized); + if (r < 0) + return r; + + sd_device_unref(d); + d = TAKE_PTR(initialized); + } + b = blkid_new_probe(); if (!b) return -ENOMEM; @@ -399,10 +569,6 @@ int dissect_image( if (!m) return -ENOMEM; - r = sd_device_new_from_devnum(&d, 'b', st.st_rdev); - if (r < 0) - return r; - if ((!(flags & DISSECT_IMAGE_GPT_ONLY) && (flags & DISSECT_IMAGE_REQUIRE_ROOT)) || (flags & DISSECT_IMAGE_NO_PARTITION_TABLE)) { @@ -412,8 +578,8 @@ int dissect_image( (void) blkid_probe_lookup_value(b, "USAGE", &usage, NULL); if (STRPTR_IN_SET(usage, "filesystem", "crypto")) { + const char *fstype = NULL, *options = NULL, *devname = NULL; _cleanup_free_ char *t = NULL, *n = NULL, *o = NULL; - const char *fstype = NULL, *options = NULL; /* OK, we have found a file system, that's our root partition then. */ (void) blkid_probe_lookup_value(b, "TYPE", &fstype, NULL); @@ -424,10 +590,14 @@ int dissect_image( return -ENOMEM; } - r = device_path_make_major_minor(st.st_mode, st.st_rdev, &n); + r = sd_device_get_devname(d, &devname); if (r < 0) return r; + n = strdup(devname); + if (!n) + return -ENOMEM; + m->single_file_system = true; m->verity = verity && verity->root_hash && verity->data_path && (verity->designator < 0 || verity->designator == PARTITION_ROOT); m->can_verity = verity && verity->data_path; @@ -451,13 +621,7 @@ int dissect_image( m->encrypted = streq_ptr(fstype, "crypto_LUKS"); - /* Even on a single partition we need to wait for udev to create the - * /dev/block/X:Y symlink to /dev/loopZ */ - r = loop_wait_for_partitions_to_appear(fd, d, 0, flags, &e); - if (r < 0) - return r; *ret = TAKE_PTR(m); - return 0; } } @@ -472,48 +636,51 @@ int dissect_image( if (!is_gpt && ((flags & DISSECT_IMAGE_GPT_ONLY) || !is_mbr)) return -ENOPKG; + /* Safety check: refuse block devices that carry a partition table but for which the kernel doesn't + * do partition scanning. */ + r = blockdev_partscan_enabled(fd); + if (r < 0) + return r; + if (r == 0) + return -EPROTONOSUPPORT; + errno = 0; pl = blkid_probe_get_partitions(b); if (!pl) return errno_or_else(ENOMEM); - r = loop_wait_for_partitions_to_appear(fd, d, blkid_partlist_numof_partitions(pl), flags, &e); - if (r < 0) - return r; + errno = 0; + n_partitions = blkid_partlist_numof_partitions(pl); + if (n_partitions < 0) + return errno_or_else(EIO); - FOREACH_DEVICE(e, q) { + deadline = usec_add(now(CLOCK_MONOTONIC), DEVICE_TIMEOUT_USEC); + for (int i = 0; i < n_partitions; i++) { + _cleanup_(sd_device_unrefp) sd_device *q = NULL; unsigned long long pflags; blkid_partition pp; const char *node; - dev_t qn; int nr; - r = sd_device_get_devnum(q, &qn); + errno = 0; + pp = blkid_partlist_get_partition(pl, i); + if (!pp) + return errno_or_else(EIO); + + r = wait_for_partition_device(d, pp, deadline, &q); if (r < 0) - continue; - - if (st.st_rdev == qn) - continue; - - if (!device_is_block(q)) - continue; - - if (device_is_mmc_special_partition(q)) - continue; + return r; r = sd_device_get_devname(q, &node); if (r < 0) - continue; - - pp = blkid_partlist_devno_to_partition(pl, qn); - if (!pp) - continue; + return r; pflags = blkid_partition_get_flags(pp); + errno = 0; nr = blkid_partition_get_partno(pp); if (nr < 0) - continue; + return errno_or_else(EIO); if (is_gpt) { PartitionDesignator designator = _PARTITION_DESIGNATOR_INVALID; @@ -1643,7 +1810,7 @@ static int verity_partition( if (r == 0) { /* devmapper might say that the device exists, but the devlink might not yet have been * created. Check and wait for the udev event in that case. */ - r = device_wait_for_devlink(node, "block", 100 * USEC_PER_MSEC, NULL); + r = device_wait_for_devlink(node, "block", usec_add(now(CLOCK_MONOTONIC), 100 * USEC_PER_MSEC), NULL); /* Fallback to activation with a unique device if it's taking too long */ if (r == -ETIMEDOUT) break; diff --git a/src/shared/loop-util.c b/src/shared/loop-util.c index 4843cbcaab..1c92fdbf5f 100644 --- a/src/shared/loop-util.c +++ b/src/shared/loop-util.c @@ -13,17 +13,22 @@ #include #include +#include "sd-device.h" + #include "alloc-util.h" #include "blockdev-util.h" +#include "device-util.h" #include "errno-util.h" #include "fd-util.h" #include "fileio.h" #include "loop-util.h" #include "missing_loop.h" #include "parse-util.h" +#include "random-util.h" #include "stat-util.h" #include "stdio-util.h" #include "string-util.h" +#include "tmpfile-util.h" static void cleanup_clear_loop_close(int *fd) { if (*fd < 0) @@ -33,66 +38,221 @@ static void cleanup_clear_loop_close(int *fd) { (void) safe_close(*fd); } -static int loop_configure(int fd, const struct loop_config *c) { +static int loop_is_bound(int fd) { + struct loop_info64 info; + + assert(fd >= 0); + + if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0) { + if (errno == ENXIO) + return false; /* not bound! */ + + return -errno; + } + + return true; /* bound! */ +} + +static int device_has_block_children(sd_device *d) { + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL; + const char *main_sn, *main_ss; + sd_device *q; + int r; + + assert(d); + + /* Checks if the specified device currently has block device children (i.e. partition block + * devices). */ + + r = sd_device_get_sysname(d, &main_sn); + if (r < 0) + return r; + + r = sd_device_get_subsystem(d, &main_ss); + if (r < 0) + return r; + + if (!streq(main_ss, "block")) + return -EINVAL; + + r = sd_device_enumerator_new(&e); + if (r < 0) + return r; + + r = sd_device_enumerator_allow_uninitialized(e); + if (r < 0) + return r; + + r = sd_device_enumerator_add_match_parent(e, d); + if (r < 0) + return r; + + FOREACH_DEVICE(e, q) { + const char *ss, *sn; + + r = sd_device_get_subsystem(q, &ss); + if (r < 0) + continue; + + if (!streq(ss, "block")) + continue; + + r = sd_device_get_sysname(q, &sn); + if (r < 0) + continue; + + if (streq(sn, main_sn)) + continue; + + return 1; /* we have block device children */ + } + + return 0; +} + +static int loop_configure( + int fd, + int nr, + const struct loop_config *c, + bool *try_loop_configure) { + + _cleanup_(sd_device_unrefp) sd_device *d = NULL; + _cleanup_free_ char *sysname = NULL; + _cleanup_close_ int lock_fd = -1; int r; assert(fd >= 0); + assert(nr >= 0); assert(c); + assert(try_loop_configure); - if (ioctl(fd, LOOP_CONFIGURE, c) < 0) { - /* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other errors. Note that - * the kernel is weird: non-existing ioctls currently return EINVAL rather than ENOTTY on - * loopback block devices. They should fix that in the kernel, but in the meantime we accept - * both here. */ - if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL) - return -errno; - } else { - bool good = true; + if (asprintf(&sysname, "loop%i", nr) < 0) + return -ENOMEM; - if (c->info.lo_sizelimit != 0) { - /* Kernel 5.8 vanilla doesn't properly propagate the size limit into the block - * device. If it's used, let's immediately check if it had the desired effect - * hence. And if not use classic LOOP_SET_STATUS64. */ - uint64_t z; + r = sd_device_new_from_subsystem_sysname(&d, "block", sysname); + if (r < 0) + return r; - if (ioctl(fd, BLKGETSIZE64, &z) < 0) { - r = -errno; - goto fail; - } + /* Let's lock the device before we do anything. We take the BSD lock on a second, separately opened + * fd for the device. udev after all watches for close() events (specifically IN_CLOSE_WRITE) on + * block devices to reprobe them, hence by having a separate fd we will later close() we can ensure + * we trigger udev after everything is done. If we'd lock our own fd instead and keep it open for a + * long time udev would possibly never run on it again, even though the fd is unlocked, simply + * because we never close() it. It also has the nice benefit we can use the _cleanup_close_ logic to + * automatically release the lock, after we are done. */ + lock_fd = fd_reopen(fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (lock_fd < 0) + return lock_fd; + if (flock(lock_fd, LOCK_EX) < 0) + return -errno; - if (z != c->info.lo_sizelimit) { - log_debug("LOOP_CONFIGURE is broken, doesn't honour .lo_sizelimit. Falling back to LOOP_SET_STATUS64."); - good = false; - } - } + /* Let's see if the device is really detached, i.e. currently has no associated partition block + * devices. On various kernels (such as 5.8) it is possible to have a loopback block device that + * superficially is detached but still has partition block devices associated for it. They only go + * away when the device is reattached. (Yes, LOOP_CLR_FD doesn't work then, because officially + * nothing is attached and LOOP_CTL_REMOVE doesn't either, since it doesn't care about partition + * block devices. */ + r = device_has_block_children(d); + if (r < 0) + return r; + if (r > 0) { + r = loop_is_bound(fd); + if (r < 0) + return r; + if (r > 0) + return -EBUSY; - if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) { - /* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag into the - * block device. Let's hence verify if things work correctly here before - * returning. */ - - r = blockdev_partscan_enabled(fd); - if (r < 0) - goto fail; - if (r == 0) { - log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. Falling back to LOOP_SET_STATUS64."); - good = false; - } - } - - if (good) - return 0; - - /* Otherwise, undo the attachment and use the old APIs */ - (void) ioctl(fd, LOOP_CLR_FD); + return -EUCLEAN; /* Bound but children? Tell caller to reattach something so that the + * partition block devices are gone too. */ } + if (*try_loop_configure) { + if (ioctl(fd, LOOP_CONFIGURE, c) < 0) { + /* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other + * errors. Note that the kernel is weird: non-existing ioctls currently return EINVAL + * rather than ENOTTY on loopback block devices. They should fix that in the kernel, + * but in the meantime we accept both here. */ + if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL) + return -errno; + + *try_loop_configure = false; + } else { + bool good = true; + + if (c->info.lo_sizelimit != 0) { + /* Kernel 5.8 vanilla doesn't properly propagate the size limit into the + * block device. If it's used, let's immediately check if it had the desired + * effect hence. And if not use classic LOOP_SET_STATUS64. */ + uint64_t z; + + if (ioctl(fd, BLKGETSIZE64, &z) < 0) { + r = -errno; + goto fail; + } + + if (z != c->info.lo_sizelimit) { + log_debug("LOOP_CONFIGURE is broken, doesn't honour .lo_sizelimit. Falling back to LOOP_SET_STATUS64."); + good = false; + } + } + + if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) { + /* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag + * into the block device. Let's hence verify if things work correctly here + * before returning. */ + + r = blockdev_partscan_enabled(fd); + if (r < 0) + goto fail; + if (r == 0) { + log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. Falling back to LOOP_SET_STATUS64."); + good = false; + } + } + + if (!good) { + /* LOOP_CONFIGURE doesn't work. Remember that. */ + *try_loop_configure = false; + + /* We return EBUSY here instead of retrying immediately with LOOP_SET_FD, + * because LOOP_CLR_FD is async: if the operation cannot be executed right + * away it just sets the autoclear flag on the device. This means there's a + * good chance we cannot actually reuse the loopback device right-away. Hence + * let's assume it's busy, avoid the trouble and let the calling loop call us + * again with a new, likely unused device. */ + r = -EBUSY; + goto fail; + } + + return 0; + } + } + + /* Since kernel commit 5db470e229e22b7eda6e23b5566e532c96fb5bc3 (kernel v5.0) the LOOP_SET_STATUS64 + * ioctl can return EAGAIN in case we change the lo_offset field, if someone else is accessing the + * block device while we try to reconfigure it. This is a pretty common case, since udev might + * instantly start probing the device as soon as we attach an fd to it. Hence handle it in two ways: + * first, let's take the BSD lock that that ensures that udev will not step in between the point in + * time where we attach the fd and where we reconfigure the device. Secondly, let's wait 50ms on + * EAGAIN and retry. The former should be an efficient mechanism to avoid we have to wait 50ms + * needlessly if we are just racing against udev. The latter is protection against all other cases, + * i.e. peers that do not take the BSD lock. */ + if (ioctl(fd, LOOP_SET_FD, c->fd) < 0) return -errno; - if (ioctl(fd, LOOP_SET_STATUS64, &c->info) < 0) { - r = -errno; - goto fail; + for (unsigned n_attempts = 0;;) { + if (ioctl(fd, LOOP_SET_STATUS64, &c->info) >= 0) + break; + if (errno != EAGAIN || ++n_attempts >= 64) { + r = log_debug_errno(errno, "Failed to configure loopback device: %m"); + goto fail; + } + + /* Sleep some random time, but at least 10ms, at most 250ms. Increase the delay the more + * failed attempts we see */ + (void) usleep(UINT64_C(10) * USEC_PER_MSEC + + random_u64() % (UINT64_C(240) * USEC_PER_MSEC * n_attempts/64)); } return 0; @@ -102,6 +262,44 @@ fail: return r; } +static int attach_empty_file(int loop, int nr) { + _cleanup_close_ int fd = -1; + + /* So here's the thing: on various kernels (5.8 at least) loop block devices might enter a state + * where they are detached but nonetheless have partitions, when used heavily. Accessing these + * partitions results in immediatey IO errors. There's no pretty way to get rid of them + * again. Neither LOOP_CLR_FD nor LOOP_CTL_REMOVE suffice (see above). What does work is to + * reassociate them with a new fd however. This is what we do here hence: we associate the devices + * with an empty file (i.e. an image that definitely has no partitons). We then immediately clear it + * again. This suffices to make the partitions go away. Ugly but appears to work. */ + + log_debug("Found unattached loopback block device /dev/loop%i with partitions. Attaching empty file to remove them.", nr); + + fd = open_tmpfile_unlinkable(NULL, O_RDONLY); + if (fd < 0) + return fd; + + if (flock(loop, LOCK_EX) < 0) + return -errno; + + if (ioctl(loop, LOOP_SET_FD, fd) < 0) + return -errno; + + if (ioctl(loop, LOOP_SET_STATUS64, &(struct loop_info64) { + .lo_flags = LO_FLAGS_READ_ONLY| + LO_FLAGS_AUTOCLEAR| + LO_FLAGS_PARTSCAN, /* enable partscan, so that the partitions really go away */ + }) < 0) + return -errno; + + if (ioctl(loop, LOOP_CLR_FD) < 0) + return -errno; + + /* The caller is expected to immediately close the loopback device after this, so that the BSD lock + * is released, and udev sees the changes. */ + return 0; +} + int loop_device_make( int fd, int open_flags, @@ -111,6 +309,7 @@ int loop_device_make( LoopDevice **ret) { _cleanup_free_ char *loopdev = NULL; + bool try_loop_configure = true; struct loop_config config; LoopDevice *d = NULL; struct stat st; @@ -201,12 +400,17 @@ int loop_device_make( if (!IN_SET(errno, ENOENT, ENXIO)) return -errno; } else { - r = loop_configure(loop, &config); + r = loop_configure(loop, nr, &config, &try_loop_configure); if (r >= 0) { loop_with_fd = TAKE_FD(loop); break; } - if (r != -EBUSY) + if (r == -EUCLEAN) { + /* Make left-over partition disappear hack (see above) */ + r = attach_empty_file(loop, nr); + if (r < 0 && r != -EBUSY) + return r; + } else if (r != -EBUSY) return r; } @@ -214,6 +418,11 @@ int loop_device_make( return -EBUSY; loopdev = mfree(loopdev); + + /* Wait some random time, to make collision less likely. Let's pick a random time in the + * range 0ms…250ms, linearly scaled by the number of failed attempts. */ + (void) usleep(random_u64() % (UINT64_C(10) * USEC_PER_MSEC + + UINT64_C(240) * USEC_PER_MSEC * n_attempts/64)); } d = new(LoopDevice, 1); diff --git a/src/shared/mount-util.h b/src/shared/mount-util.h index ba5d6280d2..63de2c7dff 100644 --- a/src/shared/mount-util.h +++ b/src/shared/mount-util.h @@ -89,10 +89,11 @@ int mount_option_mangle( int mode_to_inaccessible_node(const char *runtime_dir, mode_t mode, char **dest); /* Useful for usage with _cleanup_(), unmounts, removes a directory and frees the pointer */ -static inline void umount_and_rmdir_and_free(char *p) { +static inline char* umount_and_rmdir_and_free(char *p) { PROTECT_ERRNO; (void) umount_recursive(p, 0); (void) rmdir(p); free(p); + return NULL; } DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_rmdir_and_free); diff --git a/src/shared/udev-util.c b/src/shared/udev-util.c index 98bbfb2ae3..33272de4bc 100644 --- a/src/shared/udev-util.c +++ b/src/shared/udev-util.c @@ -195,8 +195,9 @@ static int device_wait_for_initialization_internal( sd_device *_device, const char *devlink, const char *subsystem, - usec_t timeout, + usec_t deadline, sd_device **ret) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL; _cleanup_(sd_event_source_unrefp) sd_event_source *timeout_source = NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; @@ -256,10 +257,10 @@ static int device_wait_for_initialization_internal( if (r < 0) return log_error_errno(r, "Failed to start device monitor: %m"); - if (timeout != USEC_INFINITY) { - r = sd_event_add_time_relative( + if (deadline != USEC_INFINITY) { + r = sd_event_add_time( event, &timeout_source, - CLOCK_MONOTONIC, timeout, 0, + CLOCK_MONOTONIC, deadline, 0, NULL, INT_TO_PTR(-ETIMEDOUT)); if (r < 0) return log_error_errno(r, "Failed to add timeout event source: %m"); @@ -287,12 +288,12 @@ static int device_wait_for_initialization_internal( return 0; } -int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t timeout, sd_device **ret) { - return device_wait_for_initialization_internal(device, NULL, subsystem, timeout, ret); +int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t deadline, sd_device **ret) { + return device_wait_for_initialization_internal(device, NULL, subsystem, deadline, ret); } -int device_wait_for_devlink(const char *devlink, const char *subsystem, usec_t timeout, sd_device **ret) { - return device_wait_for_initialization_internal(NULL, devlink, subsystem, timeout, ret); +int device_wait_for_devlink(const char *devlink, const char *subsystem, usec_t deadline, sd_device **ret) { + return device_wait_for_initialization_internal(NULL, devlink, subsystem, deadline, ret); } int device_is_renaming(sd_device *dev) { diff --git a/src/shared/udev-util.h b/src/shared/udev-util.h index 04c7ce5520..58803ea522 100644 --- a/src/shared/udev-util.h +++ b/src/shared/udev-util.h @@ -28,7 +28,7 @@ static inline int udev_parse_config(void) { return udev_parse_config_full(NULL, NULL, NULL, NULL, NULL); } -int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t timeout, sd_device **ret); -int device_wait_for_devlink(const char *path, const char *subsystem, usec_t timeout, sd_device **ret); +int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t deadline, sd_device **ret); +int device_wait_for_devlink(const char *path, const char *subsystem, usec_t deadline, sd_device **ret); int device_is_renaming(sd_device *dev); bool device_for_action(sd_device *dev, DeviceAction action); diff --git a/src/test/meson.build b/src/test/meson.build index ac1182e37a..daf62eb539 100644 --- a/src/test/meson.build +++ b/src/test/meson.build @@ -433,6 +433,17 @@ tests += [ [], []], + [['src/test/test-loop-block.c'], + [libcore, + libshared], + [threads, + libblkid], + '', + '', + [], + includes, + false], + [['src/test/test-selinux.c'], [], []], diff --git a/src/test/test-loop-block.c b/src/test/test-loop-block.c new file mode 100644 index 0000000000..b9533fc16a --- /dev/null +++ b/src/test/test-loop-block.c @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include +#include +#include + +#include "alloc-util.h" +#include "dissect-image.h" +#include "fd-util.h" +#include "fileio.h" +#include "fs-util.h" +#include "gpt.h" +#include "missing_loop.h" +#include "mkfs-util.h" +#include "mount-util.h" +#include "namespace-util.h" +#include "string-util.h" +#include "strv.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "user-util.h" +#include "virt.h" + +#define N_THREADS 5 +#define N_ITERATIONS 3 + +static usec_t end = 0; + +static void* thread_func(void *ptr) { + int fd = PTR_TO_FD(ptr); + int r; + + for (unsigned i = 0; i < N_ITERATIONS; i++) { + _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *mounted = NULL; + _cleanup_(dissected_image_unrefp) DissectedImage *dissected = NULL; + + if (now(CLOCK_MONOTONIC) >= end) { + log_notice("Time's up, exiting thread's loop"); + break; + } + + log_notice("> Thread iteration #%u.", i); + + assert_se(mkdtemp_malloc(NULL, &mounted) >= 0); + + r = loop_device_make(fd, O_RDONLY, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, &loop); + if (r < 0) + log_error_errno(r, "Failed to allocate loopback device: %m"); + assert_se(r >= 0); + + log_notice("Acquired loop device %s, will mount on %s", loop->node, mounted); + + r = dissect_image(loop->fd, NULL, NULL, DISSECT_IMAGE_READ_ONLY, &dissected); + if (r < 0) + log_error_errno(r, "Failed dissect loopback device %s: %m", loop->node); + assert_se(r >= 0); + + log_info("Dissected loop device %s", loop->node); + + for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) { + if (!dissected->partitions[d].found) + continue; + + log_notice("Found node %s fstype %s designator %s", + dissected->partitions[d].node, + dissected->partitions[d].fstype, + partition_designator_to_string(d)); + } + + assert_se(dissected->partitions[PARTITION_ESP].found); + assert_se(dissected->partitions[PARTITION_ESP].node); + assert_se(dissected->partitions[PARTITION_XBOOTLDR].found); + assert_se(dissected->partitions[PARTITION_XBOOTLDR].node); + assert_se(dissected->partitions[PARTITION_ROOT].found); + assert_se(dissected->partitions[PARTITION_ROOT].node); + assert_se(dissected->partitions[PARTITION_HOME].found); + assert_se(dissected->partitions[PARTITION_HOME].node); + + r = dissected_image_mount(dissected, mounted, UID_INVALID, DISSECT_IMAGE_READ_ONLY); + log_notice_errno(r, "Mounted %s → %s: %m", loop->node, mounted); + assert_se(r >= 0); + + log_notice("Unmounting %s", mounted); + mounted = umount_and_rmdir_and_free(mounted); + + log_notice("Unmounted."); + + dissected = dissected_image_unref(dissected); + + log_notice("Detaching loop device %s", loop->node); + loop = loop_device_unref(loop); + log_notice("Detached loop device."); + } + + log_notice("Leaving thread"); + + return NULL; +} + +static bool have_root_gpt_type(void) { +#ifdef GPT_ROOT_NATIVE + return true; +#else + return false; +#endif +} + +int main(int argc, char *argv[]) { + _cleanup_free_ char *p = NULL, *cmd = NULL; + _cleanup_(pclosep) FILE *sfdisk = NULL; + _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL; + _cleanup_close_ int fd = -1; + _cleanup_(dissected_image_unrefp) DissectedImage *dissected = NULL; + _cleanup_(umount_and_rmdir_and_freep) char *mounted = NULL; + pthread_t threads[N_THREADS]; + const char *fs; + sd_id128_t id; + int r; + + test_setup_logging(LOG_DEBUG); + log_show_tid(true); + log_show_time(true); + + if (!have_root_gpt_type()) { + log_tests_skipped("No root partition GPT defined for this architecture, exiting."); + return EXIT_TEST_SKIP; + } + + if (detect_container() > 0) { + log_tests_skipped("Test not supported in a container, requires udev/uevent notifications."); + return EXIT_TEST_SKIP; + } + + if (strstr_ptr(ci_environment(), "autopkgtest")) { + // FIXME: we should reenable this one day + log_tests_skipped("Skipping test on Ubuntu autopkgtest CI, test too slow and installed udev too flakey."); + return EXIT_TEST_SKIP; + } + + /* This is a test for the loopback block device setup code and it's use by the image dissection + * logic: since the kernel APIs are hard use and prone to races, let's test this in a heavy duty + * test: we open a bunch of threads and repeatedly allocate and deallocate loopback block devices in + * them in parallel, with an image file with a number of partitions. */ + + r = detach_mount_namespace(); + if (ERRNO_IS_PRIVILEGE(r)) { + log_tests_skipped("Lacking privileges"); + return EXIT_TEST_SKIP; + } + + FOREACH_STRING(fs, "vfat", "ext4") { + r = mkfs_exists(fs); + assert_se(r >= 0); + if (!r) { + log_tests_skipped("mkfs.{vfat|ext4} not installed"); + return EXIT_TEST_SKIP; + } + } + + assert_se(r >= 0); + + assert_se(tempfn_random_child("/var/tmp", "sfdisk", &p) >= 0); + fd = open(p, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC|O_NOFOLLOW, 0666); + assert_se(fd >= 0); + assert_se(ftruncate(fd, 256*1024*1024) >= 0); + + assert_se(cmd = strjoin("sfdisk ", p)); + assert_se(sfdisk = popen(cmd, "we")); + + /* A reasonably complex partition table that fits on a 64K disk */ + fputs("label: gpt\n" + "size=32M, type=C12A7328-F81F-11D2-BA4B-00A0C93EC93B\n" + "size=32M, type=BC13C2FF-59E6-4262-A352-B275FD6F7172\n" + "size=32M, type=0657FD6D-A4AB-43C4-84E5-0933C84B4F4F\n" + "size=32M, type=", sfdisk); + +#ifdef GPT_ROOT_NATIVE + fprintf(sfdisk, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(GPT_ROOT_NATIVE)); +#else + fprintf(sfdisk, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(GPT_ROOT_X86_64)); +#endif + + fputs("\n" + "size=32M, type=933AC7E1-2EB4-4F13-B844-0E14E2AEF915\n", sfdisk); + + assert_se(pclose(sfdisk) == 0); + sfdisk = NULL; + + assert_se(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, &loop) >= 0); + assert_se(dissect_image(loop->fd, NULL, NULL, 0, &dissected) >= 0); + + assert_se(dissected->partitions[PARTITION_ESP].found); + assert_se(dissected->partitions[PARTITION_ESP].node); + assert_se(dissected->partitions[PARTITION_XBOOTLDR].found); + assert_se(dissected->partitions[PARTITION_XBOOTLDR].node); + assert_se(dissected->partitions[PARTITION_ROOT].found); + assert_se(dissected->partitions[PARTITION_ROOT].node); + assert_se(dissected->partitions[PARTITION_HOME].found); + assert_se(dissected->partitions[PARTITION_HOME].node); + + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(make_filesystem(dissected->partitions[PARTITION_ESP].node, "vfat", "EFI", id, true) >= 0); + + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(make_filesystem(dissected->partitions[PARTITION_XBOOTLDR].node, "vfat", "xbootldr", id, true) >= 0); + + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(make_filesystem(dissected->partitions[PARTITION_ROOT].node, "ext4", "root", id, true) >= 0); + + assert_se(sd_id128_randomize(&id) >= 0); + assert_se(make_filesystem(dissected->partitions[PARTITION_HOME].node, "ext4", "home", id, true) >= 0); + + dissected = dissected_image_unref(dissected); + assert_se(dissect_image(loop->fd, NULL, NULL, 0, &dissected) >= 0); + + assert_se(mkdtemp_malloc(NULL, &mounted) >= 0); + + /* This first (writable) mount will initialize the mount point dirs, so that the subsequent read-only ones can work */ + assert_se(dissected_image_mount(dissected, mounted, UID_INVALID, 0) >= 0); + + assert_se(umount_recursive(mounted, 0) >= 0); + loop = loop_device_unref(loop); + + log_notice("Threads are being started now"); + + /* Let's make sure we run for 10s on slow systems at max */ + end = usec_add(now(CLOCK_MONOTONIC), + slow_tests_enabled() ? 5 * USEC_PER_SEC : + 1 * USEC_PER_SEC); + + for (unsigned i = 0; i < N_THREADS; i++) + assert_se(pthread_create(threads + i, NULL, thread_func, FD_TO_PTR(fd)) == 0); + + log_notice("All threads started now."); + + for (unsigned i = 0; i < N_THREADS; i++) { + log_notice("Joining thread #%u.", i); + + void *k; + assert_se(pthread_join(threads[i], &k) == 0); + assert_se(k == NULL); + + log_notice("Joined thread #%u.", i); + } + + log_notice("Threads are all terminated now."); + + return 0; +} diff --git a/src/udev/udevadm-info.c b/src/udev/udevadm-info.c index ae6d8caf54..27e9a226cb 100644 --- a/src/udev/udevadm-info.c +++ b/src/udev/udevadm-info.c @@ -493,7 +493,11 @@ int info_main(int argc, char *argv[], void *userdata) { if (arg_wait_for_initialization_timeout > 0) { sd_device *d; - r = device_wait_for_initialization(device, NULL, arg_wait_for_initialization_timeout, &d); + r = device_wait_for_initialization( + device, + NULL, + usec_add(now(CLOCK_MONOTONIC), arg_wait_for_initialization_timeout), + &d); if (r < 0) return r;