Merge pull request #16859 from poettering/loop-eagain

loop: handle EAGAIN on LOOP_SET_STATUS64
This commit is contained in:
Lennart Poettering 2020-10-23 13:15:04 +02:00 committed by GitHub
commit 17d99f95c2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 859 additions and 215 deletions

View File

@ -3338,6 +3338,7 @@ foreach tuple : tests
type = tuple.length() >= 5 ? tuple[4] : ''
defs = tuple.length() >= 6 ? tuple[5] : []
incs = tuple.length() >= 7 ? tuple[6] : includes
parallel = tuple.length() >= 8 ? tuple[7] : true
timeout = 30
name = sources[0].split('/')[-1].split('.')[0]

View File

@ -109,31 +109,6 @@ not_found:
}
#if HAVE_BLKID
/* RPMB and boot partitions of MMC devices are not listed by blkid, hence they are recognized by
 * their kernel name here. See https://github.com/systemd/systemd/issues/5806. */
static bool device_is_mmc_special_partition(sd_device *d) {
        const char *name;

        assert(d);

        /* A device whose sysname cannot be read is never considered special. */
        if (sd_device_get_sysname(d, &name) < 0)
                return false;

        if (!startswith(name, "mmcblk"))
                return false;

        return endswith(name, "rpmb") ||
                endswith(name, "boot0") ||
                endswith(name, "boot1");
}
/* Returns true if the subsystem of 'd' can be queried and is "block", false otherwise. */
static bool device_is_block(sd_device *d) {
        const char *subsystem;

        assert(d);

        return sd_device_get_subsystem(d, &subsystem) >= 0 &&
                streq(subsystem, "block");
}
static int enumerator_for_parent(sd_device *d, sd_device_enumerator **ret) {
_cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
int r;
@ -157,124 +132,217 @@ static int enumerator_for_parent(sd_device *d, sd_device_enumerator **ret) {
return 0;
}
static int wait_for_partitions_to_appear(
int fd,
sd_device *d,
unsigned num_partitions,
DissectImageFlags flags,
sd_device_enumerator **ret_enumerator) {
static int device_is_partition(sd_device *d, blkid_partition pp) {
blkid_loff_t bsize, bstart;
uint64_t size, start;
int partno, bpartno, r;
const char *ss, *v;
_cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
sd_device *q;
unsigned n;
int r;
assert(fd >= 0);
assert(d);
assert(ret_enumerator);
assert(pp);
r = enumerator_for_parent(d, &e);
r = sd_device_get_subsystem(d, &ss);
if (r < 0)
return r;
if (!streq(ss, "block"))
return false;
r = sd_device_get_sysattr_value(d, "partition", &v);
if (r == -ENOENT) /* Not a partition device */
return false;
if (r < 0)
return r;
r = safe_atoi(v, &partno);
if (r < 0)
return r;
/* Count the partitions enumerated by the kernel */
n = 0;
FOREACH_DEVICE(e, q) {
if (sd_device_get_devnum(q, NULL) < 0)
continue;
if (!device_is_block(q))
continue;
if (device_is_mmc_special_partition(q))
continue;
errno = 0;
bpartno = blkid_partition_get_partno(pp);
if (bpartno < 0)
return errno_or_else(EIO);
if (!FLAGS_SET(flags, DISSECT_IMAGE_NO_UDEV)) {
r = device_wait_for_initialization(q, "block", USEC_INFINITY, NULL);
if (r < 0)
return r;
}
if (partno != bpartno)
return false;
n++;
}
r = sd_device_get_sysattr_value(d, "start", &v);
if (r < 0)
return r;
r = safe_atou64(v, &start);
if (r < 0)
return r;
if (n == num_partitions + 1) {
*ret_enumerator = TAKE_PTR(e);
return 0; /* success! */
}
if (n > num_partitions + 1)
return log_debug_errno(SYNTHETIC_ERRNO(EIO),
"blkid and kernel partition lists do not match.");
errno = 0;
bstart = blkid_partition_get_start(pp);
if (bstart < 0)
return errno_or_else(EIO);
/* The kernel has probed fewer partitions than blkid? Maybe the kernel prober is still running or it
* got EBUSY because udev already opened the device. Let's reprobe the device, which is a synchronous
* call that waits until probing is complete. */
if (start != (uint64_t) bstart)
return false;
for (unsigned j = 0; ; j++) {
if (j++ > 20)
return -EBUSY;
r = sd_device_get_sysattr_value(d, "size", &v);
if (r < 0)
return r;
r = safe_atou64(v, &size);
if (r < 0)
return r;
if (ioctl(fd, BLKRRPART, 0) >= 0)
break;
r = -errno;
if (r == -EINVAL) {
/* If we are running on a block device that has partition scanning off, return an
* explicit recognizable error about this, so that callers can generate a proper
* message explaining the situation. */
errno = 0;
bsize = blkid_partition_get_size(pp);
if (bsize < 0)
return errno_or_else(EIO);
r = blockdev_partscan_enabled(fd);
if (r < 0)
return r;
if (r == 0)
return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT),
"Device is a loop device and partition scanning is off!");
if (size != (uint64_t) bsize)
return false;
return -EINVAL; /* original error */
}
if (r != -EBUSY)
return r;
/* If something else has the device open, such as an udev rule, the ioctl will return
* EBUSY. Since there's no way to wait until it isn't busy anymore, let's just wait a bit,
* and try again.
*
* This is really something they should fix in the kernel! */
(void) usleep(50 * USEC_PER_MSEC);
}
return -EAGAIN; /* no success yet, try again */
return true;
}
static int loop_wait_for_partitions_to_appear(
int fd,
sd_device *d,
unsigned num_partitions,
DissectImageFlags flags,
sd_device_enumerator **ret_enumerator) {
_cleanup_(sd_device_unrefp) sd_device *device = NULL;
static int find_partition(
sd_device *parent,
blkid_partition pp,
sd_device **ret) {
_cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
sd_device *q;
int r;
assert(fd >= 0);
assert(d);
assert(ret_enumerator);
assert(parent);
assert(pp);
assert(ret);
log_debug("Waiting for device (parent + %d partitions) to appear...", num_partitions);
r = enumerator_for_parent(parent, &e);
if (r < 0)
return r;
if (!FLAGS_SET(flags, DISSECT_IMAGE_NO_UDEV)) {
r = device_wait_for_initialization(d, "block", USEC_INFINITY, &device);
FOREACH_DEVICE(e, q) {
r = device_is_partition(q, pp);
if (r < 0)
return r;
} else
device = sd_device_ref(d);
if (r > 0) {
*ret = sd_device_ref(q);
return 0;
}
}
for (unsigned i = 0; i < N_DEVICE_NODE_LIST_ATTEMPTS; i++) {
r = wait_for_partitions_to_appear(fd, device, num_partitions, flags, ret_enumerator);
if (r != -EAGAIN)
return -ENXIO;
}
/* State shared between wait_for_partition_device() and device_monitor_handler(). */
struct wait_data {
/* Device whose syspath the parent of each incoming event device is compared against. Assigned
 * by wait_for_partition_device() without taking a reference. */
sd_device *parent_device;
/* blkid partition entry handed to device_is_partition() to recognize the matching device. */
blkid_partition blkidp;
/* Set (with a new reference) by device_monitor_handler() once a matching device shows up;
 * the reference is dropped by wait_data_done(). */
sd_device *found;
};
/* Cleanup helper for struct wait_data, used via _cleanup_(): drops the reference held in 'found'.
 * The other fields are left untouched. */
static inline void wait_data_done(struct wait_data *d) {
sd_device_unref(d->found);
}
/* Device monitor callback used while waiting for a partition block device to show up.
 *
 * 'userdata' points to a struct wait_data. Events that are not relevant are ignored by returning 0,
 * which leaves the event loop running. Once the device matching w->blkidp below w->parent_device
 * is seen, a reference to it is stored in w->found and the event loop is asked to exit with code 0.
 * If one of the checks fails, the event loop is asked to exit with that negative errno instead. */
static int device_monitor_handler(sd_device_monitor *monitor, sd_device *device, void *userdata) {
        const char *event_parent_path, *wanted_parent_path;
        struct wait_data *data = userdata;
        sd_device *event_parent;
        int r;

        assert(data);

        /* Removal events are of no interest here */
        if (device_for_action(device, DEVICE_ACTION_REMOVE))
                return 0;

        /* Doesn't have a parent? Not relevant to us */
        if (sd_device_get_parent(device, &event_parent) < 0)
                return 0;

        /* Look up the parent of the device this event is about, then the parent we are waiting below */
        r = sd_device_get_syspath(event_parent, &event_parent_path);
        if (r >= 0)
                r = sd_device_get_syspath(data->parent_device, &wanted_parent_path);
        if (r < 0)
                return sd_event_exit(sd_device_monitor_get_event(monitor), r);

        /* Has a different parent than what we need, not interesting to us */
        if (!path_equal(event_parent_path, wanted_parent_path))
                return 0;

        r = device_is_partition(device, data->blkidp);
        if (r == 0) /* Not the one we need */
                return 0;

        if (r > 0) {
                /* It's the one we need! Keep a reference and leave the event loop successfully. */
                assert(!data->found);
                data->found = sd_device_ref(device);
                r = 0;
        }

        return sd_event_exit(sd_device_monitor_get_event(monitor), r);
}
static int wait_for_partition_device(
sd_device *parent,
blkid_partition pp,
usec_t deadline,
sd_device **ret) {
_cleanup_(sd_event_source_unrefp) sd_event_source *timeout_source = NULL;
_cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
int r;
assert(parent);
assert(pp);
assert(ret);
r = find_partition(parent, pp, ret);
if (r != -ENXIO)
return r;
r = sd_event_new(&event);
if (r < 0)
return r;
r = sd_device_monitor_new(&monitor);
if (r < 0)
return r;
r = sd_device_monitor_filter_add_match_subsystem_devtype(monitor, "block", "partition");
if (r < 0)
return r;
r = sd_device_monitor_attach_event(monitor, event);
if (r < 0)
return r;
_cleanup_(wait_data_done) struct wait_data w = {
.parent_device = parent,
.blkidp = pp,
};
r = sd_device_monitor_start(monitor, device_monitor_handler, &w);
if (r < 0)
return r;
/* Check again, the partition might have appeared in the meantime */
r = find_partition(parent, pp, ret);
if (r != -ENXIO)
return r;
if (deadline != USEC_INFINITY) {
r = sd_event_add_time(
event, &timeout_source,
CLOCK_MONOTONIC, deadline, 0,
NULL, INT_TO_PTR(-ETIMEDOUT));
if (r < 0)
return r;
}
return log_debug_errno(SYNTHETIC_ERRNO(ENXIO),
"Kernel partitions dit not appear within %d attempts",
N_DEVICE_NODE_LIST_ATTEMPTS);
r = sd_event_loop(event);
if (r < 0)
return r;
assert(w.found);
*ret = TAKE_PTR(w.found);
return 0;
}
static void check_partition_flags(
@ -300,8 +368,90 @@ static void check_partition_flags(
}
}
/* Waits for 'device' to be initialized like device_wait_for_initialization() does, but does not rely
 * on the initial uevent being delivered: the wait is split into intervals, and whenever an interval
 * passes without success "change" is written to the "uevent" file below the device's syspath before
 * waiting again.
 *
 * device    - the device to wait for
 * subsystem - passed through to device_wait_for_initialization()
 * deadline  - absolute CLOCK_MONOTONIC time at which to give up, or USEC_INFINITY for no limit
 * ret       - passed through to device_wait_for_initialization()
 *
 * Returns the result of device_wait_for_initialization() if that is anything other than -ETIMEDOUT,
 * or if the overall deadline has been reached (in which case -ETIMEDOUT is passed on). Returns a
 * negative errno if the syspath cannot be determined or the retrigger write fails, and -ENOMEM if
 * the path of the uevent file cannot be allocated. */
static int device_wait_for_initialization_harder(
                sd_device *device,
                const char *subsystem,
                usec_t deadline,
                sd_device **ret) {

        _cleanup_free_ char *uevent = NULL;
        usec_t start, left, retrigger_timeout;
        int r;

        assert(device);

        start = now(CLOCK_MONOTONIC);
        left = usec_sub_unsigned(deadline, start);

        if (DEBUG_LOGGING) {
                char buf[FORMAT_TIMESPAN_MAX];
                const char *sn = NULL;

                (void) sd_device_get_sysname(device, &sn);
                log_debug("Waiting for device '%s' to initialize for %s.", strna(sn), format_timespan(buf, sizeof(buf), left, 0));
        }

        if (left != USEC_INFINITY)
                retrigger_timeout = CLAMP(left / 4, 1 * USEC_PER_SEC, 5 * USEC_PER_SEC); /* A fourth of the total timeout, but let's clamp to 1s…5s range */
        else
                retrigger_timeout = 2 * USEC_PER_SEC;

        for (;;) {
                usec_t local_deadline, n;
                bool last_try;

                n = now(CLOCK_MONOTONIC);
                assert(n >= start);

                /* Find next deadline, when we'll retrigger. This is the first multiple of the
                 * retrigger interval (counted from 'start') that lies strictly after 'n'. Merely
                 * rounding up would give a deadline equal to 'n' if no time has passed since 'start'
                 * (or exactly a multiple of the interval has), i.e. leave no time to wait at all
                 * before the device is retriggered. */
                local_deadline = start +
                        ((n - start) / retrigger_timeout + 1) * retrigger_timeout;

                if (deadline != USEC_INFINITY && deadline <= local_deadline) {
                        /* The overall deadline comes first: this is the final attempt. */
                        local_deadline = deadline;
                        last_try = true;
                } else
                        last_try = false;

                r = device_wait_for_initialization(device, subsystem, local_deadline, ret);
                if (r >= 0 && DEBUG_LOGGING) {
                        char buf[FORMAT_TIMESPAN_MAX];
                        const char *sn = NULL;

                        (void) sd_device_get_sysname(device, &sn);
                        log_debug("Successfully waited for device '%s' to initialize for %s.", strna(sn), format_timespan(buf, sizeof(buf), usec_sub_unsigned(now(CLOCK_MONOTONIC), start), 0));
                }

                /* Success, a real error, or a timeout on the final attempt: we are done. */
                if (r != -ETIMEDOUT || last_try)
                        return r;

                /* Determine the path of the uevent file once, on the first retrigger. */
                if (!uevent) {
                        const char *syspath;

                        r = sd_device_get_syspath(device, &syspath);
                        if (r < 0)
                                return r;

                        uevent = path_join(syspath, "uevent");
                        if (!uevent)
                                return -ENOMEM;
                }

                if (DEBUG_LOGGING) {
                        char buf[FORMAT_TIMESPAN_MAX];

                        log_debug("Device didn't initialize within %s, assuming lost event. Retriggering device through %s.",
                                  format_timespan(buf, sizeof(buf), usec_sub_unsigned(now(CLOCK_MONOTONIC), start), 0),
                                  uevent);
                }

                r = write_string_file(uevent, "change", WRITE_STRING_FILE_DISABLE_BUFFER);
                if (r < 0)
                        return r;
        }
}
#endif
#define DEVICE_TIMEOUT_USEC (45 * USEC_PER_SEC)
int dissect_image(
int fd,
const VeritySettings *verity,
@ -312,7 +462,6 @@ int dissect_image(
#if HAVE_BLKID
sd_id128_t root_uuid = SD_ID128_NULL, root_verity_uuid = SD_ID128_NULL,
usr_uuid = SD_ID128_NULL, usr_verity_uuid = SD_ID128_NULL;
_cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
bool is_gpt, is_mbr, generic_rw, multiple_generic = false;
_cleanup_(sd_device_unrefp) sd_device *d = NULL;
_cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
@ -321,9 +470,9 @@ int dissect_image(
sd_id128_t generic_uuid = SD_ID128_NULL;
const char *pttype = NULL;
blkid_partlist pl;
int r, generic_nr;
int r, generic_nr, n_partitions;
struct stat st;
sd_device *q;
usec_t deadline;
assert(fd >= 0);
assert(ret);
@ -370,6 +519,27 @@ int dissect_image(
if (!S_ISBLK(st.st_mode))
return -ENOTBLK;
r = sd_device_new_from_devnum(&d, 'b', st.st_rdev);
if (r < 0)
return r;
if (!FLAGS_SET(flags, DISSECT_IMAGE_NO_UDEV)) {
_cleanup_(sd_device_unrefp) sd_device *initialized = NULL;
/* If udev support is enabled, then let's wait for the device to be initialized before we do anything. */
r = device_wait_for_initialization_harder(
d,
"block",
usec_add(now(CLOCK_MONOTONIC), DEVICE_TIMEOUT_USEC),
&initialized);
if (r < 0)
return r;
sd_device_unref(d);
d = TAKE_PTR(initialized);
}
b = blkid_new_probe();
if (!b)
return -ENOMEM;
@ -399,10 +569,6 @@ int dissect_image(
if (!m)
return -ENOMEM;
r = sd_device_new_from_devnum(&d, 'b', st.st_rdev);
if (r < 0)
return r;
if ((!(flags & DISSECT_IMAGE_GPT_ONLY) &&
(flags & DISSECT_IMAGE_REQUIRE_ROOT)) ||
(flags & DISSECT_IMAGE_NO_PARTITION_TABLE)) {
@ -412,8 +578,8 @@ int dissect_image(
(void) blkid_probe_lookup_value(b, "USAGE", &usage, NULL);
if (STRPTR_IN_SET(usage, "filesystem", "crypto")) {
const char *fstype = NULL, *options = NULL, *devname = NULL;
_cleanup_free_ char *t = NULL, *n = NULL, *o = NULL;
const char *fstype = NULL, *options = NULL;
/* OK, we have found a file system, that's our root partition then. */
(void) blkid_probe_lookup_value(b, "TYPE", &fstype, NULL);
@ -424,10 +590,14 @@ int dissect_image(
return -ENOMEM;
}
r = device_path_make_major_minor(st.st_mode, st.st_rdev, &n);
r = sd_device_get_devname(d, &devname);
if (r < 0)
return r;
n = strdup(devname);
if (!n)
return -ENOMEM;
m->single_file_system = true;
m->verity = verity && verity->root_hash && verity->data_path && (verity->designator < 0 || verity->designator == PARTITION_ROOT);
m->can_verity = verity && verity->data_path;
@ -451,13 +621,7 @@ int dissect_image(
m->encrypted = streq_ptr(fstype, "crypto_LUKS");
/* Even on a single partition we need to wait for udev to create the
* /dev/block/X:Y symlink to /dev/loopZ */
r = loop_wait_for_partitions_to_appear(fd, d, 0, flags, &e);
if (r < 0)
return r;
*ret = TAKE_PTR(m);
return 0;
}
}
@ -472,48 +636,51 @@ int dissect_image(
if (!is_gpt && ((flags & DISSECT_IMAGE_GPT_ONLY) || !is_mbr))
return -ENOPKG;
/* Safety check: refuse block devices that carry a partition table but for which the kernel doesn't
* do partition scanning. */
r = blockdev_partscan_enabled(fd);
if (r < 0)
return r;
if (r == 0)
return -EPROTONOSUPPORT;
errno = 0;
pl = blkid_probe_get_partitions(b);
if (!pl)
return errno_or_else(ENOMEM);
r = loop_wait_for_partitions_to_appear(fd, d, blkid_partlist_numof_partitions(pl), flags, &e);
if (r < 0)
return r;
errno = 0;
n_partitions = blkid_partlist_numof_partitions(pl);
if (n_partitions < 0)
return errno_or_else(EIO);
FOREACH_DEVICE(e, q) {
deadline = usec_add(now(CLOCK_MONOTONIC), DEVICE_TIMEOUT_USEC);
for (int i = 0; i < n_partitions; i++) {
_cleanup_(sd_device_unrefp) sd_device *q = NULL;
unsigned long long pflags;
blkid_partition pp;
const char *node;
dev_t qn;
int nr;
r = sd_device_get_devnum(q, &qn);
errno = 0;
pp = blkid_partlist_get_partition(pl, i);
if (!pp)
return errno_or_else(EIO);
r = wait_for_partition_device(d, pp, deadline, &q);
if (r < 0)
continue;
if (st.st_rdev == qn)
continue;
if (!device_is_block(q))
continue;
if (device_is_mmc_special_partition(q))
continue;
return r;
r = sd_device_get_devname(q, &node);
if (r < 0)
continue;
pp = blkid_partlist_devno_to_partition(pl, qn);
if (!pp)
continue;
return r;
pflags = blkid_partition_get_flags(pp);
errno = 0;
nr = blkid_partition_get_partno(pp);
if (nr < 0)
continue;
return errno_or_else(EIO);
if (is_gpt) {
PartitionDesignator designator = _PARTITION_DESIGNATOR_INVALID;
@ -1643,7 +1810,7 @@ static int verity_partition(
if (r == 0) {
/* devmapper might say that the device exists, but the devlink might not yet have been
* created. Check and wait for the udev event in that case. */
r = device_wait_for_devlink(node, "block", 100 * USEC_PER_MSEC, NULL);
r = device_wait_for_devlink(node, "block", usec_add(now(CLOCK_MONOTONIC), 100 * USEC_PER_MSEC), NULL);
/* Fallback to activation with a unique device if it's taking too long */
if (r == -ETIMEDOUT)
break;

View File

@ -13,17 +13,22 @@
#include <sys/ioctl.h>
#include <unistd.h>
#include "sd-device.h"
#include "alloc-util.h"
#include "blockdev-util.h"
#include "device-util.h"
#include "errno-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "loop-util.h"
#include "missing_loop.h"
#include "parse-util.h"
#include "random-util.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "tmpfile-util.h"
static void cleanup_clear_loop_close(int *fd) {
if (*fd < 0)
@ -33,66 +38,221 @@ static void cleanup_clear_loop_close(int *fd) {
(void) safe_close(*fd);
}
static int loop_configure(int fd, const struct loop_config *c) {
/* Checks whether the loop device referenced by 'fd' currently has a backing file attached.
 * Returns > 0 if LOOP_GET_STATUS64 succeeds (bound), 0 if it fails with ENXIO (not bound), and a
 * negative errno for any other failure. */
static int loop_is_bound(int fd) {
        struct loop_info64 status;

        assert(fd >= 0);

        if (ioctl(fd, LOOP_GET_STATUS64, &status) >= 0)
                return true; /* bound! */

        return errno == ENXIO ? false /* not bound! */ : -errno;
}
/* Checks if the specified block device currently has block device children (i.e. partition block
 * devices). Returns 1 if at least one is found, 0 if none is found, -EINVAL if 'd' itself is not in
 * the "block" subsystem, and another negative errno if querying 'd' or setting up the enumerator
 * fails. Uninitialized devices are included in the enumeration. */
static int device_has_block_children(sd_device *d) {
        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *enumerator = NULL;
        const char *own_sysname, *own_subsystem;
        sd_device *child;
        int r;

        assert(d);

        r = sd_device_get_sysname(d, &own_sysname);
        if (r < 0)
                return r;

        r = sd_device_get_subsystem(d, &own_subsystem);
        if (r < 0)
                return r;
        if (!streq(own_subsystem, "block"))
                return -EINVAL;

        r = sd_device_enumerator_new(&enumerator);
        if (r < 0)
                return r;

        r = sd_device_enumerator_allow_uninitialized(enumerator);
        if (r < 0)
                return r;

        r = sd_device_enumerator_add_match_parent(enumerator, d);
        if (r < 0)
                return r;

        FOREACH_DEVICE(enumerator, child) {
                const char *subsystem, *sysname;

                /* Ignore everything that is not a block device, or whose subsystem cannot be read */
                if (sd_device_get_subsystem(child, &subsystem) < 0 || !streq(subsystem, "block"))
                        continue;

                /* Ignore entries without a readable sysname, and don't count the device itself */
                if (sd_device_get_sysname(child, &sysname) < 0 || streq(sysname, own_sysname))
                        continue;

                return 1; /* we have block device children */
        }

        return 0;
}
static int loop_configure(
int fd,
int nr,
const struct loop_config *c,
bool *try_loop_configure) {
_cleanup_(sd_device_unrefp) sd_device *d = NULL;
_cleanup_free_ char *sysname = NULL;
_cleanup_close_ int lock_fd = -1;
int r;
assert(fd >= 0);
assert(nr >= 0);
assert(c);
assert(try_loop_configure);
if (ioctl(fd, LOOP_CONFIGURE, c) < 0) {
/* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other errors. Note that
* the kernel is weird: non-existing ioctls currently return EINVAL rather than ENOTTY on
* loopback block devices. They should fix that in the kernel, but in the meantime we accept
* both here. */
if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL)
return -errno;
} else {
bool good = true;
if (asprintf(&sysname, "loop%i", nr) < 0)
return -ENOMEM;
if (c->info.lo_sizelimit != 0) {
/* Kernel 5.8 vanilla doesn't properly propagate the size limit into the block
* device. If it's used, let's immediately check if it had the desired effect
* hence. And if not use classic LOOP_SET_STATUS64. */
uint64_t z;
r = sd_device_new_from_subsystem_sysname(&d, "block", sysname);
if (r < 0)
return r;
if (ioctl(fd, BLKGETSIZE64, &z) < 0) {
r = -errno;
goto fail;
}
/* Let's lock the device before we do anything. We take the BSD lock on a second, separately opened
* fd for the device. udev after all watches for close() events (specifically IN_CLOSE_WRITE) on
* block devices to reprobe them, hence by having a separate fd we will later close() we can ensure
* we trigger udev after everything is done. If we'd lock our own fd instead and keep it open for a
* long time udev would possibly never run on it again, even though the fd is unlocked, simply
* because we never close() it. It also has the nice benefit we can use the _cleanup_close_ logic to
* automatically release the lock, after we are done. */
lock_fd = fd_reopen(fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
if (lock_fd < 0)
return lock_fd;
if (flock(lock_fd, LOCK_EX) < 0)
return -errno;
if (z != c->info.lo_sizelimit) {
log_debug("LOOP_CONFIGURE is broken, doesn't honour .lo_sizelimit. Falling back to LOOP_SET_STATUS64.");
good = false;
}
}
/* Let's see if the device is really detached, i.e. currently has no associated partition block
* devices. On various kernels (such as 5.8) it is possible to have a loopback block device that
* superficially is detached but still has partition block devices associated for it. They only go
* away when the device is reattached. (Yes, LOOP_CLR_FD doesn't work then, because officially
* nothing is attached and LOOP_CTL_REMOVE doesn't either, since it doesn't care about partition
* block devices.) */
r = device_has_block_children(d);
if (r < 0)
return r;
if (r > 0) {
r = loop_is_bound(fd);
if (r < 0)
return r;
if (r > 0)
return -EBUSY;
if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) {
/* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag into the
* block device. Let's hence verify if things work correctly here before
* returning. */
r = blockdev_partscan_enabled(fd);
if (r < 0)
goto fail;
if (r == 0) {
log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. Falling back to LOOP_SET_STATUS64.");
good = false;
}
}
if (good)
return 0;
/* Otherwise, undo the attachment and use the old APIs */
(void) ioctl(fd, LOOP_CLR_FD);
return -EUCLEAN; /* Not bound but has children? Tell caller to reattach something so that the
* partition block devices are gone too. */
}
if (*try_loop_configure) {
if (ioctl(fd, LOOP_CONFIGURE, c) < 0) {
/* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other
* errors. Note that the kernel is weird: non-existing ioctls currently return EINVAL
* rather than ENOTTY on loopback block devices. They should fix that in the kernel,
* but in the meantime we accept both here. */
if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL)
return -errno;
*try_loop_configure = false;
} else {
bool good = true;
if (c->info.lo_sizelimit != 0) {
/* Kernel 5.8 vanilla doesn't properly propagate the size limit into the
* block device. If it's used, let's immediately check if it had the desired
* effect hence. And if not use classic LOOP_SET_STATUS64. */
uint64_t z;
if (ioctl(fd, BLKGETSIZE64, &z) < 0) {
r = -errno;
goto fail;
}
if (z != c->info.lo_sizelimit) {
log_debug("LOOP_CONFIGURE is broken, doesn't honour .lo_sizelimit. Falling back to LOOP_SET_STATUS64.");
good = false;
}
}
if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) {
/* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag
* into the block device. Let's hence verify if things work correctly here
* before returning. */
r = blockdev_partscan_enabled(fd);
if (r < 0)
goto fail;
if (r == 0) {
log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. Falling back to LOOP_SET_STATUS64.");
good = false;
}
}
if (!good) {
/* LOOP_CONFIGURE doesn't work. Remember that. */
*try_loop_configure = false;
/* We return EBUSY here instead of retrying immediately with LOOP_SET_FD,
* because LOOP_CLR_FD is async: if the operation cannot be executed right
* away it just sets the autoclear flag on the device. This means there's a
* good chance we cannot actually reuse the loopback device right-away. Hence
* let's assume it's busy, avoid the trouble and let the calling loop call us
* again with a new, likely unused device. */
r = -EBUSY;
goto fail;
}
return 0;
}
}
/* Since kernel commit 5db470e229e22b7eda6e23b5566e532c96fb5bc3 (kernel v5.0) the LOOP_SET_STATUS64
* ioctl can return EAGAIN in case we change the lo_offset field, if someone else is accessing the
* block device while we try to reconfigure it. This is a pretty common case, since udev might
* instantly start probing the device as soon as we attach an fd to it. Hence handle it in two ways:
* first, let's take the BSD lock that ensures that udev will not step in between the point in
* time where we attach the fd and where we reconfigure the device. Secondly, let's wait 50ms on
* EAGAIN and retry. The former should be an efficient mechanism to avoid we have to wait 50ms
* needlessly if we are just racing against udev. The latter is protection against all other cases,
* i.e. peers that do not take the BSD lock. */
if (ioctl(fd, LOOP_SET_FD, c->fd) < 0)
return -errno;
if (ioctl(fd, LOOP_SET_STATUS64, &c->info) < 0) {
r = -errno;
goto fail;
for (unsigned n_attempts = 0;;) {
if (ioctl(fd, LOOP_SET_STATUS64, &c->info) >= 0)
break;
if (errno != EAGAIN || ++n_attempts >= 64) {
r = log_debug_errno(errno, "Failed to configure loopback device: %m");
goto fail;
}
/* Sleep some random time, but at least 10ms, at most 250ms. Increase the delay the more
* failed attempts we see */
(void) usleep(UINT64_C(10) * USEC_PER_MSEC +
random_u64() % (UINT64_C(240) * USEC_PER_MSEC * n_attempts/64));
}
return 0;
@ -102,6 +262,44 @@ fail:
return r;
}
static int attach_empty_file(int loop, int nr) {
_cleanup_close_ int fd = -1;
/* So here's the thing: on various kernels (5.8 at least) loop block devices might enter a state
* where they are detached but nonetheless have partitions, when used heavily. Accessing these
* partitions results in immediatey IO errors. There's no pretty way to get rid of them
* again. Neither LOOP_CLR_FD nor LOOP_CTL_REMOVE suffice (see above). What does work is to
* reassociate them with a new fd however. This is what we do here hence: we associate the devices
* with an empty file (i.e. an image that definitely has no partitons). We then immediately clear it
* again. This suffices to make the partitions go away. Ugly but appears to work. */
log_debug("Found unattached loopback block device /dev/loop%i with partitions. Attaching empty file to remove them.", nr);
fd = open_tmpfile_unlinkable(NULL, O_RDONLY);
if (fd < 0)
return fd;
if (flock(loop, LOCK_EX) < 0)
return -errno;
if (ioctl(loop, LOOP_SET_FD, fd) < 0)
return -errno;
if (ioctl(loop, LOOP_SET_STATUS64, &(struct loop_info64) {
.lo_flags = LO_FLAGS_READ_ONLY|
LO_FLAGS_AUTOCLEAR|
LO_FLAGS_PARTSCAN, /* enable partscan, so that the partitions really go away */
}) < 0)
return -errno;
if (ioctl(loop, LOOP_CLR_FD) < 0)
return -errno;
/* The caller is expected to immediately close the loopback device after this, so that the BSD lock
* is released, and udev sees the changes. */
return 0;
}
int loop_device_make(
int fd,
int open_flags,
@ -111,6 +309,7 @@ int loop_device_make(
LoopDevice **ret) {
_cleanup_free_ char *loopdev = NULL;
bool try_loop_configure = true;
struct loop_config config;
LoopDevice *d = NULL;
struct stat st;
@ -201,12 +400,17 @@ int loop_device_make(
if (!IN_SET(errno, ENOENT, ENXIO))
return -errno;
} else {
r = loop_configure(loop, &config);
r = loop_configure(loop, nr, &config, &try_loop_configure);
if (r >= 0) {
loop_with_fd = TAKE_FD(loop);
break;
}
if (r != -EBUSY)
if (r == -EUCLEAN) {
/* Make left-over partition disappear hack (see above) */
r = attach_empty_file(loop, nr);
if (r < 0 && r != -EBUSY)
return r;
} else if (r != -EBUSY)
return r;
}
@ -214,6 +418,11 @@ int loop_device_make(
return -EBUSY;
loopdev = mfree(loopdev);
/* Wait some random time, to make collision less likely. Let's pick a random time in the
* range 0ms…250ms, linearly scaled by the number of failed attempts. */
(void) usleep(random_u64() % (UINT64_C(10) * USEC_PER_MSEC +
UINT64_C(240) * USEC_PER_MSEC * n_attempts/64));
}
d = new(LoopDevice, 1);

View File

@ -89,10 +89,11 @@ int mount_option_mangle(
int mode_to_inaccessible_node(const char *runtime_dir, mode_t mode, char **dest);
/* Useful for usage with _cleanup_(), unmounts, removes a directory and frees the pointer */
static inline void umount_and_rmdir_and_free(char *p) {
static inline char* umount_and_rmdir_and_free(char *p) {
/* NOTE(review): PROTECT_ERRNO presumably saves errno here and restores it on return, so that the
 * best-effort calls below do not clobber the caller's errno — confirm against its definition. */
PROTECT_ERRNO;
/* Failures of the unmount and of the directory removal are deliberately ignored. */
(void) umount_recursive(p, 0);
(void) rmdir(p);
free(p);
/* Always returns NULL, so that callers can reset their pointer in the same statement, i.e.
 * "p = umount_and_rmdir_and_free(p);". */
return NULL;
}
DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_rmdir_and_free);

View File

@ -195,8 +195,9 @@ static int device_wait_for_initialization_internal(
sd_device *_device,
const char *devlink,
const char *subsystem,
usec_t timeout,
usec_t deadline,
sd_device **ret) {
_cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL;
_cleanup_(sd_event_source_unrefp) sd_event_source *timeout_source = NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
@ -256,10 +257,10 @@ static int device_wait_for_initialization_internal(
if (r < 0)
return log_error_errno(r, "Failed to start device monitor: %m");
if (timeout != USEC_INFINITY) {
r = sd_event_add_time_relative(
if (deadline != USEC_INFINITY) {
r = sd_event_add_time(
event, &timeout_source,
CLOCK_MONOTONIC, timeout, 0,
CLOCK_MONOTONIC, deadline, 0,
NULL, INT_TO_PTR(-ETIMEDOUT));
if (r < 0)
return log_error_errno(r, "Failed to add timeout event source: %m");
@ -287,12 +288,12 @@ static int device_wait_for_initialization_internal(
return 0;
}
int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t timeout, sd_device **ret) {
return device_wait_for_initialization_internal(device, NULL, subsystem, timeout, ret);
int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t deadline, sd_device **ret) {
/* Variant that takes an sd_device object: forwards to the shared implementation with no devlink
 * path. 'deadline' is passed through unchanged. The shared implementation arms its timeout via
 * sd_event_add_time() on CLOCK_MONOTONIC with this value, hence it is an absolute point in time
 * rather than a duration; with USEC_INFINITY no timeout is installed. */
return device_wait_for_initialization_internal(device, NULL, subsystem, deadline, ret);
}
int device_wait_for_devlink(const char *devlink, const char *subsystem, usec_t timeout, sd_device **ret) {
return device_wait_for_initialization_internal(NULL, devlink, subsystem, timeout, ret);
int device_wait_for_devlink(const char *devlink, const char *subsystem, usec_t deadline, sd_device **ret) {
/* Variant that takes a device node symlink path: forwards to the shared implementation with no
 * sd_device object. 'deadline' is passed through unchanged. The shared implementation arms its
 * timeout via sd_event_add_time() on CLOCK_MONOTONIC with this value, hence it is an absolute
 * point in time rather than a duration; with USEC_INFINITY no timeout is installed. */
return device_wait_for_initialization_internal(NULL, devlink, subsystem, deadline, ret);
}
int device_is_renaming(sd_device *dev) {

View File

@ -28,7 +28,7 @@ static inline int udev_parse_config(void) {
return udev_parse_config_full(NULL, NULL, NULL, NULL, NULL);
}
int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t timeout, sd_device **ret);
int device_wait_for_devlink(const char *path, const char *subsystem, usec_t timeout, sd_device **ret);
int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t deadline, sd_device **ret);
int device_wait_for_devlink(const char *path, const char *subsystem, usec_t deadline, sd_device **ret);
int device_is_renaming(sd_device *dev);
bool device_for_action(sd_device *dev, DeviceAction action);

View File

@ -433,6 +433,17 @@ tests += [
[],
[]],
[['src/test/test-loop-block.c'],
[libcore,
libshared],
[threads,
libblkid],
'',
'',
[],
includes,
false],
[['src/test/test-selinux.c'],
[],
[]],

250
src/test/test-loop-block.c Normal file
View File

@ -0,0 +1,250 @@
/* SPDX-License-Identifier: LGPL-2.1+ */
#include <fcntl.h>
#include <linux/loop.h>
#include <pthread.h>
#include "alloc-util.h"
#include "dissect-image.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "gpt.h"
#include "missing_loop.h"
#include "mkfs-util.h"
#include "mount-util.h"
#include "namespace-util.h"
#include "string-util.h"
#include "strv.h"
#include "tests.h"
#include "tmpfile-util.h"
#include "user-util.h"
#include "virt.h"
#define N_THREADS 5
#define N_ITERATIONS 3
static usec_t end = 0;
/* Body of one worker thread. 'ptr' carries the file descriptor of the disk image (decoded with
 * PTR_TO_FD). Up to N_ITERATIONS times the image is attached to a loop device, dissected, checked
 * for the expected partitions, mounted, and then torn down again. Any failure aborts via
 * assert_se(). The loop ends early once the file-level 'end' timestamp has been reached. Always
 * returns NULL. */
static void* thread_func(void *ptr) {
int fd = PTR_TO_FD(ptr);
int r;
for (unsigned i = 0; i < N_ITERATIONS; i++) {
/* Everything acquired during one iteration is released automatically when the iteration ends,
 * unless it has been released explicitly (and reset to NULL) further down. */
_cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
_cleanup_(umount_and_rmdir_and_freep) char *mounted = NULL;
_cleanup_(dissected_image_unrefp) DissectedImage *dissected = NULL;
/* 'end' is compared against CLOCK_MONOTONIC; it is assigned outside of this function. */
if (now(CLOCK_MONOTONIC) >= end) {
log_notice("Time's up, exiting thread's loop");
break;
}
log_notice("> Thread iteration #%u.", i);
/* Temporary directory that serves as mount point for this iteration */
assert_se(mkdtemp_malloc(NULL, &mounted) >= 0);
/* Attach the image to a loop device, with partition scanning requested */
r = loop_device_make(fd, O_RDONLY, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, &loop);
if (r < 0)
log_error_errno(r, "Failed to allocate loopback device: %m");
assert_se(r >= 0);
log_notice("Acquired loop device %s, will mount on %s", loop->node, mounted);
r = dissect_image(loop->fd, NULL, NULL, DISSECT_IMAGE_READ_ONLY, &dissected);
if (r < 0)
log_error_errno(r, "Failed dissect loopback device %s: %m", loop->node);
assert_se(r >= 0);
log_info("Dissected loop device %s", loop->node);
/* Log every partition the dissection logic recognized */
for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) {
if (!dissected->partitions[d].found)
continue;
log_notice("Found node %s fstype %s designator %s",
dissected->partitions[d].node,
dissected->partitions[d].fstype,
partition_designator_to_string(d));
}
/* The ESP, XBOOTLDR, root and home partitions must all have been found, each with a device node. */
assert_se(dissected->partitions[PARTITION_ESP].found);
assert_se(dissected->partitions[PARTITION_ESP].node);
assert_se(dissected->partitions[PARTITION_XBOOTLDR].found);
assert_se(dissected->partitions[PARTITION_XBOOTLDR].node);
assert_se(dissected->partitions[PARTITION_ROOT].found);
assert_se(dissected->partitions[PARTITION_ROOT].node);
assert_se(dissected->partitions[PARTITION_HOME].found);
assert_se(dissected->partitions[PARTITION_HOME].node);
r = dissected_image_mount(dissected, mounted, UID_INVALID, DISSECT_IMAGE_READ_ONLY);
/* Note that this is logged regardless of whether mounting succeeded; failure is caught right after. */
log_notice_errno(r, "Mounted %s → %s: %m", loop->node, mounted);
assert_se(r >= 0);
/* Explicit teardown, in this order: unmount, drop the dissection result, detach the loop device. */
log_notice("Unmounting %s", mounted);
mounted = umount_and_rmdir_and_free(mounted);
log_notice("Unmounted.");
dissected = dissected_image_unref(dissected);
log_notice("Detaching loop device %s", loop->node);
loop = loop_device_unref(loop);
log_notice("Detached loop device.");
}
log_notice("Leaving thread");
return NULL;
}
/* Reports whether the GPT_ROOT_NATIVE macro is defined in this build, i.e. whether a native root
 * partition GPT type is available at compile time. */
static bool have_root_gpt_type(void) {
#if defined(GPT_ROOT_NATIVE)
        const bool available = true;
#else
        const bool available = false;
#endif

        return available;
}
/* Entry point of the loopback block device stress test.
 *
 * Creates a 256M image file below /var/tmp, writes a GPT partition table to it via sfdisk, formats
 * the ESP, XBOOTLDR, root and home partitions, and mounts the image once writable. Then N_THREADS
 * threads running thread_func() are started on the image's file descriptor and joined again; the
 * deadline they observe is stored in the file-level variable 'end'.
 *
 * Returns EXIT_TEST_SKIP if a prerequisite is missing and 0 on success. Every failed check aborts
 * the program through assert_se(). */
int main(int argc, char *argv[]) {
        _cleanup_free_ char *p = NULL, *cmd = NULL;
        _cleanup_(pclosep) FILE *sfdisk = NULL;
        _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
        _cleanup_close_ int fd = -1;
        _cleanup_(dissected_image_unrefp) DissectedImage *dissected = NULL;
        _cleanup_(umount_and_rmdir_and_freep) char *mounted = NULL;
        pthread_t threads[N_THREADS];
        const char *fs;
        sd_id128_t id;
        int r;

        test_setup_logging(LOG_DEBUG);
        log_show_tid(true);
        log_show_time(true);

        if (!have_root_gpt_type()) {
                log_tests_skipped("No root partition GPT defined for this architecture, exiting.");
                return EXIT_TEST_SKIP;
        }

        if (detect_container() > 0) {
                log_tests_skipped("Test not supported in a container, requires udev/uevent notifications.");
                return EXIT_TEST_SKIP;
        }

        if (strstr_ptr(ci_environment(), "autopkgtest")) {
                // FIXME: we should reenable this one day
                log_tests_skipped("Skipping test on Ubuntu autopkgtest CI, test too slow and installed udev too flakey.");
                return EXIT_TEST_SKIP;
        }

        /* This is a test for the loopback block device setup code and its use by the image dissection
         * logic: since the kernel APIs are hard to use and prone to races, let's test this in a heavy
         * duty test: we open a bunch of threads and repeatedly allocate and deallocate loopback block
         * devices in them in parallel, with an image file with a number of partitions. */

        r = detach_mount_namespace();
        if (ERRNO_IS_PRIVILEGE(r)) {
                log_tests_skipped("Lacking privileges");
                return EXIT_TEST_SKIP;
        }
        /* Any failure other than a lack of privileges is fatal. Check it right here, because 'r' is
         * reused by the mkfs probing below. */
        assert_se(r >= 0);

        FOREACH_STRING(fs, "vfat", "ext4") {
                r = mkfs_exists(fs);
                assert_se(r >= 0);
                if (!r) {
                        log_tests_skipped("mkfs.{vfat|ext4} not installed");
                        return EXIT_TEST_SKIP;
                }
        }

        assert_se(tempfn_random_child("/var/tmp", "sfdisk", &p) >= 0);
        fd = open(p, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC|O_NOFOLLOW, 0666);
        assert_se(fd >= 0);
        assert_se(ftruncate(fd, 256*1024*1024) >= 0);

        assert_se(cmd = strjoin("sfdisk ", p));
        assert_se(sfdisk = popen(cmd, "we"));

        /* A reasonably complex partition table (five partitions of 32M each) that fits into the 256M
         * image file created above */
        fputs("label: gpt\n"
              "size=32M, type=C12A7328-F81F-11D2-BA4B-00A0C93EC93B\n"
              "size=32M, type=BC13C2FF-59E6-4262-A352-B275FD6F7172\n"
              "size=32M, type=0657FD6D-A4AB-43C4-84E5-0933C84B4F4F\n"
              "size=32M, type=", sfdisk);

#ifdef GPT_ROOT_NATIVE
        fprintf(sfdisk, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(GPT_ROOT_NATIVE));
#else
        fprintf(sfdisk, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(GPT_ROOT_X86_64));
#endif

        fputs("\n"
              "size=32M, type=933AC7E1-2EB4-4F13-B844-0E14E2AEF915\n", sfdisk);

        /* Close the pipe explicitly so that sfdisk's exit status can be checked; reset the pointer
         * so that the cleanup handler does not close it a second time. */
        assert_se(pclose(sfdisk) == 0);
        sfdisk = NULL;

        assert_se(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, &loop) >= 0);
        assert_se(dissect_image(loop->fd, NULL, NULL, 0, &dissected) >= 0);

        assert_se(dissected->partitions[PARTITION_ESP].found);
        assert_se(dissected->partitions[PARTITION_ESP].node);
        assert_se(dissected->partitions[PARTITION_XBOOTLDR].found);
        assert_se(dissected->partitions[PARTITION_XBOOTLDR].node);
        assert_se(dissected->partitions[PARTITION_ROOT].found);
        assert_se(dissected->partitions[PARTITION_ROOT].node);
        assert_se(dissected->partitions[PARTITION_HOME].found);
        assert_se(dissected->partitions[PARTITION_HOME].node);

        /* Put a file system with a fresh random ID on each of the four partitions */
        assert_se(sd_id128_randomize(&id) >= 0);
        assert_se(make_filesystem(dissected->partitions[PARTITION_ESP].node, "vfat", "EFI", id, true) >= 0);

        assert_se(sd_id128_randomize(&id) >= 0);
        assert_se(make_filesystem(dissected->partitions[PARTITION_XBOOTLDR].node, "vfat", "xbootldr", id, true) >= 0);

        assert_se(sd_id128_randomize(&id) >= 0);
        assert_se(make_filesystem(dissected->partitions[PARTITION_ROOT].node, "ext4", "root", id, true) >= 0);

        assert_se(sd_id128_randomize(&id) >= 0);
        assert_se(make_filesystem(dissected->partitions[PARTITION_HOME].node, "ext4", "home", id, true) >= 0);

        /* Dissect the image a second time, now that the file systems have been created */
        dissected = dissected_image_unref(dissected);
        assert_se(dissect_image(loop->fd, NULL, NULL, 0, &dissected) >= 0);

        assert_se(mkdtemp_malloc(NULL, &mounted) >= 0);

        /* This first (writable) mount will initialize the mount point dirs, so that the subsequent read-only ones can work */
        assert_se(dissected_image_mount(dissected, mounted, UID_INVALID, 0) >= 0);

        assert_se(umount_recursive(mounted, 0) >= 0);
        loop = loop_device_unref(loop);

        log_notice("Threads are being started now");

        /* Let the threads run for 5s if slow tests are enabled, and for 1s otherwise */
        end = usec_add(now(CLOCK_MONOTONIC),
                       slow_tests_enabled() ? 5 * USEC_PER_SEC :
                       1 * USEC_PER_SEC);

        for (unsigned i = 0; i < N_THREADS; i++)
                assert_se(pthread_create(threads + i, NULL, thread_func, FD_TO_PTR(fd)) == 0);

        log_notice("All threads started now.");

        for (unsigned i = 0; i < N_THREADS; i++) {
                void *k;

                log_notice("Joining thread #%u.", i);

                assert_se(pthread_join(threads[i], &k) == 0);
                assert_se(k == NULL);

                log_notice("Joined thread #%u.", i);
        }

        log_notice("Threads are all terminated now.");

        /* Remove the image file again, so that repeated runs do not pile up files in /var/tmp.
         * Best effort only, hence the result is ignored. */
        (void) unlink(p);

        return 0;
}

View File

@ -493,7 +493,11 @@ int info_main(int argc, char *argv[], void *userdata) {
if (arg_wait_for_initialization_timeout > 0) {
sd_device *d;
r = device_wait_for_initialization(device, NULL, arg_wait_for_initialization_timeout, &d);
r = device_wait_for_initialization(
device,
NULL,
usec_add(now(CLOCK_MONOTONIC), arg_wait_for_initialization_timeout),
&d);
if (r < 0)
return r;