From f7c9f4a2a9bdb476b6b0e0e88291fa8d5e9fc0de Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 4 Oct 2017 13:48:51 +0200 Subject: [PATCH 1/6] btrfs-util: when opening subvolume fds, always set O_NOFOLLOW Some of the btrfs utility functions already used O_NOFOLLOW others didn't. Let's streamline this, and refuse operation when we are called for symlinks on "remove" and "snapshot" too. In particular in the "remove" case following symlinks is a bad idea, and is quite different from how unlink() and friends work, which always remove the symlink, and not the destination, a logic we should follow here too. --- src/basic/btrfs-util.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/basic/btrfs-util.c b/src/basic/btrfs-util.c index c2061addd1..783ff13a36 100644 --- a/src/basic/btrfs-util.c +++ b/src/basic/btrfs-util.c @@ -1212,7 +1212,7 @@ static int subvol_remove_children(int fd, const char *subvolume, uint64_t subvol if (!S_ISDIR(st.st_mode)) return -EINVAL; - subvol_fd = openat(fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + subvol_fd = openat(fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); if (subvol_fd < 0) return -errno; @@ -1292,7 +1292,7 @@ static int subvol_remove_children(int fd, const char *subvolume, uint64_t subvol * hence we need to open the * containing directory first */ - child_fd = openat(subvol_fd, ino_args.name, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + child_fd = openat(subvol_fd, ino_args.name, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); if (child_fd < 0) return -errno; @@ -1641,7 +1641,7 @@ static int subvol_snapshot_children(int old_fd, int new_fd, const char *subvolum if (!c) return -ENOMEM; - old_child_fd = openat(old_fd, c, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + old_child_fd = openat(old_fd, c, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); if (old_child_fd < 0) return -errno; @@ -1649,7 +1649,7 @@ static int subvol_snapshot_children(int old_fd, int new_fd, const char *subvolum if (!np) return -ENOMEM; - new_child_fd = openat(new_fd, np, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + new_child_fd = openat(new_fd, np, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); if (new_child_fd < 0) return -errno; @@ -1660,7 +1660,7 @@ static int subvol_snapshot_children(int old_fd, int new_fd, const char *subvolum * into place. */ if (subvolume_fd < 0) { - subvolume_fd = openat(new_fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY); + subvolume_fd = openat(new_fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW); if (subvolume_fd < 0) return -errno; } From 8c4a8ea2ac2569eb9f376ad17f8baea3e836b8ba Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 4 Oct 2017 17:34:03 +0200 Subject: [PATCH 2/6] fs-util: small tweak in chase_symlinks() If we follow an absolute symlink there's no need to prefix the path with a "/", since by definition it already has one. This helps suppressing double "/" in resolved paths containing absolute symlinks. --- src/basic/fs-util.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/basic/fs-util.c b/src/basic/fs-util.c index b90f343ed3..946f779a32 100644 --- a/src/basic/fs-util.c +++ b/src/basic/fs-util.c @@ -766,12 +766,11 @@ int chase_symlinks(const char *path, const char *original_root, unsigned flags, return -ENOMEM; } - } - - /* Prefix what's left to do with what we just read, and start the loop again, - * but remain in the current directory. */ - - joined = strjoin("/", destination, todo); + /* Prefix what's left to do with what we just read, and start the loop again, but + * remain in the current directory. */ + joined = strjoin(destination, todo); + } else + joined = strjoin("/", destination, todo); if (!joined) return -ENOMEM; From eb38edce88ac2f511ed9593a1c25c58a77158219 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 4 Oct 2017 17:36:58 +0200 Subject: [PATCH 3/6] machine-image: add partial discovery of block devices as images This adds some basic discovery of block device images for nspawn and friends. Note that this doesn't add searching for block devices using udev, but instead expects users to symlink relevant block devices into /var/lib/machines. Discovery is hence done exactly like for dir/subvol/raw file images, except that what is found may be a (symlink to) a block device. For now, we do not support cloning these images, but removal, renaming and read-only flags are supported to the point where that makes sense. Fixe: #6990 --- src/machine/image-dbus.c | 1 + src/nspawn/nspawn.c | 2 +- src/shared/machine-image.c | 118 ++++++++++++++++++++++++++++++++++--- src/shared/machine-image.h | 1 + 4 files changed, 114 insertions(+), 8 deletions(-) diff --git a/src/machine/image-dbus.c b/src/machine/image-dbus.c index 18e0e34896..e534470feb 100644 --- a/src/machine/image-dbus.c +++ b/src/machine/image-dbus.c @@ -431,6 +431,7 @@ int bus_image_method_get_os_release( break; case IMAGE_RAW: + case IMAGE_BLOCK: r = raw_image_get_os_release(image, &v, error); break; diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 4e3803be82..d1e38556c8 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2089,7 +2089,7 @@ static int determine_names(void) { return -ENOENT; } - if (i->type == IMAGE_RAW) + if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK)) r = free_and_strdup(&arg_image, i->path); else r = free_and_strdup(&arg_directory, i->path); diff --git a/src/shared/machine-image.c b/src/shared/machine-image.c index 859e5ffc1a..a8af1b73eb 100644 --- a/src/shared/machine-image.c +++ b/src/shared/machine-image.c @@ -171,9 +171,8 @@ static int image_make( assert(filename); - /* We explicitly *do* follow symlinks here, since we want to - * allow symlinking trees into /var/lib/machines/, and treat - * them normally. */ + /* We explicitly *do* follow symlinks here, since we want to allow symlinking trees, raw files and block + * devices into /var/lib/machines/, and treat them normally. */ if (fstatat(dfd, filename, &st, 0) < 0) return -errno; @@ -286,6 +285,58 @@ static int image_make( (*ret)->limit = (*ret)->limit_exclusive = st.st_size; return 1; + + } else if (S_ISBLK(st.st_mode)) { + _cleanup_close_ int block_fd = -1; + uint64_t size = UINT64_MAX; + + /* A block device */ + + if (!ret) + return 1; + + if (!pretty) + pretty = filename; + + block_fd = openat(dfd, filename, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY); + if (block_fd < 0) + log_debug_errno(errno, "Failed to open block device %s/%s, ignoring: %m", path, filename); + else { + if (fstat(block_fd, &st) < 0) + return -errno; + if (!S_ISBLK(st.st_mode)) /* Verify that what we opened is actually what we think it is */ + return -ENOTTY; + + if (!read_only) { + int state = 0; + + if (ioctl(block_fd, BLKROGET, &state) < 0) + log_debug_errno(errno, "Failed to issue BLKROGET on device %s/%s, ignoring: %m", path, filename); + else if (state) + read_only = true; + } + + if (ioctl(block_fd, BLKGETSIZE64, &size) < 0) + log_debug_errno(errno, "Failed to issue BLKFLSBUF on device %s/%s, ignoring: %m", path, filename); + + block_fd = safe_close(block_fd); + } + + r = image_new(IMAGE_BLOCK, + pretty, + path, + filename, + !(st.st_mode & 0222) || read_only, + 0, + 0, + ret); + if (r < 0) + return r; + + if (size != 0 && size != UINT64_MAX) + (*ret)->usage = (*ret)->usage_exclusive = (*ret)->limit = (*ret)->limit_exclusive = size; + + return 1; } return 0; @@ -446,6 +497,17 @@ int image_remove(Image *i) { break; + case IMAGE_BLOCK: + + /* If this is inside of /dev, then it's a real block device, hence let's not touch the device node + * itself (but let's remove the stuff stored alongside it). If it's anywhere else, let's try to unlink + * the thing (it's most likely a symlink after all). */ + + if (path_startswith(i->path, "/dev")) + break; + + /* fallthrough */ + case IMAGE_RAW: if (unlink(i->path) < 0) return -errno; @@ -536,6 +598,15 @@ int image_rename(Image *i, const char *new_name) { new_path = file_in_same_dir(i->path, new_name); break; + case IMAGE_BLOCK: + + /* Refuse renaming raw block devices in /dev, the names are picked by udev after all. */ + if (path_startswith(i->path, "/dev")) + return -EROFS; + + new_path = file_in_same_dir(i->path, new_name); + break; + case IMAGE_RAW: { const char *fn; @@ -659,6 +730,7 @@ int image_clone(Image *i, const char *new_name, bool read_only) { r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, FS_NOCOW_FL, COPY_REFLINK); break; + case IMAGE_BLOCK: default: return -EOPNOTSUPP; } @@ -737,6 +809,26 @@ int image_read_only(Image *i, bool b) { break; } + case IMAGE_BLOCK: { + _cleanup_close_ int fd = -1; + struct stat st; + int state = b; + + fd = open(i->path, O_CLOEXEC|O_RDONLY|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return -errno; + + if (fstat(fd, &st) < 0) + return -errno; + if (!S_ISBLK(st.st_mode)) + return -ENOTTY; + + if (ioctl(fd, BLKROSET, &state) < 0) + return -errno; + + break; + } + default: return -EOPNOTSUPP; } @@ -772,13 +864,24 @@ int image_path_lock(const char *path, int operation, LockFile *global, LockFile return -EBUSY; if (stat(path, &st) >= 0) { - if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0) + if (S_ISBLK(st.st_mode)) + r = asprintf(&p, "/run/systemd/nspawn/locks/block-%u:%u", major(st.st_rdev), minor(st.st_rdev)); + else if (S_ISDIR(st.st_mode) || S_ISREG(st.st_mode)) + r = asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino); + else + return -ENOTTY; + + if (r < 0) return -ENOMEM; } - r = make_lock_file_for(path, operation, &t); - if (r < 0) - return r; + /* For block devices we don't need the "local" lock, as the major/minor lock above should be sufficient, since + * block devices are device local anyway. */ + if (!path_startswith(path, "/dev")) { + r = make_lock_file_for(path, operation, &t); + if (r < 0) + return r; + } if (p) { mkdir_p("/run/systemd/nspawn/locks", 0700); @@ -860,6 +963,7 @@ static const char* const image_type_table[_IMAGE_TYPE_MAX] = { [IMAGE_DIRECTORY] = "directory", [IMAGE_SUBVOLUME] = "subvolume", [IMAGE_RAW] = "raw", + [IMAGE_BLOCK] = "block", }; DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType); diff --git a/src/shared/machine-image.h b/src/shared/machine-image.h index 7410168c4f..50d89e4392 100644 --- a/src/shared/machine-image.h +++ b/src/shared/machine-image.h @@ -33,6 +33,7 @@ typedef enum ImageType { IMAGE_DIRECTORY, IMAGE_SUBVOLUME, IMAGE_RAW, + IMAGE_BLOCK, _IMAGE_TYPE_MAX, _IMAGE_TYPE_INVALID = -1 } ImageType; From 759aaedc5ce8fb42c99fbccb374f35f063194a4c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 4 Oct 2017 17:40:46 +0200 Subject: [PATCH 4/6] dissect: when we invoke dissection on a loop device with partscan help the user This adds some simply detection logic for cases where dissection is invoked on an externally created loop device, and partitions have been detected on it, but partition scanning so far was off. If this is detected we now print a brief message indicating what the issue is, instead of failing with a useless EINVAL message the kernel passed to us. --- src/dissect/dissect.c | 4 ++++ src/nspawn/nspawn.c | 4 ++++ src/shared/dissect-image.c | 31 ++++++++++++++++++++++--------- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/dissect/dissect.c b/src/dissect/dissect.c index 06564e94b1..e5264d09df 100644 --- a/src/dissect/dissect.c +++ b/src/dissect/dissect.c @@ -216,6 +216,10 @@ int main(int argc, char *argv[]) { log_error_errno(r, "No suitable root partition found in image %s.", arg_image); goto finish; } + if (r == -EPROTONOSUPPORT) { + log_error_errno(r, "Device %s is loopback block device with partition scanning turned off, please turn it on.", arg_image); + goto finish; + } if (r < 0) { log_error_errno(r, "Failed to dissect image: %m"); goto finish; diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index d1e38556c8..cbe98a3ff8 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -3883,6 +3883,10 @@ int main(int argc, char *argv[]) { log_error_errno(r, "--image= is not supported, compiled without blkid support."); goto finish; } + if (r == -EPROTONOSUPPORT) { + log_error_errno(r, "Device is loopback block device with partition scanning turned off, please turn it on."); + goto finish; + } if (r < 0) { log_error_errno(r, "Failed to dissect image: %m"); goto finish; diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index 4b59f2ca7d..de9510df63 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -266,18 +266,34 @@ int dissect_image(int fd, const void *root_hash, size_t root_hash_size, DissectI return -EIO; } if (n < z + 1) { - unsigned j; + unsigned j = 0; /* The kernel has probed fewer partitions than blkid? Maybe the kernel prober is still running * or it got EBUSY because udev already opened the device. Let's reprobe the device, which is a * synchronous call that waits until probing is complete. */ - for (j = 0; j < 20; j++) { + for (;;) { + if (j++ > 20) + return -EBUSY; - r = ioctl(fd, BLKRRPART, 0); - if (r < 0) + if (ioctl(fd, BLKRRPART, 0) < 0) { r = -errno; - if (r >= 0 || r != -EBUSY) + + if (r == -EINVAL) { + struct loop_info64 info; + + /* If we are running on a loop device that has partition scanning off, + * return an explicit recognizable error about this, so that callers + * can generate a proper message explaining the situation. */ + + if (ioctl(fd, LOOP_GET_STATUS64, &info) >= 0 && (info.lo_flags & LO_FLAGS_PARTSCAN) == 0) { + log_debug("Device is loop device and partition scanning is off!"); + return -EPROTONOSUPPORT; + } + } + if (r != -EBUSY) + return r; + } else break; /* If something else has the device open, such as an udev rule, the ioctl will return @@ -286,11 +302,8 @@ int dissect_image(int fd, const void *root_hash, size_t root_hash_size, DissectI * * This is really something they should fix in the kernel! */ - usleep(50 * USEC_PER_MSEC); + (void) usleep(50 * USEC_PER_MSEC); } - - if (r < 0) - return r; } e = udev_enumerate_unref(e); From 9fb0b9c70d8cf04ae000a3431a2ccf9e8b8b798c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 4 Oct 2017 17:42:23 +0200 Subject: [PATCH 5/6] machine-image: handle nicely if the user asks us to remove a symlinked image Much like for dirs/raw images lets remove the symlink and not the destination. --- src/shared/machine-image.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/shared/machine-image.c b/src/shared/machine-image.c index a8af1b73eb..24209fa35d 100644 --- a/src/shared/machine-image.c +++ b/src/shared/machine-image.c @@ -483,9 +483,15 @@ int image_remove(Image *i) { switch (i->type) { case IMAGE_SUBVOLUME: - r = btrfs_subvol_remove(i->path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA); - if (r < 0) - return r; + + /* Let's unlink first, maybe it is a symlink? If that works we are happy. Otherwise, let's get out the + * big guns */ + if (unlink(i->path) < 0) { + r = btrfs_subvol_remove(i->path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA); + if (r < 0) + return r; + } + break; case IMAGE_DIRECTORY: From 3992bce17f2e7ae9eb5ebe2c97cd48a0606a0c79 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 4 Oct 2017 17:44:14 +0200 Subject: [PATCH 6/6] update TODO --- TODO | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/TODO b/TODO index e1771c802f..da6d01e54b 100644 --- a/TODO +++ b/TODO @@ -35,6 +35,10 @@ Features: the quota of a the user indicated in User= via unit file settings, like the other resource management concepts. Would mix nicely with DynamicUser=1 +* add dissect_image_warn() as a wrapper around dissect_image() that prints + friendly log messages for the returned errors, so that we don't have to + duplicate that in nspawn, systemd-dissect and PID 1. + * maybe set a new set of env vars for services, based on RuntimeDirectory=, StateDirectory=, LogsDirectory=, CacheDirectory= and ConfigurationDirectory= automatically. For example, there could be $RUNTIME_DIRECTORY,