From 0646d3c3dd27bab1ec6bd73c37325318f8578393 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 19 Dec 2018 00:01:22 +0100 Subject: [PATCH 01/18] nspawn: explicitly refuse mounts over / Previously this would fail later on, but let's filter this out at the time of parsing. --- src/nspawn/nspawn-mount.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index a9af889747..1c0c6d8ff1 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -212,6 +212,8 @@ int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) if (!path_is_absolute(destination)) return -EINVAL; + if (empty_or_root(destination)) + return -EINVAL; m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND); if (!m) @@ -251,6 +253,8 @@ int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) { if (!path_is_absolute(path)) return -EINVAL; + if (empty_or_root(path)) + return -EINVAL; m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS); if (!m) @@ -310,6 +314,9 @@ int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_onl return -EINVAL; } + if (empty_or_root(destination)) + return -EINVAL; + m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY); if (!m) return -ENOMEM; From e5b43a04b6e4871b8dbcaa0cbabf953bd3aaa04b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 19 Dec 2018 00:09:57 +0100 Subject: [PATCH 02/18] nspawn: add volatile mode multiplexer call setup_volatile_mode() Just some refactoring, no change in behaviour. 
--- src/nspawn/nspawn-mount.c | 38 ++++++++++++++++++++++++-------------- src/nspawn/nspawn-mount.h | 3 +-- src/nspawn/nspawn.c | 12 +----------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 1c0c6d8ff1..5a9b8eb326 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -856,9 +856,8 @@ int mount_custom( return 0; } -int setup_volatile_state( +static int setup_volatile_state( const char *directory, - VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context) { @@ -868,11 +867,7 @@ int setup_volatile_state( assert(directory); - if (mode != VOLATILE_STATE) - return 0; - - /* --volatile=state means we simply overmount /var - with a tmpfs, and the rest read-only. */ + /* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */ r = bind_remount_recursive(directory, true, NULL); if (r < 0) @@ -893,9 +888,8 @@ int setup_volatile_state( return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options); } -int setup_volatile( +static int setup_volatile_yes( const char *directory, - VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context) { @@ -907,11 +901,8 @@ int setup_volatile( assert(directory); - if (mode != VOLATILE_YES) - return 0; - - /* --volatile=yes means we mount a tmpfs to the root dir, and - the original /usr to use inside it, and that read-only. */ + /* --volatile=yes means we mount a tmpfs to the root dir, and the original /usr to use inside it, and that + read-only. 
*/ if (!mkdtemp(template)) return log_error_errno(errno, "Failed to create temporary directory: %m"); @@ -968,6 +959,25 @@ fail: return r; } +int setup_volatile_mode( + const char *directory, + VolatileMode mode, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context) { + + switch (mode) { + + case VOLATILE_YES: + return setup_volatile_yes(directory, userns, uid_shift, uid_range, selinux_apifs_context); + + case VOLATILE_STATE: + return setup_volatile_state(directory, userns, uid_shift, uid_range, selinux_apifs_context); + + default: + return 0; + } +} + /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */ int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) { _cleanup_free_ char *root_new = NULL, *root_old = NULL; diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h index 8051a7d9d9..e060ca0e4d 100644 --- a/src/nspawn/nspawn-mount.h +++ b/src/nspawn/nspawn-mount.h @@ -49,8 +49,7 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings); int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); -int setup_volatile(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); -int setup_volatile_state(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); +int setup_volatile_mode(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s); int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index e0c2d711e6..bb070e1b35 100644 --- a/src/nspawn/nspawn.c +++ 
b/src/nspawn/nspawn.c @@ -2933,17 +2933,7 @@ static int outer_child( if (r < 0) return r; - r = setup_volatile( - directory, - arg_volatile_mode, - arg_userns_mode != USER_NAMESPACE_NO, - arg_uid_shift, - arg_uid_range, - arg_selinux_context); - if (r < 0) - return r; - - r = setup_volatile_state( + r = setup_volatile_mode( directory, arg_volatile_mode, arg_userns_mode != USER_NAMESPACE_NO, From c55d0ae76422d050d0832dce0ff4b724072e8777 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 19 Dec 2018 01:01:46 +0100 Subject: [PATCH 03/18] nspawn: fix an error path --- src/nspawn/nspawn-mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 5a9b8eb326..916070b3f2 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -910,7 +910,7 @@ static int setup_volatile_yes( options = "mode=755"; r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf); if (r < 0) - return log_oom(); + goto fail; if (r > 0) options = buf; From 6c610acaaa771a20893fb185d654cb0f9dd451b4 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 19 Dec 2018 01:02:06 +0100 Subject: [PATCH 04/18] nspawn: add --volatile=overlay support Fixes: #11054 #3847 --- src/nspawn/nspawn-mount.c | 68 ++++++++++++++++++++++++++++++++++++++ src/nspawn/nspawn.c | 12 +++---- src/shared/volatile-util.c | 1 + src/shared/volatile-util.h | 1 + 4 files changed, 76 insertions(+), 6 deletions(-) diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 916070b3f2..eb0a26ef35 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -959,6 +959,71 @@ fail: return r; } +static int setup_volatile_overlay( + const char *directory, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context) { + + _cleanup_free_ char *buf = NULL, *escaped_directory = NULL, *escaped_upper = NULL, *escaped_work = NULL; + char template[] = 
"/tmp/nspawn-volatile-XXXXXX"; + const char *upper, *work, *options; + bool tmpfs_mounted = false; + int r; + + assert(directory); + + /* --volatile=overlay means we mount an overlayfs to the root dir. */ + + if (!mkdtemp(template)) + return log_error_errno(errno, "Failed to create temporary directory: %m"); + + options = "mode=755"; + r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf); + if (r < 0) + goto finish; + if (r > 0) + options = buf; + + r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options); + if (r < 0) + goto finish; + + tmpfs_mounted = true; + + upper = strjoina(template, "/upper"); + work = strjoina(template, "/work"); + + if (mkdir(upper, 0755) < 0) { + r = log_error_errno(errno, "Failed to create %s: %m", upper); + goto finish; + } + if (mkdir(work, 0755) < 0) { + r = log_error_errno(errno, "Failed to create %s: %m", work); + goto finish; + } + + /* And now, let's overmount the root dir with an overlayfs that uses the root dir as lower dir. It's kinda nice + * that the kernel allows us to do that without going through some mount point rearrangements. 
*/ + + escaped_directory = shell_escape(directory, ",:"); + escaped_upper = shell_escape(upper, ",:"); + escaped_work = shell_escape(work, ",:"); + if (!escaped_directory || !escaped_upper || !escaped_work) { + r = -ENOMEM; + goto finish; + } + + options = strjoina("lowerdir=", escaped_directory, ",upperdir=", escaped_upper, ",workdir=", escaped_work); + r = mount_verbose(LOG_ERR, "overlay", directory, "overlay", 0, options); + +finish: + if (tmpfs_mounted) + (void) umount_verbose(template); + + (void) rmdir(template); + return r; +} + int setup_volatile_mode( const char *directory, VolatileMode mode, @@ -973,6 +1038,9 @@ int setup_volatile_mode( case VOLATILE_STATE: return setup_volatile_state(directory, userns, uid_shift, uid_range, selinux_apifs_context); + case VOLATILE_OVERLAY: + return setup_volatile_overlay(directory, userns, uid_shift, uid_range, selinux_apifs_context); + default: return 0; } diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index bb070e1b35..0bdfc7677a 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -1431,9 +1431,9 @@ static int setup_timezone(const char *dest) { if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) { r = readlink_malloc("/etc/localtime", &p); if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO) - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_OFF : TIMEZONE_DELETE; + m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? TIMEZONE_OFF : TIMEZONE_DELETE; else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */ - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_COPY; + m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? 
TIMEZONE_BIND : TIMEZONE_COPY; else if (r < 0) { log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m"); /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data @@ -1444,7 +1444,7 @@ static int setup_timezone(const char *dest) { */ return 0; } else if (arg_timezone == TIMEZONE_AUTO) - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_SYMLINK; + m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? TIMEZONE_BIND : TIMEZONE_SYMLINK; else m = arg_timezone; } else @@ -1606,11 +1606,11 @@ static int setup_resolv_conf(const char *dest) { if (arg_private_network) m = RESOLV_CONF_OFF; else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0) - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_STATIC : RESOLV_CONF_COPY_STATIC; + m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? RESOLV_CONF_BIND_STATIC : RESOLV_CONF_COPY_STATIC; else if (have_resolv_conf("/etc/resolv.conf") > 0) - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_HOST : RESOLV_CONF_COPY_HOST; + m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? RESOLV_CONF_BIND_HOST : RESOLV_CONF_COPY_HOST; else - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_OFF : RESOLV_CONF_DELETE; + m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? 
RESOLV_CONF_OFF : RESOLV_CONF_DELETE; } else m = arg_resolv_conf; diff --git a/src/shared/volatile-util.c b/src/shared/volatile-util.c index 4d75bc0e96..917ebfa4e4 100644 --- a/src/shared/volatile-util.c +++ b/src/shared/volatile-util.c @@ -39,6 +39,7 @@ static const char* const volatile_mode_table[_VOLATILE_MODE_MAX] = { [VOLATILE_NO] = "no", [VOLATILE_YES] = "yes", [VOLATILE_STATE] = "state", + [VOLATILE_OVERLAY] = "overlay", }; DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(volatile_mode, VolatileMode, VOLATILE_YES); diff --git a/src/shared/volatile-util.h b/src/shared/volatile-util.h index 8761c44ab8..2d31bb1174 100644 --- a/src/shared/volatile-util.h +++ b/src/shared/volatile-util.h @@ -5,6 +5,7 @@ typedef enum VolatileMode { VOLATILE_NO, VOLATILE_YES, VOLATILE_STATE, + VOLATILE_OVERLAY, _VOLATILE_MODE_MAX, _VOLATILE_MODE_INVALID = -1 } VolatileMode; From 7d0ecdd62d296061ebcbc4cfb04165a6efd856d3 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 19 Dec 2018 01:02:47 +0100 Subject: [PATCH 05/18] nspawn: slightly reorder mount logic Let's first setup the volatile logic, and only then mount secondary partitions of the image in. 
--- src/nspawn/nspawn.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 0bdfc7677a..18ba76f0cd 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2896,6 +2896,28 @@ static int outer_child( "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); } + /* Turn directory into bind mount */ + r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + + r = setup_pivot_root( + directory, + arg_pivot_root_new, + arg_pivot_root_old); + if (r < 0) + return r; + + r = setup_volatile_mode( + directory, + arg_volatile_mode, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_context); + if (r < 0) + return r; + if (dissected_image) { /* Now we know the uid shift, let's now mount everything else that might be in the image. */ r = dissected_image_mount(dissected_image, directory, arg_uid_shift, @@ -2921,28 +2943,6 @@ static int outer_child( unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket); } - /* Turn directory into bind mount */ - r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL); - if (r < 0) - return r; - - r = setup_pivot_root( - directory, - arg_pivot_root_new, - arg_pivot_root_old); - if (r < 0) - return r; - - r = setup_volatile_mode( - directory, - arg_volatile_mode, - arg_userns_mode != USER_NAMESPACE_NO, - arg_uid_shift, - arg_uid_range, - arg_selinux_context); - if (r < 0) - return r; - /* Mark everything as shared so our mounts get propagated down. This is * required to make new bind mounts available in systemd services * inside the containter that create a new mount namespace. 
From e50cd82f688b69065fe67314f6f45a35c01ee6ee Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 19 Dec 2018 01:03:52 +0100 Subject: [PATCH 06/18] nspawn: no need to make top-level directory a bind mount if we just dissected an image --- src/nspawn/nspawn.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 18ba76f0cd..f668fffb6e 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2896,10 +2896,12 @@ static int outer_child( "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); } - /* Turn directory into bind mount */ - r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL); - if (r < 0) - return r; + if (!dissected_image) { + /* Turn directory into bind mount */ + r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + } r = setup_pivot_root( directory, From 83205269c0f269a64143012604357d607ef8d142 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Dec 2018 23:33:44 +0100 Subject: [PATCH 07/18] nspawn: refactor how we determine whether it's OK to write to /etc --- src/nspawn/nspawn.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index f668fffb6e..92c7e2e6b1 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -1420,6 +1420,10 @@ static const char *timezone_from_path(const char *path) { "/usr/share/zoneinfo/"); } +static bool etc_writable(void) { + return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY); +} + static int setup_timezone(const char *dest) { _cleanup_free_ char *p = NULL, *etc = NULL; const char *where, *check; @@ -1431,9 +1435,9 @@ static int setup_timezone(const char *dest) { if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) { r = readlink_malloc("/etc/localtime", &p); if (r == -ENOENT && arg_timezone == 
TIMEZONE_AUTO) - m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? TIMEZONE_OFF : TIMEZONE_DELETE; + m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF; else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */ - m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? TIMEZONE_BIND : TIMEZONE_COPY; + m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND; else if (r < 0) { log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m"); /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data @@ -1444,7 +1448,7 @@ static int setup_timezone(const char *dest) { */ return 0; } else if (arg_timezone == TIMEZONE_AUTO) - m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? TIMEZONE_BIND : TIMEZONE_SYMLINK; + m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND; else m = arg_timezone; } else @@ -1606,11 +1610,11 @@ static int setup_resolv_conf(const char *dest) { if (arg_private_network) m = RESOLV_CONF_OFF; else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0) - m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? RESOLV_CONF_BIND_STATIC : RESOLV_CONF_COPY_STATIC; + m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC; else if (have_resolv_conf("/etc/resolv.conf") > 0) - m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? RESOLV_CONF_BIND_HOST : RESOLV_CONF_COPY_HOST; + m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST; else - m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? RESOLV_CONF_OFF : RESOLV_CONF_DELETE; + m = etc_writable() ? 
RESOLV_CONF_DELETE : RESOLV_CONF_OFF; } else m = arg_resolv_conf; From e5a4bb0d4e73079f9d553b1aeb0692e36554d3ca Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Dec 2018 23:37:00 +0100 Subject: [PATCH 08/18] nspawn: rework how arg_read_only is initialized in --volatile= mode Previously, we'd refuse the combination, and claimed we'd imply it, but actually didn't. Let's allow the combination and imply read-only from --volatile=, because that's what's documented, what we claim we do, and what makes sense. --- src/nspawn/nspawn.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 92c7e2e6b1..14f251a277 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -1308,6 +1308,9 @@ static int verify_arguments(void) { if (arg_start_mode == START_BOOT && arg_kill_signal <= 0) arg_kill_signal = SIGRTMIN+3; + if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */ + arg_read_only = true; + if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) /* Save the user from accidentally registering either user-$SESSION.scope or user@.service. * The latter is not technically a user session, but we don't need to labour the point. */ @@ -1334,6 +1337,12 @@ static int verify_arguments(void) { if (arg_userns_chown && arg_read_only) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--read-only and --private-users-chown may not be combined."); + /* We don't support --private-users-chown together with any of the volatile modes since we couldn't + * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive + * copy-up (in case of overlay) making the entire exercise pointless. 
*/ + if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined."); + /* If --network-namespace-path is given with any other network-related option, * we need to error out, to avoid conflicts between different network options. */ if (arg_network_namespace_path && @@ -1352,9 +1361,6 @@ static int verify_arguments(void) { if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts."); - if (arg_volatile_mode != VOLATILE_NO && arg_read_only) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy."); - if (arg_expose_ports && !arg_private_network) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking."); @@ -2967,7 +2973,7 @@ static int outer_child( if (r < 0) return r; - if (arg_read_only) { + if (arg_read_only && arg_volatile_mode == VOLATILE_NO) { r = bind_remount_recursive(directory, true, NULL); if (r < 0) return log_error_errno(r, "Failed to make tree read-only: %m"); From 68abaa09299445ad63881bca070c0069355c2ebc Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 19 Dec 2018 14:52:26 +0100 Subject: [PATCH 09/18] volatile-util: tweak query_volatile_mode() a bit --- src/shared/volatile-util.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/shared/volatile-util.c b/src/shared/volatile-util.c index 917ebfa4e4..5ca6ab3376 100644 --- a/src/shared/volatile-util.c +++ b/src/shared/volatile-util.c @@ -12,27 +12,28 @@ int query_volatile_mode(VolatileMode *ret) { _cleanup_free_ char *mode = NULL; - VolatileMode m = VOLATILE_NO; int r; r = proc_cmdline_get_key("systemd.volatile", PROC_CMDLINE_VALUE_OPTIONAL, &mode); if (r < 0) return r; - if 
(r == 0) - goto finish; + if (r == 0) { + *ret = VOLATILE_NO; + return 0; + } if (mode) { + VolatileMode m; + m = volatile_mode_from_string(mode); if (m < 0) return -EINVAL; + + *ret = m; } else - m = VOLATILE_YES; + *ret = VOLATILE_YES; - r = 1; - -finish: - *ret = m; - return r; + return 1; } static const char* const volatile_mode_table[_VOLATILE_MODE_MAX] = { From 85fb5bb2cb074b75e79e348bb0ab2bdc0111de91 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 19 Dec 2018 15:03:47 +0100 Subject: [PATCH 10/18] volatile-root: add missing logging to volatile-root --- src/volatile-root/volatile-root.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/volatile-root/volatile-root.c b/src/volatile-root/volatile-root.c index 5da9ce1681..5c30da0de8 100644 --- a/src/volatile-root/volatile-root.c +++ b/src/volatile-root/volatile-root.c @@ -45,7 +45,7 @@ static int make_volatile(const char *path) { goto finish_rmdir; if (mkdir("/run/systemd/volatile-sysroot/usr", 0755) < 0) { - r = -errno; + r = log_error_errno(errno, "Failed to create /usr directory: %m"); goto finish_umount; } @@ -54,8 +54,10 @@ static int make_volatile(const char *path) { goto finish_umount; r = bind_remount_recursive("/run/systemd/volatile-sysroot/usr", true, NULL); - if (r < 0) + if (r < 0) { + log_error_errno(r, "Failed to remount /usr read-only: %m"); goto finish_umount; + } r = umount_recursive(path, 0); if (r < 0) { @@ -64,7 +66,7 @@ static int make_volatile(const char *path) { } if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) - log_warning_errno(errno, "Failed to remount %s MS_SLAVE|MS_REC: %m", path); + log_warning_errno(errno, "Failed to remount %s MS_SLAVE|MS_REC, ignoring: %m", path); r = mount_verbose(LOG_ERR, "/run/systemd/volatile-sysroot", path, NULL, MS_MOVE, NULL); From 26945d18acf1c40a5e4533b60023506c918331b6 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 19 Dec 2018 15:04:08 +0100 Subject: [PATCH 11/18] volatile-root: fail if we 
can't parse specified parameter --- src/volatile-root/volatile-root.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/volatile-root/volatile-root.c b/src/volatile-root/volatile-root.c index 5c30da0de8..d7f52e4a62 100644 --- a/src/volatile-root/volatile-root.c +++ b/src/volatile-root/volatile-root.c @@ -96,10 +96,8 @@ static int run(int argc, char *argv[]) { if (r == 0 && argc >= 2) { /* The kernel command line always wins. However if nothing was set there, the argument passed here wins instead. */ m = volatile_mode_from_string(argv[1]); - if (m < 0) { - log_error("Couldn't parse volatile mode: %s", argv[1]); - r = -EINVAL; - } + if (m < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Couldn't parse volatile mode: %s", argv[1]); } if (argc < 3) From 1de7f825d3a585846c7421041a21a1672eaf837d Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 19 Dec 2018 17:30:13 +0100 Subject: [PATCH 12/18] volatile-root: add overlay mode for host boots, too --- src/fstab-generator/fstab-generator.c | 7 ++- src/volatile-root/volatile-root.c | 84 ++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 19 deletions(-) diff --git a/src/fstab-generator/fstab-generator.c b/src/fstab-generator/fstab-generator.c index 30a6d356d0..d1bfa775e4 100644 --- a/src/fstab-generator/fstab-generator.c +++ b/src/fstab-generator/fstab-generator.c @@ -722,10 +722,11 @@ static int add_sysroot_usr_mount(void) { } static int add_volatile_root(void) { - /* Let's add in systemd-remount-volatile.service which will remount the root device to tmpfs if this is - * requested, leaving only /usr from the root mount inside. */ - if (arg_volatile_mode != VOLATILE_YES) + /* Let's add in systemd-remount-volatile.service which will remount the root device to tmpfs if this is + * requested (or as an overlayfs), leaving only /usr from the root mount inside. 
*/ + + if (!IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY)) return 0; return generator_add_symlink(arg_dest, SPECIAL_INITRD_ROOT_FS_TARGET, "requires", diff --git a/src/volatile-root/volatile-root.c b/src/volatile-root/volatile-root.c index d7f52e4a62..02f6d31b05 100644 --- a/src/volatile-root/volatile-root.c +++ b/src/volatile-root/volatile-root.c @@ -3,6 +3,7 @@ #include #include "alloc-util.h" +#include "escape.h" #include "fs-util.h" #include "main-func.h" #include "mkdir.h" @@ -17,20 +18,7 @@ static int make_volatile(const char *path) { _cleanup_free_ char *old_usr = NULL; int r; - r = path_is_mount_point(path, NULL, AT_SYMLINK_FOLLOW); - if (r < 0) - return log_error_errno(r, "Couldn't determine whether %s is a mount point: %m", path); - if (r == 0) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), - "%s is not a mount point.", path); - - r = path_is_temporary_fs(path); - if (r < 0) - return log_error_errno(r, "Couldn't determine whether %s is a temporary file system: %m", path); - if (r > 0) { - log_info("%s already is a temporary file system.", path); - return 0; - } + assert(path); r = chase_symlinks("/usr", path, CHASE_PREFIX_ROOT, &old_usr); if (r < 0) @@ -79,6 +67,51 @@ finish_rmdir: return r; } +static int make_overlay(const char *path) { + _cleanup_free_ char *escaped_path = NULL; + bool tmpfs_mounted = false; + const char *options = NULL; + int r; + + assert(path); + + r = mkdir_p("/run/systemd/overlay-sysroot", 0700); + if (r < 0) + return log_error_errno(r, "Couldn't create overlay sysroot directory: %m"); + + r = mount_verbose(LOG_ERR, "tmpfs", "/run/systemd/overlay-sysroot", "tmpfs", MS_STRICTATIME, "mode=755"); + if (r < 0) + goto finish; + + tmpfs_mounted = true; + + if (mkdir("/run/systemd/overlay-sysroot/upper", 0755) < 0) { + r = log_error_errno(errno, "Failed to create /run/systemd/overlay-sysroot/upper: %m"); + goto finish; + } + + if (mkdir("/run/systemd/overlay-sysroot/work", 0755) < 0) { + r = log_error_errno(errno, "Failed 
to create /run/systemd/overlay-sysroot/work: %m"); + goto finish; + } + + escaped_path = shell_escape(path, ",:"); + if (!escaped_path) { + r = log_oom(); + goto finish; + } + + options = strjoina("lowerdir=", escaped_path, ",upperdir=/run/systemd/overlay-sysroot/upper,workdir=/run/systemd/overlay-sysroot/work"); + r = mount_verbose(LOG_ERR, "overlay", path, "overlay", 0, options); + +finish: + if (tmpfs_mounted) + (void) umount_verbose("/run/systemd/overlay-sysroot"); + + (void) rmdir("/run/systemd/overlay-sysroot"); + return r; +} + static int run(int argc, char *argv[]) { VolatileMode m = _VOLATILE_MODE_INVALID; const char *path; @@ -116,10 +149,29 @@ static int run(int argc, char *argv[]) { "Directory cannot be the root directory."); } - if (m != VOLATILE_YES) + if (!IN_SET(m, VOLATILE_YES, VOLATILE_OVERLAY)) return 0; - return make_volatile(path); + r = path_is_mount_point(path, NULL, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Couldn't determine whether %s is a mount point: %m", path); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "%s is not a mount point.", path); + + r = path_is_temporary_fs(path); + if (r < 0) + return log_error_errno(r, "Couldn't determine whether %s is a temporary file system: %m", path); + if (r > 0) { + log_info("%s already is a temporary file system.", path); + return 0; + } + + if (m == VOLATILE_YES) + return make_volatile(path); + else { + assert(m == VOLATILE_OVERLAY); + return make_overlay(path); + } } DEFINE_MAIN_FUNCTION(run); From d10b92cb5e0461efa3defe76a16605d73e3977c9 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 20 Dec 2018 10:13:35 +0100 Subject: [PATCH 13/18] volatile-root: export original root --- src/volatile-root/volatile-root.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/volatile-root/volatile-root.c b/src/volatile-root/volatile-root.c index 02f6d31b05..701f5a2832 100644 --- a/src/volatile-root/volatile-root.c +++ 
b/src/volatile-root/volatile-root.c @@ -3,6 +3,7 @@ #include #include "alloc-util.h" +#include "blockdev-util.h" #include "escape.h" #include "fs-util.h" #include "main-func.h" @@ -115,6 +116,7 @@ finish: static int run(int argc, char *argv[]) { VolatileMode m = _VOLATILE_MODE_INVALID; const char *path; + dev_t devt; int r; log_setup_service(); @@ -166,6 +168,24 @@ static int run(int argc, char *argv[]) { return 0; } + /* We are about to replace the root directory with something else. Later code might want to know what we + * replaced here, hence let's save that information as a symlink we can later use. (This is particularly + * relevant for the overlayfs case where we'll fully obstruct the view onto the underlying device, hence + * querying the backing device node from the file system directly is no longer possible. */ + r = get_block_device_harder(path, &devt); + if (r < 0) + return log_error_errno(r, "Failed to determine device major/minor of %s: %m", path); + else if (r > 0) { + _cleanup_free_ char *dn = NULL; + + r = device_path_make_major_minor(S_IFBLK, devt, &dn); + if (r < 0) + return log_error_errno(r, "Failed to format device node path: %m"); + + if (symlink(dn, "/run/systemd/volatile-root") < 0) + log_warning_errno(errno, "Failed to create symlink /run/systemd/volatile-root: %m"); + } + if (m == VOLATILE_YES) return make_volatile(path); else { From 46c82d495613cb1aa89f6abf75289946db9996c7 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 20 Dec 2018 15:15:43 +0100 Subject: [PATCH 14/18] =?UTF-8?q?gpt-auto-generator:=20rename=20open=5Fpar?= =?UTF-8?q?ent()=20=E2=86=92=20open=5Fparent=5Fdevno()=20so=20that=20we=20?= =?UTF-8?q?can=20include=20fs-util.h=20later?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As that header also defines a function open_parent() which does something different. 
--- src/gpt-auto-generator/gpt-auto-generator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpt-auto-generator/gpt-auto-generator.c b/src/gpt-auto-generator/gpt-auto-generator.c index 09c0bcba2d..0ce9cf06e2 100644 --- a/src/gpt-auto-generator/gpt-auto-generator.c +++ b/src/gpt-auto-generator/gpt-auto-generator.c @@ -483,7 +483,7 @@ static int add_root_rw(DissectedPartition *p) { return 0; } -static int open_parent(dev_t devnum, int *ret) { +static int open_parent_devno(dev_t devnum, int *ret) { _cleanup_(sd_device_unrefp) sd_device *d = NULL; const char *name, *devtype, *node; sd_device *parent; @@ -551,7 +551,7 @@ static int enumerate_partitions(dev_t devnum) { _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL; int r, k; - r = open_parent(devnum, &fd); + r = open_parent_devno(devnum, &fd); if (r <= 0) return r; From 2bef2582a140e4dbaa517c16befc445919f9b7c6 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 20 Dec 2018 15:18:38 +0100 Subject: [PATCH 15/18] gpt-auto-generator: use new /run/systemd/volatile-root symlink as fallback when we otherwise cannot determine root device node --- src/gpt-auto-generator/gpt-auto-generator.c | 22 +++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/gpt-auto-generator/gpt-auto-generator.c b/src/gpt-auto-generator/gpt-auto-generator.c index 0ce9cf06e2..65c504e10f 100644 --- a/src/gpt-auto-generator/gpt-auto-generator.c +++ b/src/gpt-auto-generator/gpt-auto-generator.c @@ -18,6 +18,7 @@ #include "efivars.h" #include "fd-util.h" #include "fileio.h" +#include "fs-util.h" #include "fstab-util.h" #include "generator.h" #include "gpt.h" @@ -707,8 +708,25 @@ static int add_mounts(void) { if (r < 0) return log_error_errno(r, "Failed to determine block device of /usr file system: %m"); if (r == 0) { - log_debug("Neither root nor /usr file system are on a (single) block device."); - return 0; + _cleanup_free_ char *p = NULL; + mode_t m; + + /* If the root 
mount has been replaced by some form of volatile file system (overlayfs), the + * original root block device node is symlinked in /run/systemd/volatile-root. Let's read that + * here. */ + r = readlink_malloc("/run/systemd/volatile-root", &p); + if (r == -ENOENT) { + log_debug("Neither root nor /usr file system are on a (single) block device."); + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to read symlink /run/systemd/volatile-root: %m"); + + r = device_path_parse_major_minor(p, &m, &devno); + if (r < 0) + return log_error_errno(r, "Failed to parse major/minor device node: %m"); + if (!S_ISBLK(m)) + return log_error_errno(SYNTHETIC_ERRNO(ENOTBLK), "Volatile root device is of wrong type."); } } From adc6f43b148b097c846f63522876f0f1c91ea0c0 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 20 Dec 2018 16:01:57 +0100 Subject: [PATCH 16/18] copy: don't synthesize a 'user.crtime_usec' xattr on copy unless explicitly requested Previously, when we'd copy an individual file we'd synthesize a user.crtime_usec xattr with the source's creation time if we can determine it. As the creation/birth time was until recently not queriable from userspace this effectively just propagated the same xattr on the source to the same xattr on the destination. However, current kernels now allow to query the birthtime using statx() and we do make use of that now. Which means that suddenly we started synthesizing these xattrs much more regularly. Doing this actually does make sense, but only in very few cases: not for the typical regular files we copy, but certainly when dealing with disk images. Hence, let's keep this kind of propagation, but let's make it a flag and default to off. Then turn it on whenever we deal with disk images, and leave it off otherwise.
This is particularly relevant as overlayfs combining a real fs, and a tmpfs on top will result in EOPNOTSUPP when it is attempted to open a file with xattrs for writing, as tmpfs does not support xattrs, and hence the copy-up cannot work. Hence, let's avoid synthesizing this needlessly, to increase compat with overlayfs. --- src/basic/copy.c | 13 ++++++++----- src/basic/copy.h | 3 ++- src/boot/bootctl.c | 2 +- src/import/export-raw.c | 2 +- src/import/import-raw.c | 2 +- src/import/pull-raw.c | 2 +- src/nspawn/nspawn.c | 2 +- src/shared/machine-image.c | 2 +- 8 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/basic/copy.c b/src/basic/copy.c index 46e02a3759..2f36c8eb87 100644 --- a/src/basic/copy.c +++ b/src/basic/copy.c @@ -743,7 +743,7 @@ int copy_file_fd_full( r = copy_bytes_full(fdf, fdt, (uint64_t) -1, copy_flags, NULL, NULL, progress_bytes, userdata); - (void) copy_times(fdf, fdt); + (void) copy_times(fdf, fdt, copy_flags); (void) copy_xattr(fdf, fdt); return r; @@ -849,10 +849,9 @@ int copy_file_atomic_full( return 0; } -int copy_times(int fdf, int fdt) { +int copy_times(int fdf, int fdt, CopyFlags flags) { struct timespec ut[2]; struct stat st; - usec_t crtime = 0; assert(fdf >= 0); assert(fdt >= 0); @@ -866,8 +865,12 @@ int copy_times(int fdf, int fdt) { if (futimens(fdt, ut) < 0) return -errno; - if (fd_getcrtime(fdf, &crtime) >= 0) - (void) fd_setcrtime(fdt, crtime); + if (FLAGS_SET(flags, COPY_CRTIME)) { + usec_t crtime; + + if (fd_getcrtime(fdf, &crtime) >= 0) + (void) fd_setcrtime(fdt, crtime); + } return 0; } diff --git a/src/basic/copy.h b/src/basic/copy.h index f677021881..a33546d3ab 100644 --- a/src/basic/copy.h +++ b/src/basic/copy.h @@ -14,6 +14,7 @@ typedef enum CopyFlags { COPY_REPLACE = 1 << 2, /* Replace an existing file if there's one */ COPY_SAME_MOUNT = 1 << 3, /* Don't descend recursively into other file systems, across mount point boundaries */ COPY_MERGE_EMPTY = 1 << 4, /* Merge an existing, empty directory with our 
new tree to copy */ + COPY_CRTIME = 1 << 5, /* Generate a user.crtime_usec xattr off the source crtime if there is one, on copying */ } CopyFlags; typedef int (*copy_progress_bytes_t)(uint64_t n_bytes, void *userdata); @@ -57,5 +58,5 @@ static inline int copy_bytes(int fdf, int fdt, uint64_t max_bytes, CopyFlags cop return copy_bytes_full(fdf, fdt, max_bytes, copy_flags, NULL, NULL, NULL, NULL); } -int copy_times(int fdf, int fdt); +int copy_times(int fdf, int fdt, CopyFlags flags); int copy_xattr(int fdf, int fdt); diff --git a/src/boot/bootctl.c b/src/boot/bootctl.c index dc2fd96628..323806a534 100644 --- a/src/boot/bootctl.c +++ b/src/boot/bootctl.c @@ -436,7 +436,7 @@ static int copy_file_with_version_check(const char *from, const char *to, bool f return log_error_errno(r, "Failed to copy data from \"%s\" to \"%s\": %m", from, t); } - (void) copy_times(fd_from, fd_to); + (void) copy_times(fd_from, fd_to, 0); if (fsync(fd_to) < 0) { (void) unlink_noerrno(t); diff --git a/src/import/export-raw.c b/src/import/export-raw.c index 6a02b47a17..c1c946cd2b 100644 --- a/src/import/export-raw.c +++ b/src/import/export-raw.c @@ -223,7 +223,7 @@ static int raw_export_process(RawExport *e) { finish: if (r >= 0) { - (void) copy_times(e->input_fd, e->output_fd); + (void) copy_times(e->input_fd, e->output_fd, COPY_CRTIME); (void) copy_xattr(e->input_fd, e->output_fd); } diff --git a/src/import/import-raw.c b/src/import/import-raw.c index 4b1161557d..56f3431a08 100644 --- a/src/import/import-raw.c +++ b/src/import/import-raw.c @@ -215,7 +215,7 @@ static int raw_import_finish(RawImport *i) { return r; if (S_ISREG(i->st.st_mode)) { - (void) copy_times(i->input_fd, i->output_fd); + (void) copy_times(i->input_fd, i->output_fd, COPY_CRTIME); (void) copy_xattr(i->input_fd, i->output_fd); } diff --git a/src/import/pull-raw.c b/src/import/pull-raw.c index 3a3e015df8..72b9054e49 100644 --- a/src/import/pull-raw.c +++ b/src/import/pull-raw.c @@ -368,7 +368,7 @@ static int 
raw_pull_make_local_copy(RawPull *i) { return log_error_errno(r, "Failed to make writable copy of image: %m"); } - (void) copy_times(i->raw_job->disk_fd, dfd); + (void) copy_times(i->raw_job->disk_fd, dfd, COPY_CRTIME); (void) copy_xattr(i->raw_job->disk_fd, dfd); dfd = safe_close(dfd); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 14f251a277..5cb049e5f7 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -4400,7 +4400,7 @@ int main(int argc, char *argv[]) { goto finish; } - r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK); + r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME); if (r < 0) { r = log_error_errno(r, "Failed to copy image file: %m"); goto finish; diff --git a/src/shared/machine-image.c b/src/shared/machine-image.c index af06ab22e8..3d61221056 100644 --- a/src/shared/machine-image.c +++ b/src/shared/machine-image.c @@ -870,7 +870,7 @@ int image_clone(Image *i, const char *new_name, bool read_only) { case IMAGE_RAW: new_path = strjoina("/var/lib/machines/", new_name, ".raw"); - r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, FS_NOCOW_FL, COPY_REFLINK); + r = copy_file_atomic(i->path, new_path, read_only ? 
0444 : 0644, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME); break; case IMAGE_BLOCK: From b23f16283d3c8ed2008713049798caff0c3bf9fc Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Dec 2018 21:45:46 +0100 Subject: [PATCH 17/18] man: document nspawn's new --volatile=overlay switch --- man/systemd-nspawn.xml | 115 +++++++++++++++++++++++++---------------- 1 file changed, 71 insertions(+), 44 deletions(-) diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index f0a5231acf..5ed49e6587 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -167,9 +167,9 @@ template path refers to the root of a btrfs subvolume, in which case a simple copy-on-write snapshot is taken, and populating the root directory is instant. If the specified template path does not refer to the root of a btrfs subvolume (or not even to a btrfs file system at - all), the tree is copied (though possibly in a copy-on-write scheme — if the file system supports that), which - can be substantially more time-consuming. May not be specified together with or - . + all), the tree is copied (though possibly in a 'reflink' copy-on-write scheme — if the file system supports + that), which can be substantially more time-consuming. May not be specified together with + or . Note that this switch leaves host name, machine ID and all other settings that could identify the instance @@ -183,9 +183,16 @@ If specified, the container is run with a temporary snapshot of its file system that is removed immediately when the container terminates. May not be specified together with . - Note that this switch leaves host name, machine ID and - all other settings that could identify the instance - unmodified. + Note that this switch leaves host name, machine ID and all other settings that could identify the + instance unmodified. 
Please note that — as with — taking the temporary snapshot is + more efficient on file systems that support subvolume snapshots or 'reflinks' natively (btrfs + or new xfs) than on more traditional file systems that do not + (ext4). + + With this option no modifications of the container image are retained. Use + (described below) for other mechanisms to restrict persistency of + container images during runtime. + @@ -899,8 +906,12 @@ - Mount the root file system read-only for the - container. + Mount the container's root file system (and any other file systems contained in the container + image) read-only. This has no effect on additional mounts made with , + and similar options. This mode is implied if the container image file or directory is + marked read-only itself. It is also implied if is used. In this case the container + image on disk is strictly read-only, while changes are permitted but kept non-persistently in memory only. For + further details, see below. @@ -931,20 +942,16 @@ - Mount a tmpfs file system into the container. - Takes a single absolute path argument that specifies where to - mount the tmpfs instance to (in which case the directory - access mode will be chosen as 0755, owned by root/root), or - optionally a colon-separated pair of path and mount option - string that is used for mounting (in which case the kernel - default for access mode and owner will be chosen, unless - otherwise specified). This option is particularly useful for - mounting directories such as /var as - tmpfs, to allow state-less systems, in particular when - combined with . - Backslash escapes are interpreted in the path, so - \: may be used to embed colons in the path. - + Mount a tmpfs file system into the container.
Takes a single absolute path argument that + specifies where to mount the tmpfs instance to (in which case the directory access mode will be chosen as 0755, + owned by root/root), or optionally a colon-separated pair of path and mount option string that is used for + mounting (in which case the kernel default for access mode and owner will be chosen, unless otherwise + specified). Backslash escapes are interpreted in the path, so \: may be used to embed colons + in the path. + + Note that this option cannot be used to replace the root file system of the container with a temporary + file system. However, the option described below provides similar + functionality, with a focus on implementing stateless operating system images. @@ -1002,7 +1009,11 @@ be on the same file system as the top-most directory tree). Also note that the lowerdir= mount option receives the paths to stack in the opposite order of - this switch. + this switch. + + Note that this option cannot be used to replace the root file system of the container with an overlay + file system. However, the option described below provides similar functionality, + with a focus on implementing stateless operating system images. @@ -1074,33 +1085,49 @@ MODE - Boots the container in volatile mode. When no - mode parameter is passed or when mode is specified as - , full volatile mode is enabled. This - means the root directory is mounted as a mostly unpopulated - tmpfs instance, and - /usr from the OS tree is mounted into it - in read-only mode (the system thus starts up with read-only OS - image, but pristine state and configuration, any changes - are lost on shutdown). When the mode parameter - is specified as , the OS tree is - mounted read-only, but /var is mounted as - a tmpfs instance into it (the system thus - starts up with read-only OS resources and configuration, but - pristine state, and any changes to the latter are lost on - shutdown). 
When the mode parameter is specified as - (the default), the whole OS tree is made - available writable. + Boots the container in volatile mode. When no mode parameter is passed or when mode is + specified as , full volatile mode is enabled. This means the root directory is mounted as a + mostly unpopulated tmpfs instance, and /usr/ from the OS tree is + mounted into it in read-only mode (the system thus starts up with read-only OS image, but pristine state and + configuration, any changes are lost on shutdown). When the mode parameter is specified as + , the OS tree is mounted read-only, but /var/ is mounted as a + writable tmpfs instance into it (the system thus starts up with read-only OS resources and + configuration, but pristine state, and any changes to the latter are lost on shutdown). When the mode parameter + is specified as the read-only root file system is combined with a writable + tmpfs instance through overlayfs, so that it appears as it normally + would, but any changes are applied to the temporary file system only and lost when the container is + terminated. When the mode parameter is specified as (the default), the whole OS tree is + made available writable (unless is specified, see above). + + Note that if one of the volatile modes is chosen, its effect is limited to the root file system (or + /var/ in case of ), and any other mounts placed in the hierarchy are + unaffected — regardless if they are established automatically (e.g. the EFI system partition that might be + mounted to /efi/ or /boot/) or explicitly (e.g. through an additional + command line option such as , see above). This means, even if + is used changes to /efi/ or + /boot/ are prohibited in case such a partition exists in the container image operated on, + and even if is used the hypothetical file /etc/foobar is + potentially writable if is used to mount it from outside the read-only + container /etc directory.
+ + The option is closely related to this setting, and provides similar + behaviour by making a temporary, ephemeral copy of the whole OS image and executing that. For further details, + see above. + + The and options provide similar functionality, but + for specific sub-directories of the OS image only. For details, see above. This option provides similar functionality for containers as the systemd.volatile= kernel command line switch provides for host systems. See kernel-command-line7 for details. - Note that enabling this setting will only work correctly with operating systems in the container that can - boot up with only /usr mounted, and are able to automatically populate - /var, and also /etc in case of - --volatile=yes. + Note that setting this option to or will only work correctly + with operating systems in the container that can boot up with only /usr mounted, and are + able to automatically populate /var, and also /etc in case of + --volatile=yes. The option does not require any particular + preparations in the OS, but do note that overlayfs behaviour differs from regular file + systems in a number of ways, and hence compatibility is limited. 
From 13070a70ef8c8a790024698a3b84cb7323a0d0ca Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Sat, 22 Dec 2018 00:51:13 +0100 Subject: [PATCH 18/18] man: document new systemd.volatile=overlay kernel command line option --- man/kernel-command-line.xml | 9 ++++++--- man/systemd-fstab-generator.xml | 22 ++++++++++++++-------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/man/kernel-command-line.xml b/man/kernel-command-line.xml index e39f108d8f..6e791192b2 100644 --- a/man/kernel-command-line.xml +++ b/man/kernel-command-line.xml @@ -137,10 +137,13 @@ enables fully state-less boots were the vendor-supplied OS is used as shipped, with only default configuration and no stored state in effect, as /etc and /var (as well as all other resources shipped in the root file system) are reset at boot and lost on shutdown. If this - setting is set to state the root file system is mounted as usual, however + setting is set to state the root file system is mounted read-only, however /var is mounted as a volatile memory file system (tmpfs), so that the - system boots up with the normal configuration applied, but all state reset at boot and lost at shutdown. For details, - see + system boots up with the normal configuration applied, but all state reset at boot and lost at shutdown. If + this setting is set to overlay the root file system is set up as + overlayfs mount combining the read-only root directory with a writable + tmpfs, so that no modifications are made to disk, but the file system may be modified + nonetheless with all changes being lost at reboot. For details, see systemd-volatile-root.service8 and systemd-fstab-generator8. diff --git a/man/systemd-fstab-generator.xml b/man/systemd-fstab-generator.xml index ab706420c5..7897ce50e9 100644 --- a/man/systemd-fstab-generator.xml +++ b/man/systemd-fstab-generator.xml @@ -175,19 +175,25 @@ lost at shutdown, as /etc and /var will be served from the (initially unpopulated) volatile memory file system. 
- If set to the generator will leave the root - directory mount point unaltered, however will mount a tmpfs file system to - /var. In this mode the normal system configuration (i.e. the contents of - /etc) is in effect (and may be modified during system runtime), however the system state - (i.e. the contents of /var) is reset at boot and lost at shutdown. + If set to the generator will leave the root directory mount point unaltered, + however will mount a tmpfs file system to /var. In this mode the normal + system configuration (i.e. the contents of /etc) is in effect (and may be modified during + system runtime), however the system state (i.e. the contents of /var) is reset at boot and + lost at shutdown. + + If this setting is set to overlay the root file system is set up as + overlayfs mount combining the read-only root directory with a writable + tmpfs, so that no modifications are made to disk, but the file system may be modified + nonetheless with all changes being lost at reboot. Note that in none of these modes the root directory, /etc, /var or any other resources stored in the root file system are physically removed. It's thus safe to boot a system that is normally operated in non-volatile mode temporarily into volatile mode, without losing data. - Note that enabling this setting will only work correctly on operating systems that can boot up with only - /usr mounted, and are able to automatically populate /etc, and also - /var in case of systemd.volatile=yes. + Note that with the exception of overlay mode, enabling this setting will only work + correctly on operating systems that can boot up with only /usr mounted, and are able to + automatically populate /etc, and also /var in case of + systemd.volatile=yes.