diff --git a/man/kernel-command-line.xml b/man/kernel-command-line.xml index e39f108d8f..6e791192b2 100644 --- a/man/kernel-command-line.xml +++ b/man/kernel-command-line.xml @@ -137,10 +137,13 @@ enables fully state-less boots were the vendor-supplied OS is used as shipped, with only default configuration and no stored state in effect, as /etc and /var (as well as all other resources shipped in the root file system) are reset at boot and lost on shutdown. If this - setting is set to state the root file system is mounted as usual, however + setting is set to state the root file system is mounted read-only, however /var is mounted as a volatile memory file system (tmpfs), so that the - system boots up with the normal configuration applied, but all state reset at boot and lost at shutdown. For details, - see + system boots up with the normal configuration applied, but all state reset at boot and lost at shutdown. If + this setting is set to overlay the root file system is set up as + overlayfs mount combining the read-only root directory with a writable + tmpfs, so that no modifications are made to disk, but the file system may be modified + nonetheless with all changes being lost at reboot. For details, see systemd-volatile-root.service8 and systemd-fstab-generator8. diff --git a/man/systemd-fstab-generator.xml b/man/systemd-fstab-generator.xml index ab706420c5..7897ce50e9 100644 --- a/man/systemd-fstab-generator.xml +++ b/man/systemd-fstab-generator.xml @@ -175,19 +175,25 @@ lost at shutdown, as /etc and /var will be served from the (initially unpopulated) volatile memory file system. - If set to the generator will leave the root - directory mount point unaltered, however will mount a tmpfs file system to - /var. In this mode the normal system configuration (i.e. the contents of - /etc) is in effect (and may be modified during system runtime), however the system state - (i.e. the contents of /var) is reset at boot and lost at shutdown. 
+ If set to the generator will leave the root directory mount point unaltered, + however will mount a tmpfs file system to /var. In this mode the normal + system configuration (i.e. the contents of /etc) is in effect (and may be modified during + system runtime), however the system state (i.e. the contents of /var) is reset at boot and + lost at shutdown. + + If this setting is set to overlay the root file system is set up as + overlayfs mount combining the read-only root directory with a writable + tmpfs, so that no modifications are made to disk, but the file system may be modified + nonetheless with all changes being lost at reboot. Note that in none of these modes the root directory, /etc, /var or any other resources stored in the root file system are physically removed. It's thus safe to boot a system that is normally operated in non-volatile mode temporarily into volatile mode, without losing data. - Note that enabling this setting will only work correctly on operating systems that can boot up with only - /usr mounted, and are able to automatically populate /etc, and also - /var in case of systemd.volatile=yes. + Note that with the exception of overlay mode, enabling this setting will only work + correctly on operating systems that can boot up with only /usr mounted, and are able to + automatically populate /etc, and also /var in case of + systemd.volatile=yes. diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index f0a5231acf..5ed49e6587 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -167,9 +167,9 @@ template path refers to the root of a btrfs subvolume, in which case a simple copy-on-write snapshot is taken, and populating the root directory is instant. If the specified template path does not refer to the root of a btrfs subvolume (or not even to a btrfs file system at - all), the tree is copied (though possibly in a copy-on-write scheme — if the file system supports that), which - can be substantially more time-consuming. 
May not be specified together with or - . + all), the tree is copied (though possibly in a 'reflink' copy-on-write scheme — if the file system supports + that), which can be substantially more time-consuming. May not be specified together with + or . Note that this switch leaves host name, machine ID and all other settings that could identify the instance @@ -183,9 +183,16 @@ If specified, the container is run with a temporary snapshot of its file system that is removed immediately when the container terminates. May not be specified together with . - Note that this switch leaves host name, machine ID and - all other settings that could identify the instance - unmodified. + Note that this switch leaves host name, machine ID and all other settings that could identify the + instance unmodified. Please note that — as with — taking the temporary snapshot is + more efficient on file systems that support subvolume snapshots or 'reflinks' natively (btrfs + or new xfs) than on more traditional file systems that do not + (ext4). + + With this option no modifications of the container image are retained. Use + (described below) for other mechanisms to restrict persistency of + container images during runtime. + @@ -899,8 +906,12 @@ - Mount the root file system read-only for the - container. + Mount the container's root file system (and any other file systems contained in the container + image) read-only. This has no effect on additional mounts made with , + and similar options. This mode is implied if the container image file or directory is + marked read-only itself. It is also implied if is used. In this case the container + image on disk is strictly read-only, while changes are permitted but kept non-persistently in memory only. For + further details, see below. @@ -931,20 +942,16 @@ - Mount a tmpfs file system into the container. 
- Takes a single absolute path argument that specifies where to - mount the tmpfs instance to (in which case the directory - access mode will be chosen as 0755, owned by root/root), or - optionally a colon-separated pair of path and mount option - string that is used for mounting (in which case the kernel - default for access mode and owner will be chosen, unless - otherwise specified). This option is particularly useful for - mounting directories such as /var as - tmpfs, to allow state-less systems, in particular when - combined with . - Backslash escapes are interpreted in the path, so - \: may be used to embed colons in the path. - + Mount a tmpfs file system into the container. Takes a single absolute path argument that + specifies where to mount the tmpfs instance to (in which case the directory access mode will be chosen as 0755, + owned by root/root), or optionally a colon-separated pair of path and mount option string that is used for + mounting (in which case the kernel default for access mode and owner will be chosen, unless otherwise + specified). Backslash escapes are interpreted in the path, so \: may be used to embed colons + in the path. + + Note that this option cannot be used to replace the root file system of the container with a temporary + file system. However, the option described below provides similar + functionality, with a focus on implementing stateless operating system images. @@ -1002,7 +1009,11 @@ be on the same file system as the top-most directory tree). Also note that the lowerdir= mount option receives the paths to stack in the opposite order of - this switch. + this switch. + + Note that this option cannot be used to replace the root file system of the container with an overlay + file system. However, the option described below provides similar functionality, + with a focus on implementing stateless operating system images. @@ -1074,33 +1085,49 @@ MODE - Boots the container in volatile mode. 
When no - mode parameter is passed or when mode is specified as - , full volatile mode is enabled. This - means the root directory is mounted as a mostly unpopulated - tmpfs instance, and - /usr from the OS tree is mounted into it - in read-only mode (the system thus starts up with read-only OS - image, but pristine state and configuration, any changes - are lost on shutdown). When the mode parameter - is specified as , the OS tree is - mounted read-only, but /var is mounted as - a tmpfs instance into it (the system thus - starts up with read-only OS resources and configuration, but - pristine state, and any changes to the latter are lost on - shutdown). When the mode parameter is specified as - (the default), the whole OS tree is made - available writable. + Boots the container in volatile mode. When no mode parameter is passed or when mode is + specified as , full volatile mode is enabled. This means the root directory is mounted as a + mostly unpopulated tmpfs instance, and /usr/ from the OS tree is + mounted into it in read-only mode (the system thus starts up with read-only OS image, but pristine state and + configuration, any changes are lost on shutdown). When the mode parameter is specified as + , the OS tree is mounted read-only, but /var/ is mounted as a + writable tmpfs instance into it (the system thus starts up with read-only OS resources and + configuration, but pristine state, and any changes to the latter are lost on shutdown). When the mode parameter + is specified as the read-only root file system is combined with a writable + tmpfs instance through overlayfs, so that it appears as it normally + would, but any changes are applied to the temporary file system only and lost when the container is + terminated. When the mode parameter is specified as (the default), the whole OS tree is + made available writable (unless is specified, see above). 
+ + Note that if one of the volatile modes is chosen, its effect is limited to the root file system (or + /var/ in case of ), and any other mounts placed in the hierarchy are + unaffected — regardless if they are established automatically (e.g. the EFI system partition that might be + mounted to /efi/ or /boot/) or explicitly (e.g. through an additional + command line option such as , see above). This means, even if + is used changes to /efi/ or + /boot/ are prohibited in case such a partition exists in the container image operated on, + and even if is used the hypothetical file /etc/foobar is + potentially writable if is used to mount it from outside the read-only + container /etc directory. + + The option is closely related to this setting, and provides similar + behaviour by making a temporary, ephemeral copy of the whole OS image and executing that. For further details, + see above. + + The and options provide similar functionality, but + for specific sub-directories of the OS image only. For details, see above. This option provides similar functionality for containers as the systemd.volatile= kernel command line switch provides for host systems. See kernel-command-line7 for details. - Note that enabling this setting will only work correctly with operating systems in the container that can - boot up with only /usr mounted, and are able to automatically populate - /var, and also /etc in case of - --volatile=yes. + Note that setting this option to or will only work correctly + with operating systems in the container that can boot up with only /usr mounted, and are + able to automatically populate /var, and also /etc in case of + --volatile=yes. The option does not require any particular + preparations in the OS, but do note that overlayfs behaviour differs from regular file + systems in a number of ways, and hence compatibility is limited. 
diff --git a/src/basic/copy.c b/src/basic/copy.c index 46e02a3759..2f36c8eb87 100644 --- a/src/basic/copy.c +++ b/src/basic/copy.c @@ -743,7 +743,7 @@ int copy_file_fd_full( r = copy_bytes_full(fdf, fdt, (uint64_t) -1, copy_flags, NULL, NULL, progress_bytes, userdata); - (void) copy_times(fdf, fdt); + (void) copy_times(fdf, fdt, copy_flags); (void) copy_xattr(fdf, fdt); return r; @@ -849,10 +849,9 @@ int copy_file_atomic_full( return 0; } -int copy_times(int fdf, int fdt) { +int copy_times(int fdf, int fdt, CopyFlags flags) { struct timespec ut[2]; struct stat st; - usec_t crtime = 0; assert(fdf >= 0); assert(fdt >= 0); @@ -866,8 +865,12 @@ int copy_times(int fdf, int fdt) { if (futimens(fdt, ut) < 0) return -errno; - if (fd_getcrtime(fdf, &crtime) >= 0) - (void) fd_setcrtime(fdt, crtime); + if (FLAGS_SET(flags, COPY_CRTIME)) { + usec_t crtime; + + if (fd_getcrtime(fdf, &crtime) >= 0) + (void) fd_setcrtime(fdt, crtime); + } return 0; } diff --git a/src/basic/copy.h b/src/basic/copy.h index f677021881..a33546d3ab 100644 --- a/src/basic/copy.h +++ b/src/basic/copy.h @@ -14,6 +14,7 @@ typedef enum CopyFlags { COPY_REPLACE = 1 << 2, /* Replace an existing file if there's one */ COPY_SAME_MOUNT = 1 << 3, /* Don't descend recursively into other file systems, across mount point boundaries */ COPY_MERGE_EMPTY = 1 << 4, /* Merge an existing, empty directory with our new tree to copy */ + COPY_CRTIME = 1 << 5, /* Generate a user.crtime_usec xattr off the source crtime if there is one, on copying */ } CopyFlags; typedef int (*copy_progress_bytes_t)(uint64_t n_bytes, void *userdata); @@ -57,5 +58,5 @@ static inline int copy_bytes(int fdf, int fdt, uint64_t max_bytes, CopyFlags cop return copy_bytes_full(fdf, fdt, max_bytes, copy_flags, NULL, NULL, NULL, NULL); } -int copy_times(int fdf, int fdt); +int copy_times(int fdf, int fdt, CopyFlags flags); int copy_xattr(int fdf, int fdt); diff --git a/src/boot/bootctl.c b/src/boot/bootctl.c index a529989ea0..1e0d115fe3 100644 --- 
a/src/boot/bootctl.c +++ b/src/boot/bootctl.c @@ -494,7 +494,7 @@ static int copy_file_with_version_check(const char *from, const char *to, bool f return log_error_errno(r, "Failed to copy data from \"%s\" to \"%s\": %m", from, t); } - (void) copy_times(fd_from, fd_to); + (void) copy_times(fd_from, fd_to, 0); if (fsync(fd_to) < 0) { (void) unlink_noerrno(t); diff --git a/src/fstab-generator/fstab-generator.c b/src/fstab-generator/fstab-generator.c index 30a6d356d0..d1bfa775e4 100644 --- a/src/fstab-generator/fstab-generator.c +++ b/src/fstab-generator/fstab-generator.c @@ -722,10 +722,11 @@ static int add_sysroot_usr_mount(void) { } static int add_volatile_root(void) { - /* Let's add in systemd-remount-volatile.service which will remount the root device to tmpfs if this is - * requested, leaving only /usr from the root mount inside. */ - if (arg_volatile_mode != VOLATILE_YES) + /* Let's add in systemd-remount-volatile.service which will remount the root device to tmpfs if this is + * requested (or as an overlayfs), leaving only /usr from the root mount inside. 
*/ + + if (!IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY)) return 0; return generator_add_symlink(arg_dest, SPECIAL_INITRD_ROOT_FS_TARGET, "requires", diff --git a/src/gpt-auto-generator/gpt-auto-generator.c b/src/gpt-auto-generator/gpt-auto-generator.c index 2f8ccd025b..0f1e184eea 100644 --- a/src/gpt-auto-generator/gpt-auto-generator.c +++ b/src/gpt-auto-generator/gpt-auto-generator.c @@ -18,6 +18,7 @@ #include "efivars.h" #include "fd-util.h" #include "fileio.h" +#include "fs-util.h" #include "fstab-util.h" #include "generator.h" #include "gpt.h" @@ -533,7 +534,7 @@ static int add_root_rw(DissectedPartition *p) { return 0; } -static int open_parent(dev_t devnum, int *ret) { +static int open_parent_devno(dev_t devnum, int *ret) { _cleanup_(sd_device_unrefp) sd_device *d = NULL; const char *name, *devtype, *node; sd_device *parent; @@ -601,7 +602,7 @@ static int enumerate_partitions(dev_t devnum) { _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL; int r, k; - r = open_parent(devnum, &fd); + r = open_parent_devno(devnum, &fd); if (r <= 0) return r; @@ -763,8 +764,25 @@ static int add_mounts(void) { if (r < 0) return log_error_errno(r, "Failed to determine block device of /usr file system: %m"); if (r == 0) { - log_debug("Neither root nor /usr file system are on a (single) block device."); - return 0; + _cleanup_free_ char *p = NULL; + mode_t m; + + /* If the root mount has been replaced by some form of volatile file system (overlayfs), the + * original root block device node is symlinked in /run/systemd/volatile-root. Let's read that + * here. 
*/ + r = readlink_malloc("/run/systemd/volatile-root", &p); + if (r == -ENOENT) { + log_debug("Neither root nor /usr file system are on a (single) block device."); + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to read symlink /run/systemd/volatile-root: %m"); + + r = device_path_parse_major_minor(p, &m, &devno); + if (r < 0) + return log_error_errno(r, "Failed to parse major/minor device node: %m"); + if (!S_ISBLK(m)) + return log_error_errno(SYNTHETIC_ERRNO(ENOTBLK), "Volatile root device is of wrong type."); } } diff --git a/src/import/export-raw.c b/src/import/export-raw.c index 6a02b47a17..c1c946cd2b 100644 --- a/src/import/export-raw.c +++ b/src/import/export-raw.c @@ -223,7 +223,7 @@ static int raw_export_process(RawExport *e) { finish: if (r >= 0) { - (void) copy_times(e->input_fd, e->output_fd); + (void) copy_times(e->input_fd, e->output_fd, COPY_CRTIME); (void) copy_xattr(e->input_fd, e->output_fd); } diff --git a/src/import/import-raw.c b/src/import/import-raw.c index 4b1161557d..56f3431a08 100644 --- a/src/import/import-raw.c +++ b/src/import/import-raw.c @@ -215,7 +215,7 @@ static int raw_import_finish(RawImport *i) { return r; if (S_ISREG(i->st.st_mode)) { - (void) copy_times(i->input_fd, i->output_fd); + (void) copy_times(i->input_fd, i->output_fd, COPY_CRTIME); (void) copy_xattr(i->input_fd, i->output_fd); } diff --git a/src/import/pull-raw.c b/src/import/pull-raw.c index 3a3e015df8..72b9054e49 100644 --- a/src/import/pull-raw.c +++ b/src/import/pull-raw.c @@ -368,7 +368,7 @@ static int raw_pull_make_local_copy(RawPull *i) { return log_error_errno(r, "Failed to make writable copy of image: %m"); } - (void) copy_times(i->raw_job->disk_fd, dfd); + (void) copy_times(i->raw_job->disk_fd, dfd, COPY_CRTIME); (void) copy_xattr(i->raw_job->disk_fd, dfd); dfd = safe_close(dfd); diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index a9af889747..eb0a26ef35 100644 --- a/src/nspawn/nspawn-mount.c +++ 
b/src/nspawn/nspawn-mount.c @@ -212,6 +212,8 @@ int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) if (!path_is_absolute(destination)) return -EINVAL; + if (empty_or_root(destination)) + return -EINVAL; m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND); if (!m) @@ -251,6 +253,8 @@ int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) { if (!path_is_absolute(path)) return -EINVAL; + if (empty_or_root(path)) + return -EINVAL; m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS); if (!m) @@ -310,6 +314,9 @@ int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_onl return -EINVAL; } + if (empty_or_root(destination)) + return -EINVAL; + m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY); if (!m) return -ENOMEM; @@ -849,9 +856,8 @@ int mount_custom( return 0; } -int setup_volatile_state( +static int setup_volatile_state( const char *directory, - VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context) { @@ -861,11 +867,7 @@ int setup_volatile_state( assert(directory); - if (mode != VOLATILE_STATE) - return 0; - - /* --volatile=state means we simply overmount /var - with a tmpfs, and the rest read-only. */ + /* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */ r = bind_remount_recursive(directory, true, NULL); if (r < 0) @@ -886,9 +888,8 @@ int setup_volatile_state( return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options); } -int setup_volatile( +static int setup_volatile_yes( const char *directory, - VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context) { @@ -900,11 +901,8 @@ int setup_volatile( assert(directory); - if (mode != VOLATILE_YES) - return 0; - - /* --volatile=yes means we mount a tmpfs to the root dir, and - the original /usr to use inside it, and that read-only. 
*/ + /* --volatile=yes means we mount a tmpfs to the root dir, and the original /usr to use inside it, and that + read-only. */ if (!mkdtemp(template)) return log_error_errno(errno, "Failed to create temporary directory: %m"); @@ -912,7 +910,7 @@ int setup_volatile( options = "mode=755"; r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf); if (r < 0) - return log_oom(); + goto fail; if (r > 0) options = buf; @@ -961,6 +959,93 @@ fail: return r; } +static int setup_volatile_overlay( + const char *directory, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context) { + + _cleanup_free_ char *buf = NULL, *escaped_directory = NULL, *escaped_upper = NULL, *escaped_work = NULL; + char template[] = "/tmp/nspawn-volatile-XXXXXX"; + const char *upper, *work, *options; + bool tmpfs_mounted = false; + int r; + + assert(directory); + + /* --volatile=overlay means we mount an overlayfs to the root dir. */ + + if (!mkdtemp(template)) + return log_error_errno(errno, "Failed to create temporary directory: %m"); + + options = "mode=755"; + r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf); + if (r < 0) + goto finish; + if (r > 0) + options = buf; + + r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options); + if (r < 0) + goto finish; + + tmpfs_mounted = true; + + upper = strjoina(template, "/upper"); + work = strjoina(template, "/work"); + + if (mkdir(upper, 0755) < 0) { + r = log_error_errno(errno, "Failed to create %s: %m", upper); + goto finish; + } + if (mkdir(work, 0755) < 0) { + r = log_error_errno(errno, "Failed to create %s: %m", work); + goto finish; + } + + /* And now, let's overmount the root dir with an overlayfs that uses the root dir as lower dir. It's kinda nice + * that the kernel allows us to do that without going through some mount point rearrangements. 
*/ + + escaped_directory = shell_escape(directory, ",:"); + escaped_upper = shell_escape(upper, ",:"); + escaped_work = shell_escape(work, ",:"); + if (!escaped_directory || !escaped_upper || !escaped_work) { + r = -ENOMEM; + goto finish; + } + + options = strjoina("lowerdir=", escaped_directory, ",upperdir=", escaped_upper, ",workdir=", escaped_work); + r = mount_verbose(LOG_ERR, "overlay", directory, "overlay", 0, options); + +finish: + if (tmpfs_mounted) + (void) umount_verbose(template); + + (void) rmdir(template); + return r; +} + +int setup_volatile_mode( + const char *directory, + VolatileMode mode, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context) { + + switch (mode) { + + case VOLATILE_YES: + return setup_volatile_yes(directory, userns, uid_shift, uid_range, selinux_apifs_context); + + case VOLATILE_STATE: + return setup_volatile_state(directory, userns, uid_shift, uid_range, selinux_apifs_context); + + case VOLATILE_OVERLAY: + return setup_volatile_overlay(directory, userns, uid_shift, uid_range, selinux_apifs_context); + + default: + return 0; + } +} + /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. 
*/ int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) { _cleanup_free_ char *root_new = NULL, *root_old = NULL; diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h index 8051a7d9d9..e060ca0e4d 100644 --- a/src/nspawn/nspawn-mount.h +++ b/src/nspawn/nspawn-mount.h @@ -49,8 +49,7 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings); int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); -int setup_volatile(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); -int setup_volatile_state(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); +int setup_volatile_mode(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s); int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index e0c2d711e6..5cb049e5f7 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -1308,6 +1308,9 @@ static int verify_arguments(void) { if (arg_start_mode == START_BOOT && arg_kill_signal <= 0) arg_kill_signal = SIGRTMIN+3; + if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */ + arg_read_only = true; + if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) /* Save the user from accidentally registering either user-$SESSION.scope or user@.service. * The latter is not technically a user session, but we don't need to labour the point. 
*/ @@ -1334,6 +1337,12 @@ static int verify_arguments(void) { if (arg_userns_chown && arg_read_only) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--read-only and --private-users-chown may not be combined."); + /* We don't support --private-users-chown together with any of the volatile modes since we couldn't + * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive + * copy-up (in case of overlay) making the entire excercise pointless. */ + if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined."); + /* If --network-namespace-path is given with any other network-related option, * we need to error out, to avoid conflicts between different network options. */ if (arg_network_namespace_path && @@ -1352,9 +1361,6 @@ static int verify_arguments(void) { if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts."); - if (arg_volatile_mode != VOLATILE_NO && arg_read_only) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --read-only with --volatile. 
Note that --volatile already implies a read-only base hierarchy."); - if (arg_expose_ports && !arg_private_network) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking."); @@ -1420,6 +1426,10 @@ static const char *timezone_from_path(const char *path) { "/usr/share/zoneinfo/"); } +static bool etc_writable(void) { + return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY); +} + static int setup_timezone(const char *dest) { _cleanup_free_ char *p = NULL, *etc = NULL; const char *where, *check; @@ -1431,9 +1441,9 @@ static int setup_timezone(const char *dest) { if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) { r = readlink_malloc("/etc/localtime", &p); if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO) - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_OFF : TIMEZONE_DELETE; + m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF; else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */ - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_COPY; + m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND; else if (r < 0) { log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m"); /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data @@ -1444,7 +1454,7 @@ static int setup_timezone(const char *dest) { */ return 0; } else if (arg_timezone == TIMEZONE_AUTO) - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_SYMLINK; + m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND; else m = arg_timezone; } else @@ -1606,11 +1616,11 @@ static int setup_resolv_conf(const char *dest) { if (arg_private_network) m = RESOLV_CONF_OFF; else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0) - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? 
RESOLV_CONF_BIND_STATIC : RESOLV_CONF_COPY_STATIC; + m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC; else if (have_resolv_conf("/etc/resolv.conf") > 0) - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_HOST : RESOLV_CONF_COPY_HOST; + m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST; else - m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_OFF : RESOLV_CONF_DELETE; + m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF; } else m = arg_resolv_conf; @@ -2896,6 +2906,30 @@ static int outer_child( "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); } + if (!dissected_image) { + /* Turn directory into bind mount */ + r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + } + + r = setup_pivot_root( + directory, + arg_pivot_root_new, + arg_pivot_root_old); + if (r < 0) + return r; + + r = setup_volatile_mode( + directory, + arg_volatile_mode, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_context); + if (r < 0) + return r; + if (dissected_image) { /* Now we know the uid shift, let's now mount everything else that might be in the image. 
*/ r = dissected_image_mount(dissected_image, directory, arg_uid_shift, @@ -2921,38 +2955,6 @@ static int outer_child( unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket); } - /* Turn directory into bind mount */ - r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL); - if (r < 0) - return r; - - r = setup_pivot_root( - directory, - arg_pivot_root_new, - arg_pivot_root_old); - if (r < 0) - return r; - - r = setup_volatile( - directory, - arg_volatile_mode, - arg_userns_mode != USER_NAMESPACE_NO, - arg_uid_shift, - arg_uid_range, - arg_selinux_context); - if (r < 0) - return r; - - r = setup_volatile_state( - directory, - arg_volatile_mode, - arg_userns_mode != USER_NAMESPACE_NO, - arg_uid_shift, - arg_uid_range, - arg_selinux_context); - if (r < 0) - return r; - /* Mark everything as shared so our mounts get propagated down. This is * required to make new bind mounts available in systemd services * inside the containter that create a new mount namespace. @@ -2971,7 +2973,7 @@ static int outer_child( if (r < 0) return r; - if (arg_read_only) { + if (arg_read_only && arg_volatile_mode == VOLATILE_NO) { r = bind_remount_recursive(directory, true, NULL); if (r < 0) return log_error_errno(r, "Failed to make tree read-only: %m"); @@ -4398,7 +4400,7 @@ int main(int argc, char *argv[]) { goto finish; } - r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK); + r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 
0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME); if (r < 0) { r = log_error_errno(r, "Failed to copy image file: %m"); goto finish; diff --git a/src/shared/machine-image.c b/src/shared/machine-image.c index af06ab22e8..3d61221056 100644 --- a/src/shared/machine-image.c +++ b/src/shared/machine-image.c @@ -870,7 +870,7 @@ int image_clone(Image *i, const char *new_name, bool read_only) { case IMAGE_RAW: new_path = strjoina("/var/lib/machines/", new_name, ".raw"); - r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, FS_NOCOW_FL, COPY_REFLINK); + r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME); break; case IMAGE_BLOCK: diff --git a/src/shared/volatile-util.c b/src/shared/volatile-util.c index 4d75bc0e96..5ca6ab3376 100644 --- a/src/shared/volatile-util.c +++ b/src/shared/volatile-util.c @@ -12,33 +12,35 @@ int query_volatile_mode(VolatileMode *ret) { _cleanup_free_ char *mode = NULL; - VolatileMode m = VOLATILE_NO; int r; r = proc_cmdline_get_key("systemd.volatile", PROC_CMDLINE_VALUE_OPTIONAL, &mode); if (r < 0) return r; - if (r == 0) - goto finish; + if (r == 0) { + *ret = VOLATILE_NO; + return 0; + } if (mode) { + VolatileMode m; + m = volatile_mode_from_string(mode); if (m < 0) return -EINVAL; + + *ret = m; } else - m = VOLATILE_YES; + *ret = VOLATILE_YES; - r = 1; - -finish: - *ret = m; - return r; + return 1; } static const char* const volatile_mode_table[_VOLATILE_MODE_MAX] = { [VOLATILE_NO] = "no", [VOLATILE_YES] = "yes", [VOLATILE_STATE] = "state", + [VOLATILE_OVERLAY] = "overlay", }; DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(volatile_mode, VolatileMode, VOLATILE_YES); diff --git a/src/shared/volatile-util.h b/src/shared/volatile-util.h index 8761c44ab8..2d31bb1174 100644 --- a/src/shared/volatile-util.h +++ b/src/shared/volatile-util.h @@ -5,6 +5,7 @@ typedef enum VolatileMode { VOLATILE_NO, VOLATILE_YES, VOLATILE_STATE, + VOLATILE_OVERLAY, _VOLATILE_MODE_MAX, _VOLATILE_MODE_INVALID = -1 
} VolatileMode; diff --git a/src/volatile-root/volatile-root.c b/src/volatile-root/volatile-root.c index 5da9ce1681..701f5a2832 100644 --- a/src/volatile-root/volatile-root.c +++ b/src/volatile-root/volatile-root.c @@ -3,6 +3,8 @@ #include #include "alloc-util.h" +#include "blockdev-util.h" +#include "escape.h" #include "fs-util.h" #include "main-func.h" #include "mkdir.h" @@ -17,20 +19,7 @@ static int make_volatile(const char *path) { _cleanup_free_ char *old_usr = NULL; int r; - r = path_is_mount_point(path, NULL, AT_SYMLINK_FOLLOW); - if (r < 0) - return log_error_errno(r, "Couldn't determine whether %s is a mount point: %m", path); - if (r == 0) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), - "%s is not a mount point.", path); - - r = path_is_temporary_fs(path); - if (r < 0) - return log_error_errno(r, "Couldn't determine whether %s is a temporary file system: %m", path); - if (r > 0) { - log_info("%s already is a temporary file system.", path); - return 0; - } + assert(path); r = chase_symlinks("/usr", path, CHASE_PREFIX_ROOT, &old_usr); if (r < 0) @@ -45,7 +34,7 @@ static int make_volatile(const char *path) { goto finish_rmdir; if (mkdir("/run/systemd/volatile-sysroot/usr", 0755) < 0) { - r = -errno; + r = log_error_errno(errno, "Failed to create /usr directory: %m"); goto finish_umount; } @@ -54,8 +43,10 @@ static int make_volatile(const char *path) { goto finish_umount; r = bind_remount_recursive("/run/systemd/volatile-sysroot/usr", true, NULL); - if (r < 0) + if (r < 0) { + log_error_errno(r, "Failed to remount /usr read-only: %m"); goto finish_umount; + } r = umount_recursive(path, 0); if (r < 0) { @@ -64,7 +55,7 @@ static int make_volatile(const char *path) { } if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) - log_warning_errno(errno, "Failed to remount %s MS_SLAVE|MS_REC: %m", path); + log_warning_errno(errno, "Failed to remount %s MS_SLAVE|MS_REC, ignoring: %m", path); r = mount_verbose(LOG_ERR, "/run/systemd/volatile-sysroot", path, NULL, 
MS_MOVE, NULL); @@ -77,9 +68,55 @@ finish_rmdir: return r; } +static int make_overlay(const char *path) { + _cleanup_free_ char *escaped_path = NULL; + bool tmpfs_mounted = false; + const char *options = NULL; + int r; + + assert(path); + + r = mkdir_p("/run/systemd/overlay-sysroot", 0700); + if (r < 0) + return log_error_errno(r, "Couldn't create overlay sysroot directory: %m"); + + r = mount_verbose(LOG_ERR, "tmpfs", "/run/systemd/overlay-sysroot", "tmpfs", MS_STRICTATIME, "mode=755"); + if (r < 0) + goto finish; + + tmpfs_mounted = true; + + if (mkdir("/run/systemd/overlay-sysroot/upper", 0755) < 0) { + r = log_error_errno(errno, "Failed to create /run/systemd/overlay-sysroot/upper: %m"); + goto finish; + } + + if (mkdir("/run/systemd/overlay-sysroot/work", 0755) < 0) { + r = log_error_errno(errno, "Failed to create /run/systemd/overlay-sysroot/work: %m"); + goto finish; + } + + escaped_path = shell_escape(path, ",:"); + if (!escaped_path) { + r = log_oom(); + goto finish; + } + + options = strjoina("lowerdir=", escaped_path, ",upperdir=/run/systemd/overlay-sysroot/upper,workdir=/run/systemd/overlay-sysroot/work"); + r = mount_verbose(LOG_ERR, "overlay", path, "overlay", 0, options); + +finish: + if (tmpfs_mounted) + (void) umount_verbose("/run/systemd/overlay-sysroot"); + + (void) rmdir("/run/systemd/overlay-sysroot"); + return r; +} + static int run(int argc, char *argv[]) { VolatileMode m = _VOLATILE_MODE_INVALID; const char *path; + dev_t devt; int r; log_setup_service(); @@ -94,10 +131,8 @@ static int run(int argc, char *argv[]) { if (r == 0 && argc >= 2) { /* The kernel command line always wins. However if nothing was set there, the argument passed here wins instead. 
*/ m = volatile_mode_from_string(argv[1]); - if (m < 0) { - log_error("Couldn't parse volatile mode: %s", argv[1]); - r = -EINVAL; - } + if (m < 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Couldn't parse volatile mode: %s", argv[1]); } if (argc < 3) @@ -116,10 +151,47 @@ static int run(int argc, char *argv[]) { "Directory cannot be the root directory."); } - if (m != VOLATILE_YES) + if (!IN_SET(m, VOLATILE_YES, VOLATILE_OVERLAY)) return 0; - return make_volatile(path); + r = path_is_mount_point(path, NULL, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Couldn't determine whether %s is a mount point: %m", path); + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "%s is not a mount point.", path); + + r = path_is_temporary_fs(path); + if (r < 0) + return log_error_errno(r, "Couldn't determine whether %s is a temporary file system: %m", path); + if (r > 0) { + log_info("%s already is a temporary file system.", path); + return 0; + } + + /* We are about to replace the root directory with something else. Later code might want to know what we + * replaced here, hence let's save that information as a symlink we can later use. (This is particularly + * relevant for the overlayfs case where we'll fully obstruct the view onto the underlying device, hence + * querying the backing device node from the file system directly is no longer possible.)
*/ + r = get_block_device_harder(path, &devt); + if (r < 0) + return log_error_errno(r, "Failed to determine device major/minor of %s: %m", path); + else if (r > 0) { + _cleanup_free_ char *dn = NULL; + + r = device_path_make_major_minor(S_IFBLK, devt, &dn); + if (r < 0) + return log_error_errno(r, "Failed to format device node path: %m"); + + if (symlink(dn, "/run/systemd/volatile-root") < 0) + log_warning_errno(errno, "Failed to create symlink /run/systemd/volatile-root: %m"); + } + + if (m == VOLATILE_YES) + return make_volatile(path); + else { + assert(m == VOLATILE_OVERLAY); + return make_overlay(path); + } } DEFINE_MAIN_FUNCTION(run);