From 69338c3dfb13d5f5e2d8b7f66f785daab8cbe190 Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Thu, 30 Aug 2018 00:32:54 +0100 Subject: [PATCH 1/4] namespace: don't try to remount superblocks We can't remount the underlying superblocks, if we are inside a user namespace and running Linux <= 4.17. We can only change the per-mount flags (MS_REMOUNT | MS_BIND). This type of mount() call can only change the per-mount flags, so we don't have to worry about passing the right string options now. Fixes #9914 ("Since 1beab8b was merged, systemd has been failing to start systemd-resolved inside unprivileged containers" ... "Failed to re-mount '/run/systemd/unit-root/dev' read-only: Operation not permitted"). > It's basically my fault :-). I pointed out we could remount read-only > without MS_BIND when reviewing the PR that added TemporaryFilesystem=, > and poettering suggested to change PrivateDevices= at the same time. > I think it's safe to change back, and I don't expect anyone will notice > a difference in behaviour. > > It just surprised me to realize that > `TemporaryFilesystem=/tmp:size=10M,ro,nosuid` would not apply `ro` to the > superblock (underlying filesystem), like mount -osize=10M,ro,nosuid does. > Maybe a comment could note the kernel version (v4.18), that lets you > remount without MS_BIND inside a user namespace. This makes the code longer and I guess this function is still ugly, sorry. One obstacle to cleaning it up is the interaction between `PrivateDevices=yes` and `ReadOnlyPaths=/dev`. I've added a test for the existing behaviour, which I think is now the correct behaviour. --- src/core/namespace.c | 19 +++++++++++++------ test/test-execute/exec-readonlypaths.service | 6 ++++-- .../exec-temporaryfilesystem-options.service | 3 +-- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index c164bc5793..c3bbb40680 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1026,6 +1026,15 @@ static int apply_mount( return 0; } +/* Change the per-mount readonly flag on an existing mount */ +static int remount_bind_readonly(const char *path, unsigned long orig_flags) { + int r; + + r = mount(NULL, path, NULL, MS_REMOUNT | MS_BIND | MS_RDONLY | orig_flags, NULL); + + return r < 0 ? -errno : 0; +} + static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self_mountinfo) { bool submounts = false; int r = 0; @@ -1035,17 +1044,15 @@ static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self if (mount_entry_read_only(m)) { if (IN_SET(m->mode, EMPTY_DIR, TMPFS)) { - /* Make superblock readonly */ - if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT | MS_RDONLY | m->flags, mount_entry_options(m)) < 0) - r = -errno; + r = remount_bind_readonly(mount_entry_path(m), m->flags); } else { submounts = true; r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), true, blacklist, proc_self_mountinfo); } } else if (m->mode == PRIVATE_DEV) { - /* Superblock can be readonly but the submounts can't */ - if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0) - r = -errno; + /* Set /dev readonly, but not submounts like /dev/shm. Also, we only set the per-mount read-only flag. + * We can't set it on the superblock, if we are inside a user namespace and running Linux <= 4.17. */ + r = remount_bind_readonly(mount_entry_path(m), DEV_MOUNT_OPTIONS); } else return 0; diff --git a/test/test-execute/exec-readonlypaths.service b/test/test-execute/exec-readonlypaths.service index 6866fdc700..a0ca68f67d 100644 --- a/test/test-execute/exec-readonlypaths.service +++ b/test/test-execute/exec-readonlypaths.service @@ -2,6 +2,8 @@ Description=Test for ReadOnlyPaths= [Service] -ReadOnlyPaths=/etc -/i-dont-exist /usr -ExecStart=/bin/sh -x -c 'test ! -w /etc && test ! -w /usr && test ! -e /i-dont-exist && test -w /var' +ReadOnlyPaths=/usr /etc /sys /dev -/i-dont-exist +PrivateDevices=yes +ExecStart=/bin/sh -x -c 'test ! -w /usr && test ! -w /etc && test ! -w /sys && test ! -w /sys/fs/cgroup' +ExecStart=/bin/sh -x -c 'test ! -w /dev && test ! -w /dev/shm && test ! -e /i-dont-exist && test -w /var' Type=oneshot diff --git a/test/test-execute/exec-temporaryfilesystem-options.service b/test/test-execute/exec-temporaryfilesystem-options.service index b7a5baf93a..371e5674b1 100644 --- a/test/test-execute/exec-temporaryfilesystem-options.service +++ b/test/test-execute/exec-temporaryfilesystem-options.service @@ -5,11 +5,10 @@ Description=Test for TemporaryFileSystem with mount options Type=oneshot # The mount options default to "mode=0755,nodev,strictatime". -# Let's override some of them, and test the behaviour of "ro". +# Let's override some of them, and test "ro". TemporaryFileSystem=/var:ro,mode=0700,nostrictatime # Check /proc/self/mountinfo -ExecStart=/bin/sh -x -c 'test "$$(awk \'$$5 == "/var" && $$11 !~ /(^|,)ro(,|$$)/ { print $$6 }\' /proc/self/mountinfo)" = ""' ExecStart=/bin/sh -x -c 'test "$$(awk \'$$5 == "/var" && $$11 !~ /(^|,)mode=700(,|$$)/ { print $$6 }\' /proc/self/mountinfo)" = ""' ExecStart=/bin/sh -x -c 'test "$$(awk \'$$5 == "/var" && $$6 !~ /(^|,)ro(,|$$)/ { print $$6 }\' /proc/self/mountinfo)" = ""' From ad8e66dcc485425da925a9ef99e911c6e9d8ee5b Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Wed, 29 Aug 2018 22:36:37 +0100 Subject: [PATCH 2/4] namespace: fix mode for TemporaryFileSystem= ... when no mount options are passed. Change the code, to avoid the following failure in the newly added tests: exec-temporaryfilesystem-rw.service: Executing: /usr/bin/sh -x -c '[ "$(stat -c %a /var)" == 755 ]' ++ stat -c %a /var + '[' 1777 == 755 ']' Received SIGCHLD from PID 30364 (sh). Child 30364 (sh) died (code=exited, status=1/FAILURE) (And I spotted an opportunity to use TAKE_PTR() at the end). --- src/core/namespace.c | 26 ++++++++----------- .../exec-temporaryfilesystem-ro.service | 3 +++ .../exec-temporaryfilesystem-rw.service | 3 +++ 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index c3bbb40680..c7ccaa5192 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -304,7 +304,7 @@ static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, for (i = 0; i < n; i++) { const TemporaryFileSystem *t = tmpfs + i; _cleanup_free_ char *o = NULL, *str = NULL; - unsigned long flags = MS_NODEV|MS_STRICTATIME; + unsigned long flags; bool ro = false; if (!path_is_absolute(t->path)) { @@ -312,29 +312,25 @@ static int append_tmpfs_mounts(MountEntry **p, const TemporaryFileSystem *tmpfs, return -EINVAL; } - if (!isempty(t->options)) { - str = strjoin("mode=0755,", t->options); - if (!str) - return -ENOMEM; + str = strjoin("mode=0755,", t->options); + if (!str) + return -ENOMEM; - r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o); - if (r < 0) - return log_debug_errno(r, "Failed to parse mount option '%s': %m", str); + r = mount_option_mangle(str, MS_NODEV|MS_STRICTATIME, &flags, &o); + if (r < 0) + return log_debug_errno(r, "Failed to parse mount option '%s': %m", str); - ro = flags & MS_RDONLY; - if (ro) - flags ^= MS_RDONLY; - } + ro = flags & MS_RDONLY; + if (ro) + flags ^= MS_RDONLY; *((*p)++) = (MountEntry) { .path_const = t->path, .mode = TMPFS, .read_only = ro, - .options_malloc = o, + .options_malloc = TAKE_PTR(o), .flags = flags, }; - - o = NULL; } return 0; diff --git a/test/test-execute/exec-temporaryfilesystem-ro.service b/test/test-execute/exec-temporaryfilesystem-ro.service index c0e3721a01..c161aecc30 100644 --- a/test/test-execute/exec-temporaryfilesystem-ro.service +++ b/test/test-execute/exec-temporaryfilesystem-ro.service @@ -10,6 +10,9 @@ ExecStart=/bin/sh -c 'test -d /var/test-exec-temporaryfilesystem/rw && test -d / # Check TemporaryFileSystem= are empty ExecStart=/bin/sh -c 'for i in $$(ls -A /var); do test $$i = test-exec-temporaryfilesystem || false; done' +# Check default mode +ExecStart=sh -x -c 'test "$$(stat -c %%a /var)" = "755"' + # Cannot create a file in /var ExecStart=/bin/sh -c '! touch /var/hoge' diff --git a/test/test-execute/exec-temporaryfilesystem-rw.service b/test/test-execute/exec-temporaryfilesystem-rw.service index 379ad066fb..bb830595bc 100644 --- a/test/test-execute/exec-temporaryfilesystem-rw.service +++ b/test/test-execute/exec-temporaryfilesystem-rw.service @@ -10,6 +10,9 @@ ExecStart=test -d /var/test-exec-temporaryfilesystem/rw -a -d /var/test-exec-tem # Check TemporaryFileSystem= are empty ExecStart=sh -c 'for i in $$(ls -A /var); do test $$i = test-exec-temporaryfilesystem || false; done' +# Check default mode +ExecStart=sh -x -c 'test "$$(stat -c %%a /var)" = "755"' + # Create a file in /var ExecStart=touch /var/hoge From 4a756839e6598164fb5a74e631261b1d0589fa11 Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Wed, 29 Aug 2018 23:38:40 +0100 Subject: [PATCH 3/4] namespace: we always use a root_directory now We changed to always setup the new namespace in a separate directory (commit 0722b35) --- src/core/namespace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index c7ccaa5192..0cfe3eb114 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -419,11 +419,7 @@ static int mount_path_compare(const void *a, const void *b) { static int prefix_where_needed(MountEntry *m, size_t n, const char *root_directory) { size_t i; - /* Prefixes all paths in the bind mount table with the root directory if it is specified and the entry needs - * that. */ - - if (!root_directory) - return 0; + /* Prefixes all paths in the bind mount table with the root directory if the entry needs that. */ for (i = 0; i < n; i++) { char *s; From fcac12d15059f5bb2503827ad2002ccbdb0d22dd Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Thu, 30 Aug 2018 00:20:48 +0100 Subject: [PATCH 4/4] namespace: remove redundant .has_prefix=false The MountEntry's added for EMPTY_DIR work very similarly to the TMPFS ones. In both cases, .has_prefix is false. In fact, .has_prefix is false in *all* the MountEntry's we add except for the access mounts (READONLY etc). But EMPTY_DIR stuck out by explicitly setting .has_prefix = false. Let's remove that. --- src/core/namespace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index 0cfe3eb114..7d12d6c330 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -265,7 +265,6 @@ static int append_empty_dir_mounts(MountEntry **p, char **strv) { .path_const = *i, .mode = EMPTY_DIR, .ignore = false, - .has_prefix = false, .read_only = true, .options_const = "mode=755", .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,