From d8fc6a000fe21b0c1ba27fbfed8b42d00b349a4b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 30 Sep 2015 13:47:28 +0200 Subject: [PATCH] nspawn: mount /sys as tmpfs, and then mount only select subdirs of the real sysfs below it This way we can hide things like /sys/firmware or /sys/hypervisor from the container, while keeping the device tree around. While this is a security benefit in itself it also allows us to fix issue #1277. Previously we'd mount /sys before creating the user namespace, in order to be able to mount /sys/fs/cgroup/* beneath it (which resides in it), which we can only mount outside of the user namespace. To ensure that the user namespace owns the network namespace we'd set up the network namespace at the same time as the user namespace. Thus, we'd still see the /sys/class/net/ from the originating network namespace, even though we are in our own network namespace now. With this patch, /sys is mounted before transitioning into the user namespace as tmpfs, so that we can also mount /sys/fs/cgroup/* into it this early. The directories such as /sys/class/ are then later added in from the real sysfs from inside the network and user namespace so that they actually show whatis available in it. Fixes #1277 --- src/nspawn/nspawn-mount.c | 50 ++++++++++++++++++++++++++++++++++++++- src/nspawn/nspawn-mount.h | 1 + src/nspawn/nspawn.c | 4 ++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 85e81b43fe..3d302ef9ad 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -216,6 +216,52 @@ static int tmpfs_patch_options( return !!buf; } +int mount_sysfs(const char *dest) { + const char *full, *top, *x; + + top = prefix_roota(dest, "/sys"); + full = prefix_roota(top, "/full"); + + (void) mkdir(full, 0755); + + if (mount("sysfs", full, "sysfs", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0) + return log_error_errno(errno, "Failed to mount sysfs to %s: %m", full); + + FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") { + _cleanup_free_ char *from = NULL, *to = NULL; + + from = prefix_root(full, x); + if (!from) + return log_oom(); + + to = prefix_root(top, x); + if (!to) + return log_oom(); + + (void) mkdir(to, 0755); + + if (mount(from, to, NULL, MS_BIND, NULL) < 0) + return log_error_errno(errno, "Failed to mount /sys/%s into place: %m", x); + + if (mount(NULL, to, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0) + return log_error_errno(errno, "Failed to mount /sys/%s read-only: %m", x); + } + + if (umount(full) < 0) + return log_error_errno(errno, "Failed to unmount %s: %m", full); + + if (rmdir(full) < 0) + return log_error_errno(errno, "Failed to remove %s: %m", full); + + x = prefix_roota(top, "/fs/kdbus"); + (void) mkdir(x, 0755); + + if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0) + return log_error_errno(errno, "Failed to make %s read-only: %m", top); + + return 0; +} + int mount_all(const char *dest, bool use_userns, bool in_userns, uid_t uid_shift, uid_t uid_range, @@ -235,7 +281,7 @@ int mount_all(const char *dest, { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true }, { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */ { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */ - { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false }, + { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false }, { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false }, { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, @@ -570,6 +616,8 @@ static int mount_legacy_cgroups( cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); + (void) mkdir_p(cgroup_root, 0755); + /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW); if (r < 0) diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h index da4986add0..54cab87665 100644 --- a/src/nspawn/nspawn-mount.h +++ b/src/nspawn/nspawn-mount.h @@ -58,6 +58,7 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s); int custom_mount_compare(const void *a, const void *b); int mount_all(const char *dest, bool use_userns, bool in_userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); +int mount_sysfs(const char *dest); int mount_cgroups(const char *dest, bool unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); int mount_systemd_cgroup_writable(const char *dest, bool unified_requested); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 9f60f41b98..f4a2e3d9ba 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2454,6 +2454,10 @@ static int inner_child( if (r < 0) return r; + r = mount_sysfs(NULL); + if (r < 0) + return r; + /* Wait until we are cgroup-ified, so that we * can mount the right cgroup path writable */ if (!barrier_place_and_sync(barrier)) { /* #3 */