nspawn: mount /sys as tmpfs, and then mount only select subdirs of the real sysfs below it

This way we can hide things like /sys/firmware or /sys/hypervisor from
the container, while keeping the device tree around.

While this is a security benefit in itself it also allows us to fix
issue #1277.

Previously we'd mount /sys before creating the user namespace, in order
to be able to mount /sys/fs/cgroup/* beneath it (which resides in it),
which we can only mount outside of the user namespace. To ensure that
the user namespace owns the network namespace we'd set up the network
namespace at the same time as the user namespace. Thus, we'd still see
the /sys/class/net/ from the originating network namespace, even though
we are in our own network namespace now. With this patch, /sys is
mounted before transitioning into the user namespace as tmpfs, so that
we can also mount /sys/fs/cgroup/* into it this early. The directories
such as /sys/class/ are then later added in from the real sysfs from
inside the network and user namespace so that they actually show whatis
available in it.

Fixes #1277
This commit is contained in:
Lennart Poettering 2015-09-30 13:47:28 +02:00
parent 403af78c80
commit d8fc6a000f
3 changed files with 54 additions and 1 deletions

View File

@ -216,6 +216,52 @@ static int tmpfs_patch_options(
return !!buf;
}
int mount_sysfs(const char *dest) {
const char *full, *top, *x;
top = prefix_roota(dest, "/sys");
full = prefix_roota(top, "/full");
(void) mkdir(full, 0755);
if (mount("sysfs", full, "sysfs", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
return log_error_errno(errno, "Failed to mount sysfs to %s: %m", full);
FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
_cleanup_free_ char *from = NULL, *to = NULL;
from = prefix_root(full, x);
if (!from)
return log_oom();
to = prefix_root(top, x);
if (!to)
return log_oom();
(void) mkdir(to, 0755);
if (mount(from, to, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Failed to mount /sys/%s into place: %m", x);
if (mount(NULL, to, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
return log_error_errno(errno, "Failed to mount /sys/%s read-only: %m", x);
}
if (umount(full) < 0)
return log_error_errno(errno, "Failed to unmount %s: %m", full);
if (rmdir(full) < 0)
return log_error_errno(errno, "Failed to remove %s: %m", full);
x = prefix_roota(top, "/fs/kdbus");
(void) mkdir(x, 0755);
if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
return log_error_errno(errno, "Failed to make %s read-only: %m", top);
return 0;
}
int mount_all(const char *dest,
bool use_userns, bool in_userns,
uid_t uid_shift, uid_t uid_range,
@ -235,7 +281,7 @@ int mount_all(const char *dest,
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
{ NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
{ "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
@ -570,6 +616,8 @@ static int mount_legacy_cgroups(
cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
(void) mkdir_p(cgroup_root, 0755);
/* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
if (r < 0)

View File

@ -58,6 +58,7 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s);
int custom_mount_compare(const void *a, const void *b);
int mount_all(const char *dest, bool use_userns, bool in_userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int mount_sysfs(const char *dest);
int mount_cgroups(const char *dest, bool unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int mount_systemd_cgroup_writable(const char *dest, bool unified_requested);

View File

@ -2454,6 +2454,10 @@ static int inner_child(
if (r < 0)
return r;
r = mount_sysfs(NULL);
if (r < 0)
return r;
/* Wait until we are cgroup-ified, so that we
* can mount the right cgroup path writable */
if (!barrier_place_and_sync(barrier)) { /* #3 */