Merge pull request #7365 from poettering/nspawn-bind-userns

nspawn: document --bind= and --private-users relationship, and make recursive chown()ing safe
This commit is contained in:
Zbigniew Jędrzejewski-Szmek 2017-11-19 14:01:39 +01:00 committed by GitHub
commit 34b3f471f8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 174 additions and 65 deletions

View file

@ -806,7 +806,13 @@
<option>norbind</option> are allowed, controlling whether to create a recursive or a regular bind
mount. Defaults to "rbind". Backslash escapes are interpreted, so <literal>\:</literal> may be used to embed
colons in either path. This option may be specified multiple times for creating multiple independent bind
mount points. The <option>--bind-ro=</option> option creates read-only bind mounts.</para></listitem>
mount points. The <option>--bind-ro=</option> option creates read-only bind mounts.</para>
<para>Note that when this option is used in combination with <option>--private-users</option>, the resulting
mount points will be owned by the <constant>nobody</constant> user. That's because the mount and its files and
directories continue to be owned by the relevant host users and groups, which do not exist in the container,
and thus show up under the wildcard UID 65534 (nobody). If such bind mounts are created, it is recommended to
make them read-only, using <option>--bind-ro=</option>.</para></listitem>
</varlistentry>
<varlistentry>

View file

@ -104,7 +104,6 @@ int rmdir_parents(const char *path, const char *stop) {
return 0;
}
int rename_noreplace(int olddirfd, const char *oldpath, int newdirfd, const char *newpath) {
struct stat buf;
int ret;
@ -809,3 +808,18 @@ int chase_symlinks(const char *path, const char *original_root, unsigned flags,
return exists;
}
int access_fd(int fd, int mode) {
        /* Sized for the "/proc/self/fd/" prefix plus the formatted fd number */
        char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(fd) + 1];

        /* access() counterpart for an already open file descriptor: the check is carried out on the
         * fd's entry below /proc/self/fd/. Returns 0 on success, or the negated errno set by access(). */

        xsprintf(procfs_path, "/proc/self/fd/%i", fd);

        if (access(procfs_path, mode) < 0)
                return -errno;

        return 0;
}

View file

@ -98,3 +98,5 @@ static inline void unlink_and_free(char *p) {
free(p);
}
DEFINE_TRIVIAL_CLEANUP_FUNC(char*, unlink_and_free);
int access_fd(int fd, int mode);

View file

@ -1,25 +1,26 @@
systemd_nspawn_sources = files('''
nspawn.c
nspawn-settings.c
nspawn-settings.h
nspawn-cgroup.c
nspawn-cgroup.h
nspawn-def.h
nspawn-expose-ports.c
nspawn-expose-ports.h
nspawn-mount.c
nspawn-mount.h
nspawn-network.c
nspawn-network.h
nspawn-expose-ports.c
nspawn-expose-ports.h
nspawn-cgroup.c
nspawn-cgroup.h
nspawn-seccomp.c
nspawn-seccomp.h
nspawn-patch-uid.c
nspawn-patch-uid.h
nspawn-register.c
nspawn-register.h
nspawn-seccomp.c
nspawn-seccomp.h
nspawn-settings.c
nspawn-settings.h
nspawn-setuid.c
nspawn-setuid.h
nspawn-stub-pid1.c
nspawn-stub-pid1.h
nspawn-patch-uid.c
nspawn-patch-uid.h
nspawn.c
'''.split())
nspawn_gperf_c = custom_target(

33
src/nspawn/nspawn-def.h Normal file
View file

@ -0,0 +1,33 @@
#pragma once
/***
This file is part of systemd.
Copyright 2017 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <sys/types.h>
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
* UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
* may have their own allocation ranges too. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
/* While we are chown()ing a directory tree, we set the top-level UID base to this "busy" base, so that we can always
* recognize trees we were chown()ing recursively and got interrupted in. UID_BUSY_MASK selects the upper 16 bits of a
* UID/GID, i.e. the part that is compared against UID_BUSY_BASE. */
#define UID_BUSY_BASE ((uid_t) UINT32_C(0xFFFE0000))
#define UID_BUSY_MASK ((uid_t) UINT32_C(0xFFFF0000))

View file

@ -23,13 +23,16 @@
#include <sys/acl.h>
#endif
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/vfs.h>
#include <unistd.h>
#include "acl-util.h"
#include "dirent-util.h"
#include "fd-util.h"
#include "fs-util.h"
#include "missing.h"
#include "nspawn-def.h"
#include "nspawn-patch-uid.h"
#include "stat-util.h"
#include "stdio-util.h"
@ -289,42 +292,44 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift
* user namespaces, however their inodes may relate to host resources or only
* valid in the global user namespace, therefore no patching should be applied.
*/
static int is_fs_fully_userns_compatible(int fd) {
static int is_fs_fully_userns_compatible(const struct statfs *sfs) {

        /* Checks the file system type recorded in *sfs against the list of file systems that must not be
         * UID/GID-patched. Returns 1 if the type is on the list, 0 otherwise. The comparisons have no side
         * effects, hence the order in which they are made is irrelevant. */

        assert(sfs);

        if (F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC))
                return 1;

        if (F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC))
                return 1;

        if (F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC))
                return 1;

        if (F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
            F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC))
                return 1;

        return 0;
}
static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
_cleanup_closedir_ DIR *d = NULL;
bool changed = false;
struct statfs sfs;
int r;
assert(fd >= 0);
if (fstatfs(fd, &sfs) < 0)
return -errno;
return F_TYPE_EQUAL(sfs.f_type, BINFMTFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, CGROUP_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, CGROUP2_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, DEBUGFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, DEVPTS_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, EFIVARFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, HUGETLBFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, MQUEUE_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, PROC_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SECURITYFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, BPF_FS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, TRACEFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC);
}
/* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
* shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
* when we hit procfs, sysfs or some other special file systems. */
static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
bool changed = false;
int r;
assert(fd >= 0);
/* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we
* probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's
* stop the recursion when we hit procfs, sysfs or some other special file systems. */
r = is_fs_fully_userns_compatible(fd);
r = is_fs_fully_userns_compatible(&sfs);
if (r < 0)
goto finish;
if (r > 0) {
@ -332,26 +337,12 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift
goto finish;
}
r = patch_fd(fd, NULL, st, shift);
if (r == -EROFS) {
_cleanup_free_ char *name = NULL;
if (!is_toplevel) {
/* When we hit a ready-only subtree we simply skip it, but log about it. */
(void) fd_get_path(fd, &name);
log_debug("Skippping read-only file or directory %s.", strna(name));
r = 0;
}
goto finish;
}
if (r < 0)
goto finish;
if (r > 0)
changed = true;
/* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
if ((sfs.f_flags & ST_RDONLY) ||
access_fd(fd, W_OK) == -EROFS)
goto read_only;
if (S_ISDIR(st->st_mode)) {
_cleanup_closedir_ DIR *d = NULL;
struct dirent *de;
if (!donate_fd) {
@ -411,7 +402,27 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift
}
}
/* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
* directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
* tree is properly chown()ed already. */
r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
if (r == -EROFS)
goto read_only;
if (r > 0)
changed = true;
r = changed;
goto finish;
read_only:
if (!is_toplevel) {
_cleanup_free_ char *name = NULL;
/* When we hit a read-only subtree we simply skip it, but log about it. */
(void) fd_get_path(fd, &name);
log_debug("Skippping read-only file or directory %s.", strna(name));
r = changed;
}
finish:
if (donate_fd)
@ -437,6 +448,11 @@ static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t rang
goto finish;
}
if (shift == UID_BUSY_BASE) {
r = -EINVAL;
goto finish;
}
if (range != 0x10000) {
/* We only support containers with 16bit UID ranges for the patching logic */
r = -EOPNOTSUPP;
@ -459,6 +475,19 @@ static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t rang
if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0)
return 0;
/* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
* range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
* chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd ever pick for ourselves. */
if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
if (fchown(fd,
UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
(gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
r = -errno;
goto finish;
}
}
return recurse_fd(fd, donate_fd, &st, shift, true);
finish:

View file

@ -1,3 +1,5 @@
#pragma once
/***
This file is part of systemd.

View file

@ -77,6 +77,7 @@
#include "mount-util.h"
#include "netlink-util.h"
#include "nspawn-cgroup.h"
#include "nspawn-def.h"
#include "nspawn-expose-ports.h"
#include "nspawn-mount.h"
#include "nspawn-network.h"
@ -106,12 +107,6 @@
#include "user-util.h"
#include "util.h"
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
* UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
* may have their own allocation ranges too. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
/* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path
* nspawn_notify_socket_path is relative to the container
* the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */

View file

@ -315,6 +315,32 @@ static void test_dot_or_dot_dot(void) {
assert_se(!dot_or_dot_dot("..foo"));
}
static void test_access_fd(void) {
_cleanup_(rmdir_and_freep) char *p = NULL;
_cleanup_close_ int fd = -1;
assert_se(mkdtemp_malloc("/tmp/access-fd.XXXXXX", &p) >= 0);
fd = open(p, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
assert_se(fd >= 0);
assert_se(access_fd(fd, R_OK) >= 0);
assert_se(access_fd(fd, F_OK) >= 0);
assert_se(access_fd(fd, W_OK) >= 0);
assert_se(fchmod(fd, 0000) >= 0);
assert_se(access_fd(fd, F_OK) >= 0);
if (geteuid() == 0) {
assert_se(access_fd(fd, R_OK) >= 0);
assert_se(access_fd(fd, W_OK) >= 0);
} else {
assert_se(access_fd(fd, R_OK) == -EACCES);
assert_se(access_fd(fd, W_OK) == -EACCES);
}
}
int main(int argc, char *argv[]) {
test_unlink_noerrno();
test_get_files_in_directory();
@ -322,6 +348,7 @@ int main(int argc, char *argv[]) {
test_var_tmp();
test_chase_symlinks();
test_dot_or_dot_dot();
test_access_fd();
return 0;
}