namespace: chase symlinks for mounts to set up in userspace

This adds logic to chase symlinks for all mount points that shall be created in
a namespace environment in userspace, instead of leaving this to the kernel.
This has the advantage that we can correctly handle absolute symlinks that
shall be taken relative to a specific root directory. Moreover, we can properly
handle mounts created on symlinked files or directories as we can merge their
mounts as necessary.

(This also drops the "done" flag in the namespace logic. It never actually
worked: it was supposed to permit a partial rollback of the namespace setup,
but that is only mildly useful, as it was never clear in which cases a
rollback would or would not be possible.)

Fixes: #3867
This commit is contained in:
Lennart Poettering 2016-09-24 12:41:30 +02:00 committed by Djalal Harouni
parent 1e4e94c881
commit d944dc9553
5 changed files with 368 additions and 47 deletions

View File

@ -597,3 +597,190 @@ int inotify_add_watch_fd(int fd, int what, uint32_t mask) {
return r;
}
/* Resolves all symlinks, "." and ".." components of "path" in userspace.
 *
 * path:  the path to resolve; a relative path is first made absolute via path_make_absolute_cwd().
 * _root: optional (may be NULL). If set, absolute symlink destinations are followed relative to this
 *        directory instead of "/", and ".." is refused where it would step out of it.
 * ret:   optional (may be NULL). On success receives the newly allocated resolved path, which the caller
 *        must free(). If NULL, the path is only checked for resolvability.
 *
 * Returns 0 on success, or a negative errno-style error:
 *   -ENOMEM  on allocation failure,
 *   -EINVAL  if ".." is applied to the top of the tree or would leave "_root", or if a symlink has an
 *            empty destination,
 *   -ELOOP   if too many symlinks were encountered,
 *   or whatever open()/openat()/fstat() reported via errno (e.g. -ENOENT, -ENOTDIR), or whatever
 *   path_make_absolute_cwd()/readlinkat_malloc() returned. */
int chase_symlinks(const char *path, const char *_root, char **ret) {
        _cleanup_free_ char *buffer = NULL, *done = NULL, *root = NULL;
        _cleanup_close_ int fd = -1;
        unsigned max_follow = 32; /* the 32nd symlink we run into makes us give up and return ELOOP */
        char *todo;
        int r;

        assert(path);

        /* This is a lot like canonicalize_file_name(), but takes an additional "root" parameter, that allows following
         * symlinks relative to a root directory, instead of the root of the host.
         *
         * Note that "root" matters only if we encounter an absolute symlink, it's unused otherwise. Most importantly
         * this means the path parameter passed in is not prefixed by it.
         *
         * Algorithmically this operates on two path buffers: "done" are the components of the path we already
         * processed and resolved symlinks, "." and ".." of. "todo" are the components of the path we still need to
         * process. On each iteration, we move one component from "todo" to "done", processing its special meaning
         * each time. The "todo" path always starts with at least one slash, the "done" path always ends in no
         * slash. We always keep an O_PATH fd to the component we are currently processing, thus keeping lookup races
         * at a minimum. */

        r = path_make_absolute_cwd(path, &buffer);
        if (r < 0)
                return r;

        if (_root) {
                r = path_make_absolute_cwd(_root, &root);
                if (r < 0)
                        return r;
        }

        fd = open("/", O_CLOEXEC|O_NOFOLLOW|O_PATH);
        if (fd < 0)
                return -errno;

        todo = buffer;
        for (;;) {
                _cleanup_free_ char *first = NULL;
                _cleanup_close_ int child = -1;
                struct stat st;
                size_t n, m;

                /* Determine length of first component in the path */
                n = strspn(todo, "/");                  /* The slashes */
                m = n + strcspn(todo + n, "/");         /* The entire length of the component */

                /* Extract the first component (including its leading slashes). */
                first = strndup(todo, m);
                if (!first)
                        return -ENOMEM;

                todo += m;

                /* Just a single slash? Then we reached the end. */
                if (isempty(first) || path_equal(first, "/"))
                        break;

                /* Just a dot? Then let's eat this up. */
                if (path_equal(first, "/."))
                        continue;

                /* Two dots? Then chop off the last bit of what we already found out. */
                if (path_equal(first, "/..")) {
                        _cleanup_free_ char *parent = NULL;
                        int fd_parent;

                        /* Nothing left to chop off: we are at the top already. */
                        if (isempty(done) || path_equal(done, "/"))
                                return -EINVAL;

                        parent = dirname_malloc(done);
                        if (!parent)
                                return -ENOMEM;

                        /* Don't allow this to leave the root dir */
                        if (root &&
                            path_startswith(done, root) &&
                            !path_startswith(parent, root))
                                return -EINVAL;

                        free(done);
                        done = parent;
                        parent = NULL;

                        /* Keep the fd in sync with "done": step up one level. */
                        fd_parent = openat(fd, "..", O_CLOEXEC|O_NOFOLLOW|O_PATH);
                        if (fd_parent < 0)
                                return -errno;

                        safe_close(fd);
                        fd = fd_parent;

                        continue;
                }

                /* Otherwise let's see what this is. "first + n" is the component name without the leading slashes. */
                child = openat(fd, first + n, O_CLOEXEC|O_NOFOLLOW|O_PATH);
                if (child < 0)
                        return -errno;

                if (fstat(child, &st) < 0)
                        return -errno;

                if (S_ISLNK(st.st_mode)) {
                        _cleanup_free_ char *destination = NULL;

                        /* This is a symlink, in this case read the destination. But let's make sure we don't follow
                         * symlinks without bounds. (max_follow is unsigned, hence compare against zero exactly.) */
                        if (--max_follow == 0)
                                return -ELOOP;

                        r = readlinkat_malloc(fd, first + n, &destination);
                        if (r < 0)
                                return r;
                        if (isempty(destination))
                                return -EINVAL;

                        if (path_is_absolute(destination)) {

                                /* An absolute destination. Start the loop from the beginning, but use the root
                                 * directory as base. */

                                safe_close(fd);
                                fd = open(root ?: "/", O_CLOEXEC|O_NOFOLLOW|O_PATH);
                                if (fd < 0)
                                        return -errno;

                                /* The symlink destination becomes the new "todo" buffer. */
                                free(buffer);
                                buffer = destination;
                                destination = NULL;

                                todo = buffer;
                                free(done);

                                /* Note that we do not revalidate the root, we take it as is. */
                                if (isempty(root))
                                        done = NULL;
                                else {
                                        done = strdup(root);
                                        if (!done)
                                                return -ENOMEM;
                                }

                        } else {
                                char *joined;

                                /* A relative destination. If so, this is what we'll prefix what's left to do with what
                                 * we just read, and start the loop again, but remain in the current directory. */

                                joined = strjoin("/", destination, todo, NULL);
                                if (!joined)
                                        return -ENOMEM;

                                /* "todo" pointed into the old buffer, hence release it only after joining. */
                                free(buffer);
                                todo = buffer = joined;
                        }

                        continue;
                }

                /* If this is not a symlink, then let's just add the name we read to what we already verified. */
                if (!done) {
                        done = first;
                        first = NULL;
                } else {
                        if (!strextend(&done, first, NULL))
                                return -ENOMEM;
                }

                /* And iterate again, but go one directory further down. */
                safe_close(fd);
                fd = child;
                child = -1;
        }

        if (!done) {
                /* Special case, turn the empty string into "/", to indicate the root directory. */
                done = strdup("/");
                if (!done)
                        return -ENOMEM;
        }

        /* Hand the result over only if the caller asked for it; otherwise the cleanup handler frees it. */
        if (ret) {
                *ret = done;
                done = NULL;
        }

        return 0;
}

View File

@ -77,3 +77,5 @@ union inotify_event_buffer {
};
int inotify_add_watch_fd(int fd, int what, uint32_t mask);
/* Resolves symlinks, "." and ".." in "path" in userspace. Absolute symlink destinations are followed relative to
 * "_root" if it is non-NULL, and relative to "/" otherwise. On success returns 0 and stores the newly allocated
 * result in *ret; on failure returns a negative errno-style error. */
int chase_symlinks(const char *path, const char *_root, char **ret);

View File

@ -29,6 +29,7 @@
#include "alloc-util.h"
#include "dev-setup.h"
#include "fd-util.h"
#include "fs-util.h"
#include "loopback-setup.h"
#include "missing.h"
#include "mkdir.h"
@ -57,9 +58,9 @@ typedef enum MountMode {
} MountMode;
typedef struct BindMount {
const char *path;
const char *path; /* stack memory, doesn't need to be freed explicitly */
char *chased; /* malloc()ed memory, needs to be freed */
MountMode mode;
bool done;
bool ignore;
} BindMount;
@ -71,7 +72,6 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
STRV_FOREACH(i, strv) {
(*p)->ignore = false;
(*p)->done = false;
if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
(*p)->ignore = true;
@ -360,11 +360,8 @@ static int apply_mount(
* inaccessible path. */
(void) umount_recursive(m->path, 0);
if (lstat(m->path, &target) < 0) {
if (m->ignore && errno == ENOENT)
return 0;
if (lstat(m->path, &target) < 0)
return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path);
}
what = mode_to_inaccessible_node(target.st_mode);
if (!what) {
@ -378,11 +375,8 @@ static int apply_mount(
case READWRITE:
r = path_is_mount_point(m->path, 0);
if (r < 0) {
if (m->ignore && errno == ENOENT)
return 0;
if (r < 0)
return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path);
}
if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
return 0;
@ -407,12 +401,8 @@ static int apply_mount(
assert(what);
if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0) {
if (m->ignore && errno == ENOENT)
return 0;
if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0)
return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path);
}
log_debug("Successfully mounted %s to %s", what, m->path);
return 0;
@ -435,12 +425,43 @@ static int make_read_only(BindMount *m, char **blacklist) {
* already stays this way. This improves compatibility with container managers, where we won't attempt to undo
* read-only mounts already applied. */
if (m->ignore && r == -ENOENT)
return 0;
return r;
}
/* Resolves the path of each of the *n entries in "m" via chase_symlinks(), relative to "root_directory".
 *
 * Entries whose path does not exist (-ENOENT) and that are marked "ignore" are dropped; the remaining entries are
 * compacted to the front of the array and *n is updated to the new count. For entries whose resolved path differs
 * from the configured one, ->path is redirected to the malloc()ed ->chased string.
 *
 * Returns 0 on success, or the negative error of the first chase_symlinks() call that failed. In the error case *n
 * is left untouched. */
static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned *n) {
        BindMount *f, *t;
        int r;

        assert(m);
        assert(n);

        /* Since mount() will always follow symlinks and we need to take the different root directory into account we
         * chase the symlinks on our own first. This call will do so for all entries and remove all entries where we
         * can't resolve the path, and which have been marked for such removal. */

        for (f = m, t = m; f < m+*n; f++) {

                r = chase_symlinks(f->path, root_directory, &f->chased);
                if (r == -ENOENT && f->ignore) /* Doesn't exist? Then remove it! */
                        continue;
                if (r < 0)
                        return log_debug_errno(r, "Failed to chase symlinks for %s: %m", f->path);

                if (path_equal(f->path, f->chased))
                        f->chased = mfree(f->chased); /* Nothing changed, no need to keep the copy around */
                else {
                        log_debug("Chased %s → %s", f->path, f->chased);
                        f->path = f->chased;
                }

                if (t != f) {
                        *t = *f;

                        /* Ownership of the chased string moved to *t. Reset the source slot, so that the string is
                         * referenced by exactly one entry. Otherwise a caller that frees ->chased of every slot
                         * (for example after we bailed out early above, leaving *n unmodified) would free it
                         * twice. */
                        f->chased = NULL;
                }

                t++;
        }

        *n = t - m;
        return 0;
}
int setup_namespace(
const char* root_directory,
char** read_write_paths,
@ -456,6 +477,7 @@ int setup_namespace(
unsigned long mount_flags) {
BindMount *m, *mounts = NULL;
bool make_slave = false;
unsigned n;
int r = 0;
@ -475,6 +497,9 @@ int setup_namespace(
((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
(protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
if (root_directory || n > 0)
make_slave = true;
if (n > 0) {
m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
r = append_mounts(&m, read_write_paths, READWRITE);
@ -596,6 +621,13 @@ int setup_namespace(
assert(mounts + n == m);
/* Resolve symlinks manually first, as mount() will always follow them relative to the host's
* root. Moreover we want to suppress duplicates based on the resolved paths. This of course is a bit
* racy. */
r = chase_all_symlinks(root_directory, mounts, &n);
if (r < 0)
goto finish;
qsort(mounts, n, sizeof(BindMount), mount_path_compare);
drop_duplicates(mounts, &n);
@ -603,20 +635,26 @@ int setup_namespace(
drop_nop(mounts, &n);
}
if (unshare(CLONE_NEWNS) < 0)
return -errno;
if (unshare(CLONE_NEWNS) < 0) {
r = -errno;
goto finish;
}
if (n > 0 || root_directory) {
if (make_slave) {
/* Remount / as SLAVE so that nothing now mounted in the namespace
shows up in the parent */
if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
return -errno;
if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
r = -errno;
goto finish;
}
}
if (root_directory) {
/* Turn directory into bind mount */
if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0)
return -errno;
if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
r = -errno;
goto finish;
}
}
if (n > 0) {
@ -627,7 +665,7 @@ int setup_namespace(
for (m = mounts; m < mounts + n; ++m) {
r = apply_mount(m, tmp_dir, var_tmp_dir);
if (r < 0)
goto fail;
goto finish;
}
/* Create a blacklist we can pass to bind_mount_recursive() */
@ -640,35 +678,31 @@ int setup_namespace(
for (m = mounts; m < mounts + n; ++m) {
r = make_read_only(m, blacklist);
if (r < 0)
goto fail;
goto finish;
}
}
if (root_directory) {
/* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
r = mount_move_root(root_directory);
if (r < 0) /* at this point, we cannot rollback */
return r;
if (r < 0)
goto finish;
}
/* Remount / as the desired mode. Note that this will not
* reestablish propagation from our side to the host, since
* what's disconnected is disconnected. */
if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
return -errno; /* at this point, we cannot rollback */
return 0;
fail:
if (n > 0) {
for (m = mounts; m < mounts + n; ++m) {
if (!m->done)
continue;
(void) umount2(m->path, MNT_DETACH);
}
if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
r = -errno;
goto finish;
}
r = 0;
finish:
for (m = mounts; m < mounts + n; m++)
free(m->chased);
return r;
}

View File

@ -20,16 +20,109 @@
#include <unistd.h>
#include "alloc-util.h"
#include "fileio.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "macro.h"
#include "mkdir.h"
#include "path-util.h"
#include "rm-rf.h"
#include "string-util.h"
#include "strv.h"
#include "util.h"
static void test_chase_symlinks(void) {
_cleanup_free_ char *result = NULL;
char temp[] = "/tmp/test-chase.XXXXXX";
const char *top, *p, *q;
int r;
assert_se(mkdtemp(temp));
top = strjoina(temp, "/top");
assert_se(mkdir(top, 0700) >= 0);
p = strjoina(top, "/dot");
assert_se(symlink(".", p) >= 0);
p = strjoina(top, "/dotdot");
assert_se(symlink("..", p) >= 0);
p = strjoina(top, "/dotdota");
assert_se(symlink("../a", p) >= 0);
p = strjoina(temp, "/a");
assert_se(symlink("b", p) >= 0);
p = strjoina(temp, "/b");
assert_se(symlink("/usr", p) >= 0);
p = strjoina(temp, "/start");
assert_se(symlink("top/dot/dotdota", p) >= 0);
r = chase_symlinks(p, NULL, &result);
assert_se(r >= 0);
assert_se(path_equal(result, "/usr"));
result = mfree(result);
r = chase_symlinks(p, temp, &result);
assert_se(r == -ENOENT);
q = strjoina(temp, "/usr");
assert_se(mkdir(q, 0700) >= 0);
r = chase_symlinks(p, temp, &result);
assert_se(r >= 0);
assert_se(path_equal(result, q));
p = strjoina(temp, "/slash");
assert_se(symlink("/", p) >= 0);
result = mfree(result);
r = chase_symlinks(p, NULL, &result);
assert_se(r >= 0);
assert_se(path_equal(result, "/"));
result = mfree(result);
r = chase_symlinks(p, temp, &result);
assert_se(r >= 0);
assert_se(path_equal(result, temp));
p = strjoina(temp, "/slashslash");
assert_se(symlink("///usr///", p) >= 0);
result = mfree(result);
r = chase_symlinks(p, NULL, &result);
assert_se(r >= 0);
assert_se(path_equal(result, "/usr"));
result = mfree(result);
r = chase_symlinks(p, temp, &result);
assert_se(r >= 0);
assert_se(path_equal(result, q));
result = mfree(result);
r = chase_symlinks("/etc/./.././", NULL, &result);
assert_se(r >= 0);
assert_se(path_equal(result, "/"));
result = mfree(result);
r = chase_symlinks("/etc/./.././", "/etc", &result);
assert_se(r == -EINVAL);
result = mfree(result);
r = chase_symlinks("/etc/machine-id/foo", NULL, &result);
assert_se(r == -ENOTDIR);
result = mfree(result);
p = strjoina(temp, "/recursive-symlink");
assert_se(symlink("recursive-symlink", p) >= 0);
r = chase_symlinks(p, NULL, &result);
assert_se(r == -ELOOP);
assert_se(rm_rf(temp, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0);
}
static void test_unlink_noerrno(void) {
char name[] = "/tmp/test-close_nointr.XXXXXX";
int fd;
@ -144,6 +237,7 @@ int main(int argc, char *argv[]) {
test_readlink_and_make_absolute();
test_get_files_in_directory();
test_var_tmp();
test_chase_symlinks();
return 0;
}

View File

@ -26,14 +26,18 @@
int main(int argc, char *argv[]) {
const char * const writable[] = {
"/home",
"/home/lennart/projects/foobar", /* this should be masked automatically */
"-/home/lennart/projects/foobar", /* this should be masked automatically */
NULL
};
const char * const readonly[] = {
"/",
"/usr",
/* "/", */
/* "/usr", */
"/boot",
"/lib",
"/usr/lib",
"-/lib64",
"-/usr/lib64",
NULL
};