namespace: rework namespace support

- don't use pivot_root() anymore, just reuse root hierarchy
- first create all mounts, then mark them read-only so that we get the
  right behaviour when people want writable mounts inside of
  read-only mounts
- don't pass invalid combinations of MS_ constants to the kernel
This commit is contained in:
Lennart Poettering 2012-08-13 15:27:04 +02:00
parent 6f67a45d8e
commit ac0930c892
6 changed files with 117 additions and 145 deletions

View File

@ -1043,20 +1043,13 @@
<option>shared</option>,
<option>slave</option> or
<option>private</option>, which
control whether namespaces set up with
<varname>ReadWriteDirectories=</varname>,
<varname>ReadOnlyDirectories=</varname>
and
<varname>InaccessibleDirectories=</varname>
receive or propagate new mounts
from/to the main namespace. See
control whether the file system
namespace set up for this unit's
processes will receive or propagate
new mounts. See
<citerefentry><refentrytitle>mount</refentrytitle><manvolnum>1</manvolnum></citerefentry>
for details. Defaults to
<option>shared</option>, i.e. the new
namespace will both receive new mount
points from the main namespace as well
as propagate new mounts to
it.</para></listitem>
for details. Defaults to
<option>shared</option>.</para></listitem>
</varlistentry>
<varlistentry>

View File

@ -1304,7 +1304,7 @@ int exec_spawn(ExecCommand *command,
if (strv_length(context->read_write_dirs) > 0 ||
strv_length(context->read_only_dirs) > 0 ||
strv_length(context->inaccessible_dirs) > 0 ||
context->mount_flags != MS_SHARED ||
context->mount_flags != 0 ||
context->private_tmp) {
err = setup_namespace(context->read_write_dirs,
context->read_only_dirs,
@ -1540,7 +1540,6 @@ void exec_context_init(ExecContext *c) {
c->cpu_sched_policy = SCHED_OTHER;
c->syslog_priority = LOG_DAEMON|LOG_INFO;
c->syslog_level_prefix = true;
c->mount_flags = MS_SHARED;
c->control_group_persistent = -1;
c->ignore_sigpipe = true;
c->timer_slack_nsec = (nsec_t) -1;

View File

@ -41,13 +41,15 @@ typedef enum PathMode {
/* This is ordered by priority! */
INACCESSIBLE,
READONLY,
PRIVATE,
PRIVATE_TMP,
PRIVATE_VAR_TMP,
READWRITE
} PathMode;
/* One entry in the list of mount points that setup_namespace() processes. */
typedef struct Path {
/* Location of the mount point; passed as the target of the mount() calls. */
const char *path;
/* How this path is to be treated (see PathMode). */
PathMode mode;
/* Consulted on the undo path of setup_namespace(): only entries with this
 * flag set are detached via umount2().
 * NOTE(review): no assignment to this field is visible in this view --
 * confirm it is initialized and set once the mount has succeeded. */
bool done;
} Path;
static int append_paths(Path **p, char **strv, PathMode mode) {
@ -91,25 +93,22 @@ static int path_compare(const void *a, const void *b) {
return 0;
}
static void drop_duplicates(Path *p, unsigned *n, bool *need_inaccessible, bool *need_private) {
static void drop_duplicates(Path *p, unsigned *n, bool *need_inaccessible) {
Path *f, *t, *previous;
assert(p);
assert(n);
assert(need_inaccessible);
assert(need_private);
for (f = p, t = p, previous = NULL; f < p+*n; f++) {
/* The first one wins */
if (previous && path_equal(f->path, previous->path))
continue;
t->path = f->path;
t->mode = f->mode;
if (t->mode == PRIVATE)
*need_private = true;
if (t->mode == INACCESSIBLE)
*need_inaccessible = true;
@ -121,67 +120,64 @@ static void drop_duplicates(Path *p, unsigned *n, bool *need_inaccessible, bool
*n = t - p;
}
static int apply_mount(Path *p, const char *root_dir, const char *inaccessible_dir, const char *private_dir, unsigned long flags) {
static int apply_mount(
Path *p,
const char *tmp_dir,
const char *var_tmp_dir,
const char *inaccessible_dir) {
const char *what;
char *where;
int r;
assert(p);
assert(root_dir);
assert(inaccessible_dir);
assert(private_dir);
where = strappend(root_dir, p->path);
if (!where)
return -ENOMEM;
switch (p->mode) {
case INACCESSIBLE:
what = inaccessible_dir;
flags |= MS_RDONLY;
break;
case READONLY:
flags |= MS_RDONLY;
/* Fall through */
case READWRITE:
what = p->path;
break;
case PRIVATE:
what = private_dir;
case PRIVATE_TMP:
what = tmp_dir;
break;
case PRIVATE_VAR_TMP:
what = var_tmp_dir;
break;
default:
assert_not_reached("Unknown mode");
}
r = mount(what, where, NULL, MS_BIND|MS_REC, NULL);
if (r >= 0) {
log_debug("Successfully mounted %s to %s", what, where);
assert(what);
/* The bind mount will always inherit the original
* flags. If we want to set any flag we need
* to do so in a second independent step. */
if (flags)
r = mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|MS_REC|flags, NULL);
r = mount(what, p->path, NULL, MS_BIND, NULL);
if (r >= 0)
log_debug("Successfully mounted %s to %s", what, p->path);
/* Avoid exponential growth of trees */
if (r >= 0 && path_equal(p->path, "/"))
r = mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL);
if (r < 0) {
r = -errno;
umount2(where, MNT_DETACH);
}
}
free(where);
return r;
}
/* Second pass over a mount point: if the entry is INACCESSIBLE or READONLY,
 * remount it read-only with a bind remount. Entries in any other mode are
 * left untouched.
 *
 * Returns 0 on success (or when nothing had to be done) and -errno when the
 * remount fails. */
static int make_read_only(Path *p) {
        bool wants_read_only;

        assert(p);

        wants_read_only = p->mode == INACCESSIBLE || p->mode == READONLY;
        if (!wants_read_only)
                return 0;

        /* MS_RDONLY is applied here as a separate bind remount of the path. */
        if (mount(NULL, p->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
                return -errno;

        return 0;
}
int setup_namespace(
char **writable,
char **readable,
@ -190,30 +186,26 @@ int setup_namespace(
unsigned long flags) {
char
tmp_dir[] = "/tmp/systemd-namespace-XXXXXX",
root_dir[] = "/tmp/systemd-namespace-XXXXXX/root",
old_root_dir[] = "/tmp/systemd-namespace-XXXXXX/root/tmp/old-root-XXXXXX",
inaccessible_dir[] = "/tmp/systemd-namespace-XXXXXX/inaccessible",
private_dir[] = "/tmp/systemd-namespace-XXXXXX/private";
tmp_dir[] = "/tmp/systemd-private-XXXXXX",
var_tmp_dir[] = "/var/tmp/systemd-private-XXXXXX",
inaccessible_dir[] = "/tmp/systemd-inaccessible-XXXXXX";
Path *paths, *p;
unsigned n;
bool need_private = false, need_inaccessible = false;
bool remove_tmp = false, remove_root = false, remove_old_root = false, remove_inaccessible = false, remove_private = false;
bool need_inaccessible = false;
bool remove_tmp = false, remove_var_tmp = false, remove_inaccessible = false;
int r;
const char *t;
if (!flags)
flags = MS_SHARED;
n =
strv_length(writable) +
strv_length(readable) +
strv_length(inaccessible) +
(private_tmp ? 3 : 1);
(private_tmp ? 2 : 0);
paths = new(Path, n);
if (!paths)
return -ENOMEM;
p = paths;
p = paths = alloca(sizeof(Path) * n);
if ((r = append_paths(&p, writable, READWRITE)) < 0 ||
(r = append_paths(&p, readable, READONLY)) < 0 ||
(r = append_paths(&p, inaccessible, INACCESSIBLE)) < 0)
@ -221,60 +213,70 @@ int setup_namespace(
if (private_tmp) {
p->path = "/tmp";
p->mode = PRIVATE;
p->mode = PRIVATE_TMP;
p++;
p->path = "/var/tmp";
p->mode = PRIVATE;
p->mode = PRIVATE_VAR_TMP;
p++;
}
p->path = "/";
p->mode = READWRITE;
p++;
assert(paths + n == p);
qsort(paths, n, sizeof(Path), path_compare);
drop_duplicates(paths, &n, &need_inaccessible, &need_private);
if (!mkdtemp(tmp_dir)) {
r = -errno;
goto fail;
}
remove_tmp = true;
memcpy(root_dir, tmp_dir, sizeof(tmp_dir)-1);
if (mkdir(root_dir, 0777) < 0) {
r = -errno;
goto fail;
}
remove_root = true;
drop_duplicates(paths, &n, &need_inaccessible);
if (need_inaccessible) {
memcpy(inaccessible_dir, tmp_dir, sizeof(tmp_dir)-1);
if (mkdir(inaccessible_dir, 0) < 0) {
mode_t u;
char *d;
u = umask(0777);
d = mkdtemp(inaccessible_dir);
umask(u);
if (!d) {
r = -errno;
goto fail;
}
remove_inaccessible = true;
}
if (need_private) {
if (private_tmp) {
mode_t u;
memcpy(private_dir, tmp_dir, sizeof(tmp_dir)-1);
char *d;
u = umask(0000);
if (mkdir(private_dir, 0777 + S_ISVTX) < 0) {
umask(u);
d = mkdtemp(tmp_dir);
umask(u);
if (!d) {
r = -errno;
goto fail;
}
remove_tmp = true;
u = umask(0000);
d = mkdtemp(var_tmp_dir);
umask(u);
remove_private = true;
if (!d) {
r = -errno;
goto fail;
}
remove_var_tmp = true;
if (chmod(tmp_dir, 0777 + S_ISVTX) < 0) {
r = -errno;
goto fail;
}
if (chmod(var_tmp_dir, 0777 + S_ISVTX) < 0) {
r = -errno;
goto fail;
}
}
if (unshare(CLONE_NEWNS) < 0) {
@ -282,7 +284,7 @@ int setup_namespace(
goto fail;
}
/* Remount / as SLAVE so that nothing mounted in the namespace
/* Remount / as SLAVE so that nothing now mounted in the namespace
shows up in the parent */
if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
r = -errno;
@ -290,69 +292,39 @@ int setup_namespace(
}
for (p = paths; p < paths + n; p++) {
r = apply_mount(p, root_dir, inaccessible_dir, private_dir, flags);
r = apply_mount(p, tmp_dir, var_tmp_dir, inaccessible_dir);
if (r < 0)
goto undo_mounts;
}
memcpy(old_root_dir, tmp_dir, sizeof(tmp_dir)-1);
if (!mkdtemp(old_root_dir)) {
for (p = paths; p < paths + n; p++) {
r = make_read_only(p);
if (r < 0)
goto undo_mounts;
}
/* Remount / as the desired mode */
if (mount(NULL, "/", NULL, flags|MS_REC, NULL) < 0) {
r = -errno;
goto undo_mounts;
}
remove_old_root = true;
if (chdir(root_dir) < 0) {
r = -errno;
goto undo_mounts;
}
if (pivot_root(root_dir, old_root_dir) < 0) {
r = -errno;
goto undo_mounts;
}
free(paths);
t = old_root_dir + sizeof(root_dir) - 1;
if (umount2(t, MNT_DETACH) < 0)
/* At this point it's too late to turn anything back,
* since we are already in the new root. */
return -errno;
if (rmdir(t) < 0)
return -errno;
return 0;
undo_mounts:
for (p--; p >= paths; p--) {
char full_path[PATH_MAX];
snprintf(full_path, sizeof(full_path), "%s%s", root_dir, p->path);
char_array_0(full_path);
umount2(full_path, MNT_DETACH);
}
for (p = paths; p < paths + n; p++)
if (p->done)
umount2(p->path, MNT_DETACH);
fail:
if (remove_old_root)
rmdir(old_root_dir);
if (remove_inaccessible)
rmdir(inaccessible_dir);
if (remove_private)
rmdir(private_dir);
if (remove_root)
rmdir(root_dir);
if (remove_tmp)
rmdir(tmp_dir);
free(paths);
if (remove_var_tmp)
rmdir(var_tmp_dir);
return r;
}

View File

@ -115,6 +115,12 @@ int switch_root(const char *new_root) {
goto fail;
}
if (chdir("/") < 0) {
r = -errno;
log_error("Failed to change directory: %m");
goto fail;
}
if (old_root_fd >= 0) {
struct stat rb;

View File

@ -3011,7 +3011,8 @@ unsigned long long random_ull(void) {
uint64_t ull;
ssize_t r;
if ((fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC|O_NOCTTY)) < 0)
fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC|O_NOCTTY);
if (fd < 0)
goto fallback;
r = loop_read(fd, &ull, sizeof(ull), true);

View File

@ -34,7 +34,7 @@ int main(int argc, char *argv[]) {
NULL
};
const char * const readable[] = {
const char * const readonly[] = {
"/",
"/usr",
"/boot",
@ -48,7 +48,8 @@ int main(int argc, char *argv[]) {
int r;
if ((r = setup_namespace((char**) writable, (char**) readable, (char**) inaccessible, true, MS_SHARED)) < 0) {
r = setup_namespace((char**) writable, (char**) readonly, (char**) inaccessible, true, 0);
if (r < 0) {
log_error("Failed to setup namespace: %s", strerror(-r));
return 1;
}