nspawn,pid1: pass "inaccessible" nodes from cntr mgr to pid1 payload via /run/host

Let's make /run/host the sole place we pass stuff from host to container
in and place the "inaccessible" nodes in /run/host too.

In contrast to the previous two commits this is a minor compat break, but
not a relevant one I think. Previously the container manager would place
these nodes in /run/systemd/inaccessible/ and that's where PID 1 in the
container would try to add them too when missing. Container manager and
PID 1 in the container would thus manage the same dir together.

With this change the container manager now passes an immutable directory
to the container and leaves /run/systemd entirely untouched, and managed
exclusively by PID 1 inside the container, which is nice to have clear
separation on who manages what.

In order to make sure systemd then usses the /run/host/inaccesible/
nodes this commit changes PID 1 to look for that dir and if it exists
will symlink it to /run/systemd/inaccessible.

Now, this will work fine if new nspawn and new pid 1 in the container
work together. as then the symlink is created and the difference between
the two dirs won't matter.

For the case where an old nspawn invokes a new PID 1: in this case
things work as they always worked: the dir is managed together.

For the case where different container manager invokes a new PID 1: in
this case the nodes aren't typically passed in, and PID 1 in the
container will try to create them and will likely fail partially (though
gracefully) when trying to create char/block device nodes. THis is fine
though as there are fallbacks in place for that case.

For the case where a new nspawn invokes an old PID1: this is were the
(minor) incompatibily happens: in this case new nspawn will place the
nodes in the /run/host/inaccessible/ subdir, but the PID 1 in the
container won't look for them there. Since the nodes are also not
pre-created in /run/systed/inaccessible/ PID 1 will try to create them
there as if a different container manager sets them up. This is of
course not sexy, but is not a total loss, since as mentioned fallbacks
are in place anyway. Hence I think it's OK to accept this minor
incompatibility.
This commit is contained in:
Lennart Poettering 2020-08-14 18:56:54 +02:00
parent e96ceabac9
commit 9fac502920
6 changed files with 33 additions and 23 deletions

View file

@ -536,8 +536,17 @@ int mount_setup(bool loaded_policy, bool leave_propagation) {
(void) mkdir_label("/run/systemd/system", 0755);
/* Also create /run/systemd/inaccessible nodes, so that we always have something to mount
* inaccessible nodes from. */
(void) make_inaccessible_nodes(NULL, UID_INVALID, GID_INVALID);
* inaccessible nodes from. If we run in a container the host might have created these for us already
* in /run/host/inaccessible/. Use those if we can, since tht way we likely get access to block/char
* device nodes that are inaccessible, and if userns is used to nodes that are on mounts owned by a
* userns outside the container and thus nicely read-only and not remountable. */
if (access("/run/host/inaccessible/", F_OK) < 0) {
if (errno != ENOENT)
log_debug_errno(errno, "Failed to check if /run/host/inaccessible exists, ignoring: %m");
(void) make_inaccessible_nodes("/run/systemd", UID_INVALID, GID_INVALID);
} else
(void) symlink("../host/inaccessible", "/run/systemd/inaccessible");
return 0;
}

View file

@ -49,6 +49,7 @@ static int user_mkdir_runtime_path(
uint64_t runtime_dir_size,
uint64_t runtime_dir_inodes) {
const char *p;
int r;
assert(runtime_path);
@ -99,7 +100,9 @@ static int user_mkdir_runtime_path(
}
/* Set up inaccessible nodes now so they're available if we decide to use them with user namespaces. */
(void) make_inaccessible_nodes(runtime_path, uid, gid);
p = strjoina(runtime_path, "/systemd");
(void) mkdir(p, 0755);
(void) make_inaccessible_nodes(p, uid, gid);
return 0;
fail:

View file

@ -3523,7 +3523,7 @@ static int outer_child(
(void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
p = prefix_roota(directory, "/run");
p = prefix_roota(directory, "/run/host");
(void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
r = setup_pts(directory);

View file

@ -57,7 +57,7 @@ int dev_setup(const char *prefix, uid_t uid, gid_t gid) {
}
int make_inaccessible_nodes(
const char *runtime_dir,
const char *parent_dir,
uid_t uid,
gid_t gid) {
@ -65,28 +65,26 @@ int make_inaccessible_nodes(
const char *name;
mode_t mode;
} table[] = {
{ "/systemd", S_IFDIR | 0755 },
{ "/systemd/inaccessible", S_IFDIR | 0000 },
{ "/systemd/inaccessible/reg", S_IFREG | 0000 },
{ "/systemd/inaccessible/dir", S_IFDIR | 0000 },
{ "/systemd/inaccessible/fifo", S_IFIFO | 0000 },
{ "/systemd/inaccessible/sock", S_IFSOCK | 0000 },
{ "inaccessible", S_IFDIR | 0755 },
{ "inaccessible/reg", S_IFREG | 0000 },
{ "inaccessible/dir", S_IFDIR | 0000 },
{ "inaccessible/fifo", S_IFIFO | 0000 },
{ "inaccessible/sock", S_IFSOCK | 0000 },
/* The following two are likely to fail if we lack the privs for it (for example in an userns
* environment, if CAP_SYS_MKNOD is missing, or if a device node policy prohibit major/minor of 0
* device nodes to be created). But that's entirely fine. Consumers of these files should carry
* fallback to use a different node then, for example <root>/inaccessible/sock, which is close
* enough in behaviour and semantics for most uses. */
{ "/systemd/inaccessible/chr", S_IFCHR | 0000 },
{ "/systemd/inaccessible/blk", S_IFBLK | 0000 },
{ "inaccessible/chr", S_IFCHR | 0000 },
{ "inaccessible/blk", S_IFBLK | 0000 },
};
_cleanup_umask_ mode_t u;
size_t i;
int r;
if (!runtime_dir)
runtime_dir = "/run";
if (!parent_dir)
parent_dir = "/run/systemd";
u = umask(0000);
@ -95,10 +93,10 @@ int make_inaccessible_nodes(
* to lock down these nodes as much as we can, but otherwise try to match them as closely as possible with the
* underlying file, i.e. in the best case we offer the same node type as the underlying node. */
for (i = 0; i < ELEMENTSOF(table); i++) {
for (size_t i = 0; i < ELEMENTSOF(table); i++) {
_cleanup_free_ char *path = NULL;
path = path_join(runtime_dir, table[i].name);
path = path_join(parent_dir, table[i].name);
if (!path)
return log_oom();
@ -107,8 +105,7 @@ int make_inaccessible_nodes(
else
r = mknod_label(path, table[i].mode, makedev(0, 0));
if (r < 0) {
if (r != -EEXIST)
log_debug_errno(r, "Failed to create '%s', ignoring: %m", path);
log_debug_errno(r, "Failed to create '%s', ignoring: %m", path);
continue;
}

View file

@ -5,4 +5,4 @@
int dev_setup(const char *prefix, uid_t uid, gid_t gid);
int make_inaccessible_nodes(const char *root, uid_t uid, gid_t gid);
int make_inaccessible_nodes(const char *parent_dir, uid_t uid, gid_t gid);

View file

@ -3,6 +3,7 @@
#include "capability-util.h"
#include "dev-setup.h"
#include "fs-util.h"
#include "mkdir.h"
#include "path-util.h"
#include "rm-rf.h"
#include "tmpfile-util.h"
@ -17,8 +18,8 @@ int main(int argc, char *argv[]) {
assert_se(mkdtemp_malloc("/tmp/test-dev-setupXXXXXX", &p) >= 0);
f = prefix_roota(p, "/run");
assert_se(mkdir(f, 0755) >= 0);
f = prefix_roota(p, "/run/systemd");
assert_se(mkdir_p(f, 0755) >= 0);
assert_se(make_inaccessible_nodes(f, 1, 1) >= 0);