nspawn,pid1: pass "inaccessible" nodes from cntr mgr to pid1 payload via /run/host

Let's make /run/host the sole place we pass stuff from host to container in and place the "inaccessible" nodes in /run/host too. In contrast to the previous two commits this is a minor compat break, but not a relevant one I think. Previously the container manager would place these nodes in /run/systemd/inaccessible/ and that's where PID 1 in the container would try to add them too when missing. Container manager and PID 1 in the container would thus manage the same dir together. With this change the container manager now passes an immutable directory to the container and leaves /run/systemd entirely untouched, and managed exclusively by PID 1 inside the container, which is nice, as it gives a clear separation of who manages what. In order to make sure systemd then uses the /run/host/inaccessible/ nodes this commit changes PID 1 to look for that dir and, if it exists, symlink it to /run/systemd/inaccessible. Now, this will work fine if new nspawn and new PID 1 in the container work together, as then the symlink is created and the difference between the two dirs won't matter. For the case where an old nspawn invokes a new PID 1: in this case things work as they always worked: the dir is managed together. For the case where a different container manager invokes a new PID 1: in this case the nodes aren't typically passed in, and PID 1 in the container will try to create them and will likely fail partially (though gracefully) when trying to create char/block device nodes. This is fine though as there are fallbacks in place for that case. For the case where a new nspawn invokes an old PID 1: this is where the (minor) incompatibility happens: in this case new nspawn will place the nodes in the /run/host/inaccessible/ subdir, but the PID 1 in the container won't look for them there. Since the nodes are also not pre-created in /run/systemd/inaccessible/ PID 1 will try to create them there as if a different container manager sets them up.
This is of course not sexy, but is not a total loss, since as mentioned fallbacks are in place anyway. Hence I think it's OK to accept this minor incompatibility.
2020-08-14 18:56:54 +02:00 · 2020-08-14 18:56:54 +02:00 · 9fac502920
parent e96ceabac9
commit 9fac502920
6 changed files with 33 additions and 23 deletions
--- a/src/core/mount-setup.c
+++ b/src/core/mount-setup.c
@ -536,8 +536,17 @@ int mount_setup(bool loaded_policy, bool leave_propagation) {
        (void) mkdir_label("/run/systemd/system", 0755);

        /* Also create /run/systemd/inaccessible nodes, so that we always have something to mount
-         * inaccessible nodes from. */
-        (void) make_inaccessible_nodes(NULL, UID_INVALID, GID_INVALID);
+         * inaccessible nodes from. If we run in a container the host might have created these for us already
+         * in /run/host/inaccessible/. Use those if we can, since that way we likely get access to block/char
+         * device nodes that are inaccessible, and if userns is used to nodes that are on mounts owned by a
+         * userns outside the container and thus nicely read-only and not remountable. */
+        if (access("/run/host/inaccessible/", F_OK) < 0) {
+                if (errno != ENOENT)
+                        log_debug_errno(errno, "Failed to check if /run/host/inaccessible exists, ignoring: %m");
+
+                (void) make_inaccessible_nodes("/run/systemd", UID_INVALID, GID_INVALID);
+        } else
+                (void) symlink("../host/inaccessible", "/run/systemd/inaccessible");

        return 0;
 }
--- a/src/login/user-runtime-dir.c
+++ b/src/login/user-runtime-dir.c
@ -49,6 +49,7 @@ static int user_mkdir_runtime_path(
                uint64_t runtime_dir_size,
                uint64_t runtime_dir_inodes) {

+        const char *p;
        int r;

        assert(runtime_path);
@ -99,7 +100,9 @@ static int user_mkdir_runtime_path(
        }

        /* Set up inaccessible nodes now so they're available if we decide to use them with user namespaces. */
-        (void) make_inaccessible_nodes(runtime_path, uid, gid);
+        p = strjoina(runtime_path, "/systemd");
+        (void) mkdir(p, 0755);
+        (void) make_inaccessible_nodes(p, uid, gid);
        return 0;

 fail:
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@ -3523,7 +3523,7 @@ static int outer_child(

        (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);

-        p = prefix_roota(directory, "/run");
+        p = prefix_roota(directory, "/run/host");
        (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);

        r = setup_pts(directory);
--- a/src/shared/dev-setup.c
+++ b/src/shared/dev-setup.c
@ -57,7 +57,7 @@ int dev_setup(const char *prefix, uid_t uid, gid_t gid) {
 }

 int make_inaccessible_nodes(
-                const char *runtime_dir,
+                const char *parent_dir,
                uid_t uid,
                gid_t gid) {

@ -65,28 +65,26 @@ int make_inaccessible_nodes(
                const char *name;
                mode_t mode;
        } table[] = {
-                { "/systemd",                   S_IFDIR  | 0755 },
-                { "/systemd/inaccessible",      S_IFDIR  | 0000 },
-                { "/systemd/inaccessible/reg",  S_IFREG  | 0000 },
-                { "/systemd/inaccessible/dir",  S_IFDIR  | 0000 },
-                { "/systemd/inaccessible/fifo", S_IFIFO  | 0000 },
-                { "/systemd/inaccessible/sock", S_IFSOCK | 0000 },
+                { "inaccessible",      S_IFDIR  | 0755 },
+                { "inaccessible/reg",  S_IFREG  | 0000 },
+                { "inaccessible/dir",  S_IFDIR  | 0000 },
+                { "inaccessible/fifo", S_IFIFO  | 0000 },
+                { "inaccessible/sock", S_IFSOCK | 0000 },

                /* The following two are likely to fail if we lack the privs for it (for example in an userns
                 * environment, if CAP_SYS_MKNOD is missing, or if a device node policy prohibit major/minor of 0
                 * device nodes to be created). But that's entirely fine. Consumers of these files should carry
                 * fallback to use a different node then, for example <root>/inaccessible/sock, which is close
                 * enough in behaviour and semantics for most uses. */
-                { "/systemd/inaccessible/chr",  S_IFCHR  | 0000 },
-                { "/systemd/inaccessible/blk",  S_IFBLK  | 0000 },
+                { "inaccessible/chr",  S_IFCHR  | 0000 },
+                { "inaccessible/blk",  S_IFBLK  | 0000 },
        };

        _cleanup_umask_ mode_t u;
-        size_t i;
        int r;

-        if (!runtime_dir)
-                runtime_dir = "/run";
+        if (!parent_dir)
+                parent_dir = "/run/systemd";

        u = umask(0000);

@ -95,10 +93,10 @@ int make_inaccessible_nodes(
         * to lock down these nodes as much as we can, but otherwise try to match them as closely as possible with the
         * underlying file, i.e. in the best case we offer the same node type as the underlying node. */

-        for (i = 0; i < ELEMENTSOF(table); i++) {
+        for (size_t i = 0; i < ELEMENTSOF(table); i++) {
                _cleanup_free_ char *path = NULL;

-                path = path_join(runtime_dir, table[i].name);
+                path = path_join(parent_dir, table[i].name);
                if (!path)
                        return log_oom();

@ -107,8 +105,7 @@ int make_inaccessible_nodes(
                else
                        r = mknod_label(path, table[i].mode, makedev(0, 0));
                if (r < 0) {
-                        if (r != -EEXIST)
-                                log_debug_errno(r, "Failed to create '%s', ignoring: %m", path);
+                        log_debug_errno(r, "Failed to create '%s', ignoring: %m", path);
                        continue;
                }

--- a/src/shared/dev-setup.h
+++ b/src/shared/dev-setup.h
@ -5,4 +5,4 @@

 int dev_setup(const char *prefix, uid_t uid, gid_t gid);

-int make_inaccessible_nodes(const char *root, uid_t uid, gid_t gid);
+int make_inaccessible_nodes(const char *parent_dir, uid_t uid, gid_t gid);
--- a/src/test/test-dev-setup.c
+++ b/src/test/test-dev-setup.c
@ -3,6 +3,7 @@
 #include "capability-util.h"
 #include "dev-setup.h"
 #include "fs-util.h"
+#include "mkdir.h"
 #include "path-util.h"
 #include "rm-rf.h"
 #include "tmpfile-util.h"
@ -17,8 +18,8 @@ int main(int argc, char *argv[]) {

        assert_se(mkdtemp_malloc("/tmp/test-dev-setupXXXXXX", &p) >= 0);

-        f = prefix_roota(p, "/run");
-        assert_se(mkdir(f, 0755) >= 0);
+        f = prefix_roota(p, "/run/systemd");
+        assert_se(mkdir_p(f, 0755) >= 0);

        assert_se(make_inaccessible_nodes(f, 1, 1) >= 0);