From 5a27b395187e64c17d90e880a678aa1c354c74c0 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 22 Jul 2020 17:57:29 +0200 Subject: [PATCH 1/6] nspawn/machine: move mount propagation dir to /run/host/incoming Previously we'd use a directory /run/systemd/nspawn/incoming for accepting mounts to propagate from the host. This is a bit weird, since we have a shared namespace: /run/systemd/ contains both stuff managed by the surround nspawn as well as from the systemd inside. We now have the /run/host/ hierarchy that has special stuff we want to pass from host to container. Let's make use of that here, and move this directory here too. This is not a compat breakage, since the payload never interfaces with that directory natively: it's only nspawn and machined that need to agree on it. --- src/machine/machine-dbus.c | 8 +++----- src/nspawn/nspawn.c | 17 ++++++----------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/src/machine/machine-dbus.c b/src/machine/machine-dbus.c index a3c97d8d8f..f2285abc0e 100644 --- a/src/machine/machine-dbus.c +++ b/src/machine/machine-dbus.c @@ -978,9 +978,8 @@ int bus_machine_method_bind_mount(sd_bus_message *message, void *userdata, sd_bu goto finish; } if (r == 0) { - const char *mount_inside; + const char *mount_inside, *q; int mntfd; - const char *q; errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]); @@ -1001,12 +1000,11 @@ int bus_machine_method_bind_mount(sd_bus_message *message, void *userdata, sd_bu (void) mkdir_p(dest, 0755); else { (void) mkdir_parents(dest, 0755); - safe_close(open(dest, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOCTTY, 0600)); + (void) mknod(dest, S_IFREG|0600, 0); } } - /* Fifth, move the mount to the right place inside */ - mount_inside = strjoina("/run/systemd/nspawn/incoming/", basename(mount_outside)); + mount_inside = strjoina("/run/host/incoming/", basename(mount_outside)); if (mount(mount_inside, dest, NULL, MS_MOVE, NULL) < 0) { r = log_error_errno(errno, "Failed to mount: %m"); goto 
child_fail; diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 1b83f5ad58..98ba33935e 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2517,19 +2517,15 @@ static int setup_propagate(const char *root) { p = strjoina("/run/systemd/nspawn/propagate/", arg_machine); (void) mkdir_p(p, 0600); - r = userns_mkdir(root, "/run/systemd", 0755, 0, 0); + r = userns_mkdir(root, "/run/host", 0755, 0, 0); if (r < 0) - return log_error_errno(r, "Failed to create /run/systemd: %m"); + return log_error_errno(r, "Failed to create /run/host: %m"); - r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0); + r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0); if (r < 0) - return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m"); + return log_error_errno(r, "Failed to create /run/host/incoming: %m"); - r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0); - if (r < 0) - return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m"); - - q = prefix_roota(root, "/run/systemd/nspawn/incoming"); + q = prefix_roota(root, "/run/host/incoming"); r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL); if (r < 0) return r; @@ -2538,8 +2534,7 @@ static int setup_propagate(const char *root) { if (r < 0) return r; - /* machined will MS_MOVE into that directory, and that's only - * supported for non-shared mounts. */ + /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. 
*/ return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL); } From e96ceabac958bf30dcbbf51c6cdf44477ca96c16 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 22 Jul 2020 18:00:18 +0200 Subject: [PATCH 2/6] nspawn: move $NOTIFY_SOCKET into /run/host/ too The sd_notify() socket that nspawn binds that the payload can use to talk to it was previously stored in /run/systemd/nspawn/notify, which is weird (as in the previous commit) since this makes /run/systemd something that is cooperatively maintained by systemd inside the container and nspawn outside of it. We now have a better place where container managers can put the stuff they want to pass to the payload: /run/host/, hence let's make use of that. This is not a compat breakage, since the sd_notify() protocol is based on the $NOTIFY_SOCKET env var, where we place the new socket path. --- src/nspawn/nspawn.c | 14 ++++++-------- test/units/testsuite-13.sh | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 98ba33935e..c28868d415 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -101,10 +101,8 @@ #include "user-util.h" #include "util.h" -/* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path - * nspawn_notify_socket_path is relative to the container - * the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */ -#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify" +/* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */ +#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify" #define EXIT_FORCE_RESTART 133 @@ -3273,7 +3271,7 @@ static int inner_child( return log_error_errno(errno, "execv(%s) failed: %m", exec_target); } -static int setup_sd_notify_child(void) { +static int setup_notify_child(void) { _cleanup_close_ int fd = -1; union sockaddr_union sa = { .un.sun_family = AF_UNIX, @@ -3583,7 
+3581,7 @@ static int outer_child( if (r < 0) return log_error_errno(r, "Failed to move root directory: %m"); - fd = setup_sd_notify_child(); + fd = setup_notify_child(); if (fd < 0) return fd; @@ -3796,7 +3794,7 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r return 0; } -static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) { +static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) { int r; r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid); @@ -4627,7 +4625,7 @@ static int run_container( return log_error_errno(r, "Failed to attach bus to event loop: %m"); } - r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source); + r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source); if (r < 0) return r; diff --git a/test/units/testsuite-13.sh b/test/units/testsuite-13.sh index d2dba0751b..969ca4a8d9 100755 --- a/test/units/testsuite-13.sh +++ b/test/units/testsuite-13.sh @@ -60,7 +60,7 @@ function check_norbind { function check_notification_socket { # https://github.com/systemd/systemd/issues/4944 - local _cmd='echo a | $(busybox which nc) -U -u -w 1 /run/systemd/nspawn/notify' + local _cmd='echo a | $(busybox which nc) -U -u -w 1 /run/host/notify' # /testsuite-13.nc-container is prepared by test.sh systemd-nspawn $SUSE_OPTS--register=no -D /testsuite-13.nc-container /bin/sh -x -c "$_cmd" systemd-nspawn $SUSE_OPTS--register=no -D /testsuite-13.nc-container -U /bin/sh -x -c "$_cmd" From 9fac502920a648d82e21b207989bfc3c00fbdebc Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 14 Aug 2020 18:56:54 +0200 Subject: [PATCH 3/6] nspawn,pid1: pass "inaccessible" nodes from cntr mgr to pid1 payload via /run/host Let's make /run/host the sole place we pass stuff from host to container in and place 
the "inaccessible" nodes in /run/host too. In contrast to the previous two commits this is a minor compat break, but not a relevant one I think. Previously the container manager would place these nodes in /run/systemd/inaccessible/ and that's where PID 1 in the container would try to add them too when missing. Container manager and PID 1 in the container would thus manage the same dir together. With this change the container manager now passes an immutable directory to the container and leaves /run/systemd entirely untouched, and managed exclusively by PID 1 inside the container, which is nice to have clear separation on who manages what. In order to make sure systemd then uses the /run/host/inaccessible/ nodes this commit changes PID 1 to look for that dir and if it exists will symlink it to /run/systemd/inaccessible. Now, this will work fine if new nspawn and new pid 1 in the container work together, as then the symlink is created and the difference between the two dirs won't matter. For the case where an old nspawn invokes a new PID 1: in this case things work as they always worked: the dir is managed together. For the case where a different container manager invokes a new PID 1: in this case the nodes aren't typically passed in, and PID 1 in the container will try to create them and will likely fail partially (though gracefully) when trying to create char/block device nodes. This is fine though as there are fallbacks in place for that case. For the case where a new nspawn invokes an old PID1: this is where the (minor) incompatibility happens: in this case new nspawn will place the nodes in the /run/host/inaccessible/ subdir, but the PID 1 in the container won't look for them there. Since the nodes are also not pre-created in /run/systemd/inaccessible/ PID 1 will try to create them there as if a different container manager sets them up. This is of course not sexy, but is not a total loss, since as mentioned fallbacks are in place anyway. 
Hence I think it's OK to accept this minor incompatibility. --- src/core/mount-setup.c | 13 +++++++++++-- src/login/user-runtime-dir.c | 5 ++++- src/nspawn/nspawn.c | 2 +- src/shared/dev-setup.c | 29 +++++++++++++---------------- src/shared/dev-setup.h | 2 +- src/test/test-dev-setup.c | 5 +++-- 6 files changed, 33 insertions(+), 23 deletions(-) diff --git a/src/core/mount-setup.c b/src/core/mount-setup.c index 7df1562c8a..048bd37a6c 100644 --- a/src/core/mount-setup.c +++ b/src/core/mount-setup.c @@ -536,8 +536,17 @@ int mount_setup(bool loaded_policy, bool leave_propagation) { (void) mkdir_label("/run/systemd/system", 0755); /* Also create /run/systemd/inaccessible nodes, so that we always have something to mount - * inaccessible nodes from. */ - (void) make_inaccessible_nodes(NULL, UID_INVALID, GID_INVALID); + * inaccessible nodes from. If we run in a container the host might have created these for us already + * in /run/host/inaccessible/. Use those if we can, since tht way we likely get access to block/char + * device nodes that are inaccessible, and if userns is used to nodes that are on mounts owned by a + * userns outside the container and thus nicely read-only and not remountable. 
*/ + if (access("/run/host/inaccessible/", F_OK) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to check if /run/host/inaccessible exists, ignoring: %m"); + + (void) make_inaccessible_nodes("/run/systemd", UID_INVALID, GID_INVALID); + } else + (void) symlink("../host/inaccessible", "/run/systemd/inaccessible"); return 0; } diff --git a/src/login/user-runtime-dir.c b/src/login/user-runtime-dir.c index 38058d7b2a..a56c527df8 100644 --- a/src/login/user-runtime-dir.c +++ b/src/login/user-runtime-dir.c @@ -49,6 +49,7 @@ static int user_mkdir_runtime_path( uint64_t runtime_dir_size, uint64_t runtime_dir_inodes) { + const char *p; int r; assert(runtime_path); @@ -99,7 +100,9 @@ static int user_mkdir_runtime_path( } /* Set up inaccessible nodes now so they're available if we decide to use them with user namespaces. */ - (void) make_inaccessible_nodes(runtime_path, uid, gid); + p = strjoina(runtime_path, "/systemd"); + (void) mkdir(p, 0755); + (void) make_inaccessible_nodes(p, uid, gid); return 0; fail: diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index c28868d415..c263b0ff70 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -3523,7 +3523,7 @@ static int outer_child( (void) dev_setup(directory, arg_uid_shift, arg_uid_shift); - p = prefix_roota(directory, "/run"); + p = prefix_roota(directory, "/run/host"); (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift); r = setup_pts(directory); diff --git a/src/shared/dev-setup.c b/src/shared/dev-setup.c index 6e57e2a99d..528440b82f 100644 --- a/src/shared/dev-setup.c +++ b/src/shared/dev-setup.c @@ -57,7 +57,7 @@ int dev_setup(const char *prefix, uid_t uid, gid_t gid) { } int make_inaccessible_nodes( - const char *runtime_dir, + const char *parent_dir, uid_t uid, gid_t gid) { @@ -65,28 +65,26 @@ int make_inaccessible_nodes( const char *name; mode_t mode; } table[] = { - { "/systemd", S_IFDIR | 0755 }, - { "/systemd/inaccessible", S_IFDIR | 0000 }, - { "/systemd/inaccessible/reg", 
S_IFREG | 0000 }, - { "/systemd/inaccessible/dir", S_IFDIR | 0000 }, - { "/systemd/inaccessible/fifo", S_IFIFO | 0000 }, - { "/systemd/inaccessible/sock", S_IFSOCK | 0000 }, + { "inaccessible", S_IFDIR | 0755 }, + { "inaccessible/reg", S_IFREG | 0000 }, + { "inaccessible/dir", S_IFDIR | 0000 }, + { "inaccessible/fifo", S_IFIFO | 0000 }, + { "inaccessible/sock", S_IFSOCK | 0000 }, /* The following two are likely to fail if we lack the privs for it (for example in an userns * environment, if CAP_SYS_MKNOD is missing, or if a device node policy prohibit major/minor of 0 * device nodes to be created). But that's entirely fine. Consumers of these files should carry * fallback to use a different node then, for example /inaccessible/sock, which is close * enough in behaviour and semantics for most uses. */ - { "/systemd/inaccessible/chr", S_IFCHR | 0000 }, - { "/systemd/inaccessible/blk", S_IFBLK | 0000 }, + { "inaccessible/chr", S_IFCHR | 0000 }, + { "inaccessible/blk", S_IFBLK | 0000 }, }; _cleanup_umask_ mode_t u; - size_t i; int r; - if (!runtime_dir) - runtime_dir = "/run"; + if (!parent_dir) + parent_dir = "/run/systemd"; u = umask(0000); @@ -95,10 +93,10 @@ int make_inaccessible_nodes( * to lock down these nodes as much as we can, but otherwise try to match them as closely as possible with the * underlying file, i.e. in the best case we offer the same node type as the underlying node. 
*/ - for (i = 0; i < ELEMENTSOF(table); i++) { + for (size_t i = 0; i < ELEMENTSOF(table); i++) { _cleanup_free_ char *path = NULL; - path = path_join(runtime_dir, table[i].name); + path = path_join(parent_dir, table[i].name); if (!path) return log_oom(); @@ -107,8 +105,7 @@ int make_inaccessible_nodes( else r = mknod_label(path, table[i].mode, makedev(0, 0)); if (r < 0) { - if (r != -EEXIST) - log_debug_errno(r, "Failed to create '%s', ignoring: %m", path); + log_debug_errno(r, "Failed to create '%s', ignoring: %m", path); continue; } diff --git a/src/shared/dev-setup.h b/src/shared/dev-setup.h index 72b90ec4de..437c0e96e6 100644 --- a/src/shared/dev-setup.h +++ b/src/shared/dev-setup.h @@ -5,4 +5,4 @@ int dev_setup(const char *prefix, uid_t uid, gid_t gid); -int make_inaccessible_nodes(const char *root, uid_t uid, gid_t gid); +int make_inaccessible_nodes(const char *parent_dir, uid_t uid, gid_t gid); diff --git a/src/test/test-dev-setup.c b/src/test/test-dev-setup.c index 038484e475..11196cd4d6 100644 --- a/src/test/test-dev-setup.c +++ b/src/test/test-dev-setup.c @@ -3,6 +3,7 @@ #include "capability-util.h" #include "dev-setup.h" #include "fs-util.h" +#include "mkdir.h" #include "path-util.h" #include "rm-rf.h" #include "tmpfile-util.h" @@ -17,8 +18,8 @@ int main(int argc, char *argv[]) { assert_se(mkdtemp_malloc("/tmp/test-dev-setupXXXXXX", &p) >= 0); - f = prefix_roota(p, "/run"); - assert_se(mkdir(f, 0755) >= 0); + f = prefix_roota(p, "/run/systemd"); + assert_se(mkdir_p(f, 0755) >= 0); assert_se(make_inaccessible_nodes(f, 1, 1) >= 0); From 0f48ba7b8489770629f1b951a859e719666616ce Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 14 Aug 2020 19:58:37 +0200 Subject: [PATCH 4/6] nspawn: provide $container and $container_uuid in /run/host too This has the major benefit that the entire payload of the container can access these files there. 
Previously, we'd set them only as env vars, but that meant only PID 1 could read them directly or other privileged payload code with access to /proc/1/environ. --- src/basic/virt.c | 10 ++++++++++ src/nspawn/nspawn.c | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/src/basic/virt.c b/src/basic/virt.c index 4c2280cfd6..212b3b7039 100644 --- a/src/basic/virt.c +++ b/src/basic/virt.c @@ -491,6 +491,16 @@ int detect_container(void) { } } + /* The container manager might have placed this in the /run/host hierarchy for us, which is best + * because we can be consumed just like that, without special privileges. */ + r = read_one_line_file("/run/host/container-manager", &m); + if (r > 0) { + e = m; + goto translate_name; + } + if (!IN_SET(r, -ENOENT, 0)) + return log_debug_errno(r, "Failed to read /run/systemd/container: %m"); + if (getpid_cached() == 1) { /* If we are PID 1 we can just check our own environment variable, and that's authoritative. * We distinguish three cases: diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index c263b0ff70..3ad8829855 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -3564,6 +3564,14 @@ static int outer_child( if (r < 0) return r; + /* The same stuff as the $container env var, but nicely readable for the entire payload */ + p = prefix_roota(directory, "/run/host/container-manager"); + (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE); + + /* The same stuff as the $container_uuid env var */ + p = prefix_roota(directory, "/run/host/container-uuid"); + (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)); + if (!arg_use_cgns) { r = mount_cgroups( directory, From 00e64c6d064c1024d57d0e5b6cfd83944e9e626b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 14 Aug 2020 19:49:29 +0200 Subject: [PATCH 5/6] doc: document what we now place in /run/host --- docs/CONTAINER_INTERFACE.md | 63 
+++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/docs/CONTAINER_INTERFACE.md b/docs/CONTAINER_INTERFACE.md index a36d2edc72..c7c57c7c06 100644 --- a/docs/CONTAINER_INTERFACE.md +++ b/docs/CONTAINER_INTERFACE.md @@ -172,6 +172,13 @@ manager, please consider supporting the following interfaces. unit they created for their container. That's private property of systemd, and no other code should modify it. +6. systemd running inside the container can report when boot-up is complete + using the usual `sd_notify()` protocol that is also used when a service + wants to tell the service manager about readiness. A container manager can + set the `$NOTIFY_SOCKET` environment variable to a suitable socket path to + make use of this functionality. (Also see information about + `/run/host/notify` below.) + ## Networking 1. Inside of a container, if a `veth` link is named `host0`, `systemd-networkd` @@ -189,6 +196,62 @@ manager, please consider supporting the following interfaces. devices, for example hashed out of the container names. That way it is more likely that DHCP and IPv4LL will acquire stable addresses. +## The `/run/host/` Hierarchy + +Container managers may place certain resources the manager wants to provide to +the container payload below the `/run/host/` hierarchy. This hierarchy should +be mostly immutable (possibly some subdirs might be writable, but the top-level +hierarchy — and probably most subdirs should be read-only to the +container). Note that this hierarchy is used by various container managers, and +care should be taken to avoid naming conflicts. `systemd` (and in particular +`systemd-nspawn`) use the hierarchy for the following resources: + +1. The `/run/host/incoming/` directory mount point is configured for `MS_SLAVE` + mount propagation with the host, and is used as intermediary location for + mounts to establish in the container, for the implementation of `machinectl + bind`. 
Container payload should usually not directly interact with this + directory: it's used by code outside the container to insert mounts inside + it only, and is mostly an internal vehicle to achieve this. Other container + managers that want to implement similar functionality might consider using + the same directory. + +2. The `/run/host/inaccessible/` directory may be set up by the container + manager to include six file nodes: `reg`, `dir`, `fifo`, `sock`, `chr`, + `blk`. These nodes correspond with the six types of file nodes Linux knows + (with the exceptions of symlinks). Each node should be of the specific type + and have an all zero access mode, i.e. be inaccessible. The two device node + types should have major and minor of zero (which are unallocated devices on + Linux). These nodes are used as mount source for implementing the + `InaccessiblePath=` setting of unit files, i.e. file nodes to mask this way + are overmounted with these "inaccessible" inodes, guaranteeing that the file + node type does not change this way but the nodes still become + inaccessible. Note that systemd when run as PID 1 in the container payload + will create these nodes on its own if not passed in by the container + manager. However, in that case it likely lacks the privileges to create the + character and block devices nodes (there all fallbacks for this case). + +3. The `/run/host/notify` path is a good choice to place the `sd_notify()` + socket in, that may be used for the container's PID 1 to report to the + container manager when boot-up is complete. The path used for this doesn't + matter much as it is communicated via the `$NOTIFY_SOCKET` environment + variable, following the usual protocol for this, however it's suitable, and + recommended place for this socket in case ready notification is desired. + +4. The `/run/host/os-release` file contains the `/etc/os-release` file of the + host, i.e. 
may be used by the container payload to gather limited + information about the host environment, on top of what `uname -a` reports. + +5. The `/run/host/container-manager` file may be used to pass the same + information as the `$container` environment variable (see above), i.e. a + short string identifying the container manager implementation. This file + should be newline terminated. Passing this information via this file has the + benefit that payload code can easily access it, even when running + unprivileged without access to the container PID1's environment block. + +6. The `/run/host/container-uuid` file may be used to pass the same information + as the `$container_uuid` environment variable (see above). This file should + be newline terminated. + ## What You Shouldn't Do 1. Do not drop `CAP_MKNOD` from the container. `PrivateDevices=` is a commonly From 3242980582d501ec2adbcc0f794c7161056812e8 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 19 Aug 2020 17:42:33 +0200 Subject: [PATCH 6/6] core: create per-user inaccessible node from the service manager Previously, we'd create them from user-runtime-dir@.service. That has one benefit: since this service runs privileged, we can create the full set of device nodes. It has one major drawback though: it is security-wise problematic to create files/directories as a privileged user in directories owned by unprivileged users, since they can use symlinks to redirect what we want to do. As a general rule we hence avoid this logic: only unpriv code should populate unpriv directories. Hence, let's move this code to an appropriate place in the service manager. This means we lose the inaccessible block device node, but since there's already a fallback in place, this shouldn't be too bad. 
--- src/core/main.c | 16 ++++++++++++++++ src/login/user-runtime-dir.c | 8 ++------ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/core/main.c b/src/core/main.c index 8d53c0bf85..4812f309f2 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -32,6 +32,7 @@ #include "dbus-manager.h" #include "dbus.h" #include "def.h" +#include "dev-setup.h" #include "efi-random.h" #include "efivars.h" #include "emergency-action.h" @@ -53,6 +54,7 @@ #include "loopback-setup.h" #include "machine-id-setup.h" #include "manager.h" +#include "mkdir.h" #include "mount-setup.h" #include "os-util.h" #include "pager.h" @@ -2073,6 +2075,20 @@ static int initialize_runtime( if (r < 0) log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device); } + } else { + _cleanup_free_ char *p = NULL; + + /* Create the runtime directory and place the inaccessible device nodes there, if we run in + * user mode. In system mode mount_setup() already did that. */ + + r = xdg_user_runtime_dir(&p, "/systemd"); + if (r < 0) { + *ret_error_message = "$XDG_RUNTIME_DIR is not set"; + return log_emergency_errno(r, "Failed to determine $XDG_RUNTIME_DIR path: %m"); + } + + (void) mkdir_p(p, 0755); + (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID); } if (arg_timer_slack_nsec != NSEC_INFINITY) diff --git a/src/login/user-runtime-dir.c b/src/login/user-runtime-dir.c index a56c527df8..c5d27b215d 100644 --- a/src/login/user-runtime-dir.c +++ b/src/login/user-runtime-dir.c @@ -49,7 +49,6 @@ static int user_mkdir_runtime_path( uint64_t runtime_dir_size, uint64_t runtime_dir_inodes) { - const char *p; int r; assert(runtime_path); @@ -84,7 +83,8 @@ static int user_mkdir_runtime_path( goto fail; } - log_debug_errno(errno, "Failed to mount per-user tmpfs directory %s.\n" + log_debug_errno(errno, + "Failed to mount per-user tmpfs directory %s.\n" "Assuming containerized execution, ignoring: %m", runtime_path); r = chmod_and_chown(runtime_path, 0700, uid, gid); @@ 
-99,10 +99,6 @@ static int user_mkdir_runtime_path( log_warning_errno(r, "Failed to fix label of \"%s\", ignoring: %m", runtime_path); } - /* Set up inaccessible nodes now so they're available if we decide to use them with user namespaces. */ - p = strjoina(runtime_path, "/systemd"); - (void) mkdir(p, 0755); - (void) make_inaccessible_nodes(p, uid, gid); return 0; fail: