Merge pull request #8898 from poettering/nspawn-mount-block

some nspawn cgroup and mount lock-down fixes
2018-05-08 12:54:58 +02:00 · 2018-05-08 12:54:58 +02:00 · 6b1ca2a948
parent 6157694dc6 c7db2a9de4
commit 6b1ca2a948
10 changed files with 213 additions and 133 deletions
--- a/7
+++ b/7
@ -24,6 +24,10 @@ Janitorial Clean-ups:

 Features:

+* nspawn: greater control over hostname, resolv.conf, timezone, rlim
+
+* nspawn: when operating in a scope, also create /payload subcrgoup
+
 * the error paths in usbffs_dispatch_ep() leak memory

 * cgroups: figure out if we can somehow communicate in a cleaner way whether a
@ -52,9 +56,6 @@ Features:

 * add --vacuum-xyz options to coredumpctl, matching those journalctl already has.

-* list the exit codes from the BSD/glibc <sysexits.h> in our own
-  exit-codes.[ch] tables.
-
 * SuccessExitStatus= and friends should probably also accept symbolic exit
  codes names, i.e. error codes from the list maintained in exit-codes.[ch]

--- a/doc/CGROUP_DELEGATION.md
+++ b/doc/CGROUP_DELEGATION.md
@ -424,15 +424,16 @@ unified you (of course, I guess) need to provide only `/sys/fs/cgroup/` itself.
   cgroup tree of systemd itself is out of limits for you. It's fine to *read*
   from any attribute you like however. That's totally OK and welcome.

-4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a container
-   payload running systemd, then don't get the idea that you can bind mount
-   only a sub-tree of the host's cgroup tree into the container. Part of the
-   cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every
+4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a
+   container payload running systemd, then don't get the idea that you can bind
+   mount only a sub-tree of the host's cgroup tree into the container. Part of
+   the cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every
   process, and hence any path below `/sys/fs/cgroup/` needs to match what
   `/proc/$PID/cgroup` of the payload processes reports. What you can do safely
-   however, is mount the upper parts of the cgroup tree read-only or even
-   replace it with an intermediary `tmpfs`, as long as the path to the
-   delegated sub-tree remains accessible as-is.
+   however, is mount the upper parts of the cgroup tree read-only (or even
+   replace the middle bits with an intermediary `tmpfs` — but be careful not to
+   break the `statfs()` detection logic discussed above), as long as the path
+   to the delegated sub-tree remains accessible as-is.

 5. ⚡ Currently, the algorithm for mapping between slice/scope/service unit
   naming and their cgroup paths is not considered public API of systemd, and
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@ -81,23 +81,26 @@ static const MountEntry apivfs_table[] = {

 /* ProtectKernelTunables= option and the related filesystem APIs */
 static const MountEntry protect_kernel_tunables_table[] = {
-        { "/proc/sys",           READONLY,     false },
-        { "/proc/sysrq-trigger", READONLY,     true  },
-        { "/proc/latency_stats", READONLY,     true  },
-        { "/proc/mtrr",          READONLY,     true  },
-        { "/proc/apm",           READONLY,     true  }, /* Obsolete API, there's no point in permitting access to this, ever */
        { "/proc/acpi",          READONLY,     true  },
-        { "/proc/timer_stats",   READONLY,     true  },
+        { "/proc/apm",           READONLY,     true  }, /* Obsolete API, there's no point in permitting access to this, ever */
        { "/proc/asound",        READONLY,     true  },
        { "/proc/bus",           READONLY,     true  },
        { "/proc/fs",            READONLY,     true  },
        { "/proc/irq",           READONLY,     true  },
+        { "/proc/kallsyms",      INACCESSIBLE, true  },
+        { "/proc/kcore",         INACCESSIBLE, true  },
+        { "/proc/latency_stats", READONLY,     true  },
+        { "/proc/mtrr",          READONLY,     true  },
+        { "/proc/scsi",          READONLY,     true  },
+        { "/proc/sys",           READONLY,     false },
+        { "/proc/sysrq-trigger", READONLY,     true  },
+        { "/proc/timer_stats",   READONLY,     true  },
        { "/sys",                READONLY,     false },
-        { "/sys/kernel/debug",   READONLY,     true  },
-        { "/sys/kernel/tracing", READONLY,     true  },
        { "/sys/fs/bpf",         READONLY,     true  },
        { "/sys/fs/cgroup",      READWRITE,    false }, /* READONLY is set by ProtectControlGroups= option */
        { "/sys/fs/selinux",     READWRITE,    true  },
+        { "/sys/kernel/debug",   READONLY,     true  },
+        { "/sys/kernel/tracing", READONLY,     true  },
 };

 /* ProtectKernelModules= option */
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@ -141,44 +141,53 @@ finish:
        return r;
 }

-int create_subcgroup(pid_t pid, CGroupUnified unified_requested) {
+int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) {
        _cleanup_free_ char *cgroup = NULL;
-        const char *child;
-        int r;
        CGroupMask supported;
+        const char *payload;
+        int r;

-        /* In the unified hierarchy inner nodes may only contain
-         * subgroups, but not processes. Hence, if we running in the
-         * unified hierarchy and the container does the same, and we
-         * did not create a scope unit for the container move us and
-         * the container into two separate subcgroups. */
+        assert(pid > 1);

-        if (unified_requested == CGROUP_UNIFIED_NONE)
-                return 0;
-
-        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine whether the systemd controller is unified: %m");
-        if (r == 0)
-                return 0;
+        /* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we running in
+         * the unified hierarchy and the container does the same, and we did not create a scope unit for the container
+         * move us and the container into two separate subcgroups.
+         *
+         * Moreover, container payloads such as systemd try to manage the cgroup they run in in full (i.e. including
+         * its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a
+         * delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the
+         * host systemd directly to the payload, the host and payload systemd might fight for the cgroup
+         * attributes. Hence, let's insert an intermediary cgroup to cover that case too.
+         *
+         * Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup
+         * that's fine because there's only one hiearchy anyway and controllers are enabled directly on it. On the
+         * legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't
+         * do it. */

        r = cg_mask_supported(&supported);
        if (r < 0)
                return log_error_errno(r, "Failed to determine supported controllers: %m");

-        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+        if (keep_unit)
+                r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+        else
+                r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
        if (r < 0)
                return log_error_errno(r, "Failed to get our control group: %m");

-        child = strjoina(cgroup, "/payload");
-        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
+        payload = strjoina(cgroup, "/payload");
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
        if (r < 0)
-                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);

-        child = strjoina(cgroup, "/supervisor");
-        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
-        if (r < 0)
-                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+        if (keep_unit) {
+                const char *supervisor;
+
+                supervisor = strjoina(cgroup, "/supervisor");
+                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor);
+        }

        /* Try to enable as many controllers as possible for the new payload. */
        (void) cg_enable_everywhere(supported, supported, cgroup);
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@ -14,4 +14,4 @@

 int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
 int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
-int create_subcgroup(pid_t pid, CGroupUnified unified_requested);
+int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested);
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@ -27,7 +27,7 @@
 #include "user-util.h"
 #include "util.h"

-CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
+CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
        CustomMount *c, *ret;

        assert(l);
@ -48,8 +48,8 @@ CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
        return ret;
 }

-void custom_mount_free_all(CustomMount *l, unsigned n) {
-        unsigned i;
+void custom_mount_free_all(CustomMount *l, size_t n) {
+        size_t i;

        for (i = 0; i < n; i++) {
                CustomMount *m = l + i;
@ -110,8 +110,8 @@ static char *resolve_source_path(const char *dest, const char *source) {
        return strdup(source);
 }

-int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
-        unsigned i;
+int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
+        size_t i;
        int r;

        /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
@ -133,8 +133,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
                        if (!s)
                                return log_oom();

-                        free(m->source);
-                        m->source = s;
+                        free_and_replace(m->source, s);
                } else {
                        /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */

@ -165,8 +164,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
                                if (!s)
                                        return log_oom();

-                                free(*j);
-                                *j = s;
+                                free_and_replace(*j, s);
                        }

                        if (m->work_dir) {
@ -176,8 +174,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
                                if (!s)
                                        return log_oom();

-                                free(m->work_dir);
-                                m->work_dir = s;
+                                free_and_replace(m->work_dir, s);
                        } else {
                                assert(m->source);

@ -193,7 +190,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
        return 0;
 }

-int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
+int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
        _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
        const char *p = s;
        CustomMount *m;
@ -239,7 +236,7 @@ int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only
        return 0;
 }

-int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
+int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
        _cleanup_free_ char *path = NULL, *opts = NULL;
        const char *p = s;
        CustomMount *m;
@ -275,7 +272,7 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
        return 0;
 }

-int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
+int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
        _cleanup_free_ char *upper = NULL, *destination = NULL;
        _cleanup_strv_free_ char **lower = NULL;
        CustomMount *m;
@ -511,6 +508,18 @@ int mount_all(const char *dest,
              uid_t uid_shift, uid_t uid_range,
              const char *selinux_apifs_context) {

+#define PROC_INACCESSIBLE(path)                                         \
+        { NULL, (path), NULL, NULL, MS_BIND,                            \
+          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \
+        { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
+          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
+
+#define PROC_READ_ONLY(path)                                            \
+        { (path), (path), NULL, NULL, MS_BIND,                          \
+          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
+        { NULL,   (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
+          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
+
        typedef struct MountPoint {
                const char *what;
                const char *where;
@ -521,39 +530,72 @@ int mount_all(const char *dest,
        } MountPoint;

        static const MountPoint mount_table[] = {
-                /* inner child mounts */
-                { "proc",                "/proc",               "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              MOUNT_FATAL|MOUNT_IN_USERNS },
-                { "/proc/sys",           "/proc/sys",           NULL,    NULL,        MS_BIND,                                                   MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* Bind mount first ... */
-                { "/proc/sys/net",       "/proc/sys/net",       NULL,    NULL,        MS_BIND,                                                   MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
-                { NULL,                  "/proc/sys",           NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* ... then, make it r/o */
-                { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND,                                                   MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },  /* Bind mount first ... */
-                { NULL,                  "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },  /* ... then, make it r/o */
+                /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
+                { "proc",            "/proc",           "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,
+                  MOUNT_FATAL|MOUNT_IN_USERNS },

-                /* outer child mounts */
-                { "tmpfs",               "/tmp",                "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         MOUNT_FATAL },
-                { "tmpfs",               "/sys",                "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,                              MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
-                { "sysfs",               "/sys",                "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO },    /* skipped if above was mounted */
-                { "sysfs",               "/sys",                "sysfs", NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              MOUNT_FATAL },                          /* skipped if above was mounted */
+                { "/proc/sys",       "/proc/sys",       NULL,    NULL,        MS_BIND,
+                  MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* Bind mount first ... */
+
+                { "/proc/sys/net",   "/proc/sys/net",   NULL,    NULL,        MS_BIND,
+                  MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
+
+                { NULL,              "/proc/sys",       NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
+                  MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* ... then, make it r/o */
+
+                /* Make these files inaccessible to container payloads: they potentially leak information about kernel
+                 * internals or the host's execution environment to the container */
+                PROC_INACCESSIBLE("/proc/kallsyms"),
+                PROC_INACCESSIBLE("/proc/kcore"),
+                PROC_INACCESSIBLE("/proc/keys"),
+                PROC_INACCESSIBLE("/proc/sysrq-trigger"),
+                PROC_INACCESSIBLE("/proc/timer_list"),
+
+                /* Make these directories read-only to container payloads: they show hardware information, and in some
+                 * cases contain tunables the container really shouldn't have access to. */
+                PROC_READ_ONLY("/proc/acpi"),
+                PROC_READ_ONLY("/proc/apm"),
+                PROC_READ_ONLY("/proc/asound"),
+                PROC_READ_ONLY("/proc/bus"),
+                PROC_READ_ONLY("/proc/fs"),
+                PROC_READ_ONLY("/proc/irq"),
+                PROC_READ_ONLY("/proc/scsi"),
+
+                /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
+                { "tmpfs",           "/tmp",            "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+                  MOUNT_FATAL },
+                { "tmpfs",           "/sys",            "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
+                { "sysfs",           "/sys",            "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO },    /* skipped if above was mounted */
+                { "sysfs",           "/sys",            "sysfs", NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,
+                  MOUNT_FATAL },                          /* skipped if above was mounted */
+                { "tmpfs",           "/dev",            "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,
+                  MOUNT_FATAL },
+                { "tmpfs",           "/dev/shm",        "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+                  MOUNT_FATAL },
+                { "tmpfs",           "/run",            "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+                  MOUNT_FATAL },

-                { "tmpfs",               "/dev",                "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  MOUNT_FATAL },
-                { "tmpfs",               "/dev/shm",            "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         MOUNT_FATAL },
-                { "tmpfs",               "/run",                "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         MOUNT_FATAL },
 #if HAVE_SELINUX
-                { "/sys/fs/selinux",     "/sys/fs/selinux",     NULL,    NULL,        MS_BIND,                                                   0 },  /* Bind mount first */
-                { NULL,                  "/sys/fs/selinux",     NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 },  /* Then, make it r/o */
+                { "/sys/fs/selinux", "/sys/fs/selinux", NULL,    NULL,        MS_BIND,
+                  0 },  /* Bind mount first */
+                { NULL,              "/sys/fs/selinux", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
+                  0 },  /* Then, make it r/o */
 #endif
        };

-        unsigned k;
-        int r;
+        _cleanup_(unlink_and_freep) char *inaccessible = NULL;
        bool use_userns = (mount_settings & MOUNT_USE_USERNS);
        bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
        bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
        bool in_userns = (mount_settings & MOUNT_IN_USERNS);
+        size_t k;
+        int r;

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                _cleanup_free_ char *where = NULL, *options = NULL;
-                const char *o;
+                const char *o, *what;
                bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);

                if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
@ -569,12 +611,32 @@ int mount_all(const char *dest,
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);

+                if (mount_table[k].mount_settings & MOUNT_INACCESSIBLE_REG) {
+
+                        if (!inaccessible) {
+                                _cleanup_free_ char *np = NULL;
+
+                                r = tempfn_random_child(NULL, "inaccessible", &np);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to generate inaccessible file node path: %m");
+
+                                r = touch_file(np, false, USEC_INFINITY, UID_INVALID, GID_INVALID, 0000);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to create inaccessible file node '%s': %m", np);
+
+                                inaccessible = TAKE_PTR(np);
+                        }
+
+                        what = inaccessible;
+                } else
+                        what = mount_table[k].what;
+
                r = path_is_mount_point(where, NULL, 0);
                if (r < 0 && r != -ENOENT)
                        return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);

                /* Skip this entry if it is not a remount. */
-                if (mount_table[k].what && r > 0)
+                if (what && r > 0)
                        continue;

                r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
@ -603,7 +665,7 @@ int mount_all(const char *dest,
                }

                r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
-                                  mount_table[k].what,
+                                  what,
                                  where,
                                  mount_table[k].type,
                                  mount_table[k].flags,
@ -766,11 +828,11 @@ static int mount_overlay(const char *dest, CustomMount *m) {

 int mount_custom(
                const char *dest,
-                CustomMount *mounts, unsigned n,
+                CustomMount *mounts, size_t n,
                bool userns, uid_t uid_shift, uid_t uid_range,
                const char *selinux_apifs_context) {

-        unsigned i;
+        size_t i;
        int r;

        assert(dest);
--- a/src/nspawn/nspawn-mount.h
+++ b/src/nspawn/nspawn-mount.h
@ -13,12 +13,13 @@
 #include "volatile-util.h"

 typedef enum MountSettingsMask {
-        MOUNT_FATAL              = 1 << 0, /* if set, a mount error is considered fatal */
-        MOUNT_USE_USERNS         = 1 << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
-        MOUNT_IN_USERNS          = 1 << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
-        MOUNT_APPLY_APIVFS_RO    = 1 << 3, /* if set, /proc/sys, and /sysfs will be mounted read-only, otherwise read-write. */
-        MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write.
-                                              Works only if MOUNT_APPLY_APIVFS_RO is also set. */
+        MOUNT_FATAL              = 1U << 0, /* if set, a mount error is considered fatal */
+        MOUNT_USE_USERNS         = 1U << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
+        MOUNT_IN_USERNS          = 1U << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
+        MOUNT_APPLY_APIVFS_RO    = 1U << 3, /* if set, /proc/sys, and /sys will be mounted read-only, otherwise read-write. */
+        MOUNT_APPLY_APIVFS_NETNS = 1U << 4, /* if set, /proc/sys/net will be mounted read-write.
+                                               Works only if MOUNT_APPLY_APIVFS_RO is also set. */
+        MOUNT_INACCESSIBLE_REG   = 1U << 5, /* if set, create an inaccessible regular file first and use as bind mount source */
 } MountSettingsMask;

 typedef enum CustomMountType {
@ -40,13 +41,13 @@ typedef struct CustomMount {
        char *rm_rf_tmpdir;
 } CustomMount;

-CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t);
-void custom_mount_free_all(CustomMount *l, unsigned n);
-int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n);
+CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t);
+void custom_mount_free_all(CustomMount *l, size_t n);
+int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n);

-int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only);
-int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s);
-int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only);
+int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);
+int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s);
+int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);

 int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
 int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
@ -54,7 +55,7 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
 int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
 int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);

-int mount_custom(const char *dest, CustomMount *mounts, unsigned n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
+int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);

 int setup_volatile(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
 int setup_volatile_state(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
--- a/src/nspawn/nspawn-register.c
+++ b/src/nspawn/nspawn-register.c
@ -11,6 +11,7 @@
 #include "bus-unit-util.h"
 #include "bus-util.h"
 #include "nspawn-register.h"
+#include "special.h"
 #include "stat-util.h"
 #include "strv.h"
 #include "util.h"
@ -309,7 +310,7 @@ int allocate_scope(
                                  "PIDs", "au", 1, pid,
                                  "Description", "s", description,
                                  "Delegate", "b", 1,
-                                  "Slice", "s", isempty(slice) ? "machine.slice" : slice);
+                                  "Slice", "s", isempty(slice) ? SPECIAL_MACHINE_SLICE : slice);
        if (r < 0)
                return bus_log_create_error(r);

--- a/src/nspawn/nspawn-settings.h
+++ b/src/nspawn/nspawn-settings.h
@ -76,7 +76,7 @@ typedef struct Settings {
        int read_only;
        VolatileMode volatile_mode;
        CustomMount *custom_mounts;
-        unsigned n_custom_mounts;
+        size_t n_custom_mounts;
        int userns_chown;

        /* [Network] */
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@ -165,7 +165,7 @@ static uint64_t arg_caps_retain =
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
 static CustomMount *arg_custom_mounts = NULL;
-static unsigned arg_n_custom_mounts = 0;
+static size_t arg_n_custom_mounts = 0;
 static char **arg_setenv = NULL;
 static bool arg_quiet = false;
 static bool arg_register = true;
@ -291,7 +291,7 @@ static void help(void) {
 }

 static int custom_mount_check_all(void) {
-        unsigned i;
+        size_t i;

        for (i = 0; i < arg_n_custom_mounts; i++) {
                CustomMount *m = &arg_custom_mounts[i];
@ -1470,31 +1470,35 @@ static int setup_resolv_conf(const char *dest) {
 }

 static int setup_boot_id(void) {
+        _cleanup_(unlink_and_freep) char *from = NULL;
+        _cleanup_free_ char *path = NULL;
        sd_id128_t rnd = SD_ID128_NULL;
-        const char *from, *to;
+        const char *to;
        int r;

        /* Generate a new randomized boot ID, so that each boot-up of
         * the container gets a new one */

-        from = "/run/proc-sys-kernel-random-boot-id";
-        to = "/proc/sys/kernel/random/boot_id";
+        r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate random boot ID path: %m");

        r = sd_id128_randomize(&rnd);
        if (r < 0)
                return log_error_errno(r, "Failed to generate random boot id: %m");

-        r = id128_write(from, ID128_UUID, rnd, false);
+        r = id128_write(path, ID128_UUID, rnd, false);
        if (r < 0)
                return log_error_errno(r, "Failed to write boot id: %m");

-        r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
-        if (r >= 0)
-                r = mount_verbose(LOG_ERR, NULL, to, NULL,
-                                  MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
+        from = TAKE_PTR(path);
+        to = "/proc/sys/kernel/random/boot_id";

-        (void) unlink(from);
-        return r;
+        r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+        if (r < 0)
+                return r;
+
+        return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
 }

 static int copy_devnodes(const char *dest) {
@ -1662,26 +1666,32 @@ static int setup_keyring(void) {
 }

 static int setup_kmsg(int kmsg_socket) {
-        const char *from, *to;
+        _cleanup_(unlink_and_freep) char *from = NULL;
+        _cleanup_free_ char *fifo = NULL;
+        _cleanup_close_ int fd = -1;
        _cleanup_umask_ mode_t u;
-        int fd, r;
+        const char *to;
+        int r;

        assert(kmsg_socket >= 0);

        u = umask(0000);

-        /* We create the kmsg FIFO as /run/kmsg, but immediately
-         * delete it after bind mounting it to /proc/kmsg. While FIFOs
-         * on the reading side behave very similar to /proc/kmsg,
-         * their writing side behaves differently from /dev/kmsg in
-         * that writing blocks when nothing is reading. In order to
-         * avoid any problems with containers deadlocking due to this
-         * we simply make /dev/kmsg unavailable to the container. */
-        from = "/run/kmsg";
+        /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
+         * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
+         * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
+         * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
+
+        r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate kmsg path: %m");
+
+        if (mkfifo(fifo, 0600) < 0)
+                return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
+
+        from = TAKE_PTR(fifo);
        to = "/proc/kmsg";

-        if (mkfifo(from, 0600) < 0)
-                return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
        r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
        if (r < 0)
                return r;
@ -1690,17 +1700,11 @@ static int setup_kmsg(int kmsg_socket) {
        if (fd < 0)
                return log_error_errno(errno, "Failed to open fifo: %m");

-        /* Store away the fd in the socket, so that it stays open as
-         * long as we run the child */
+        /* Store away the fd in the socket, so that it stays open as long as we run the child */
        r = send_one_fd(kmsg_socket, fd, 0);
-        safe_close(fd);
-
        if (r < 0)
                return log_error_errno(r, "Failed to send FIFO fd: %m");

-        /* And now make the FIFO unavailable as /run/kmsg... */
-        (void) unlink(from);
-
        return 0;
 }

@ -2265,7 +2269,7 @@ static int inner_child(

        _cleanup_free_ char *home = NULL;
        char as_uuid[37];
-        unsigned n_env = 1;
+        size_t n_env = 1;
        const char *envp[] = {
                "PATH=" DEFAULT_PATH_COMPAT,
                NULL, /* container */
@ -3639,11 +3643,9 @@ static int run(int master,
        if (r < 0)
                return r;

-        if (arg_keep_unit) {
-                r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
-                if (r < 0)
-                        return r;
-        }
+        r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
+        if (r < 0)
+                return r;

        r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
        if (r < 0)