Merge pull request #8898 from poettering/nspawn-mount-block

some nspawn cgroup and mount lock-down fixes
This commit is contained in:
Zbigniew Jędrzejewski-Szmek 2018-05-08 12:54:58 +02:00 committed by GitHub
commit 6b1ca2a948
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 213 additions and 133 deletions

7
TODO
View File

@ -24,6 +24,10 @@ Janitorial Clean-ups:
Features:
* nspawn: greater control over hostname, resolv.conf, timezone, rlim
* nspawn: when operating in a scope, also create /payload subcgroup
* the error paths in usbffs_dispatch_ep() leak memory
* cgroups: figure out if we can somehow communicate in a cleaner way whether a
@ -52,9 +56,6 @@ Features:
* add --vacuum-xyz options to coredumpctl, matching those journalctl already has.
* list the exit codes from the BSD/glibc <sysexits.h> in our own
exit-codes.[ch] tables.
* SuccessExitStatus= and friends should probably also accept symbolic exit
codes names, i.e. error codes from the list maintained in exit-codes.[ch]

View File

@ -424,15 +424,16 @@ unified you (of course, I guess) need to provide only `/sys/fs/cgroup/` itself.
cgroup tree of systemd itself is out of limits for you. It's fine to *read*
from any attribute you like however. That's totally OK and welcome.
4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a container
payload running systemd, then don't get the idea that you can bind mount
only a sub-tree of the host's cgroup tree into the container. Part of the
cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every
4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a
container payload running systemd, then don't get the idea that you can bind
mount only a sub-tree of the host's cgroup tree into the container. Part of
the cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every
process, and hence any path below `/sys/fs/cgroup/` needs to match what
`/proc/$PID/cgroup` of the payload processes reports. What you can do safely
however, is mount the upper parts of the cgroup tree read-only or even
replace it with an intermediary `tmpfs`, as long as the path to the
delegated sub-tree remains accessible as-is.
however, is mount the upper parts of the cgroup tree read-only (or even
replace the middle bits with an intermediary `tmpfs` — but be careful not to
break the `statfs()` detection logic discussed above), as long as the path
to the delegated sub-tree remains accessible as-is.
5. ⚡ Currently, the algorithm for mapping between slice/scope/service unit
naming and their cgroup paths is not considered public API of systemd, and

View File

@ -81,23 +81,26 @@ static const MountEntry apivfs_table[] = {
/* ProtectKernelTunables= option and the related filesystem APIs */
static const MountEntry protect_kernel_tunables_table[] = {
{ "/proc/sys", READONLY, false },
{ "/proc/sysrq-trigger", READONLY, true },
{ "/proc/latency_stats", READONLY, true },
{ "/proc/mtrr", READONLY, true },
{ "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
{ "/proc/acpi", READONLY, true },
{ "/proc/timer_stats", READONLY, true },
{ "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
{ "/proc/asound", READONLY, true },
{ "/proc/bus", READONLY, true },
{ "/proc/fs", READONLY, true },
{ "/proc/irq", READONLY, true },
{ "/proc/kallsyms", INACCESSIBLE, true },
{ "/proc/kcore", INACCESSIBLE, true },
{ "/proc/latency_stats", READONLY, true },
{ "/proc/mtrr", READONLY, true },
{ "/proc/scsi", READONLY, true },
{ "/proc/sys", READONLY, false },
{ "/proc/sysrq-trigger", READONLY, true },
{ "/proc/timer_stats", READONLY, true },
{ "/sys", READONLY, false },
{ "/sys/kernel/debug", READONLY, true },
{ "/sys/kernel/tracing", READONLY, true },
{ "/sys/fs/bpf", READONLY, true },
{ "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
{ "/sys/fs/selinux", READWRITE, true },
{ "/sys/kernel/debug", READONLY, true },
{ "/sys/kernel/tracing", READONLY, true },
};
/* ProtectKernelModules= option */

View File

@ -141,44 +141,53 @@ finish:
return r;
}
int create_subcgroup(pid_t pid, CGroupUnified unified_requested) {
int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) {
_cleanup_free_ char *cgroup = NULL;
const char *child;
int r;
CGroupMask supported;
const char *payload;
int r;
/* In the unified hierarchy inner nodes may only contain
* subgroups, but not processes. Hence, if we running in the
* unified hierarchy and the container does the same, and we
* did not create a scope unit for the container move us and
* the container into two separate subcgroups. */
assert(pid > 1);
if (unified_requested == CGROUP_UNIFIED_NONE)
return 0;
r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
if (r < 0)
return log_error_errno(r, "Failed to determine whether the systemd controller is unified: %m");
if (r == 0)
return 0;
/* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we are running in
* the unified hierarchy and the container does the same, and we did not create a scope unit for the container
* move us and the container into two separate subcgroups.
*
* Moreover, container payloads such as systemd try to manage the cgroup they run in in full (i.e. including
* its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a
* delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the
* host systemd directly to the payload, the host and payload systemd might fight for the cgroup
* attributes. Hence, let's insert an intermediary cgroup to cover that case too.
*
* Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup
* that's fine because there's only one hierarchy anyway and controllers are enabled directly on it. On the
* legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't
* do it. */
r = cg_mask_supported(&supported);
if (r < 0)
return log_error_errno(r, "Failed to determine supported controllers: %m");
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
if (keep_unit)
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
else
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
if (r < 0)
return log_error_errno(r, "Failed to get our control group: %m");
child = strjoina(cgroup, "/payload");
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
payload = strjoina(cgroup, "/payload");
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
if (r < 0)
return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
child = strjoina(cgroup, "/supervisor");
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
if (r < 0)
return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
if (keep_unit) {
const char *supervisor;
supervisor = strjoina(cgroup, "/supervisor");
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0);
if (r < 0)
return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor);
}
/* Try to enable as many controllers as possible for the new payload. */
(void) cg_enable_everywhere(supported, supported, cgroup);

View File

@ -14,4 +14,4 @@
int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
int create_subcgroup(pid_t pid, CGroupUnified unified_requested);
int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested);

View File

@ -27,7 +27,7 @@
#include "user-util.h"
#include "util.h"
CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
CustomMount *c, *ret;
assert(l);
@ -48,8 +48,8 @@ CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
return ret;
}
void custom_mount_free_all(CustomMount *l, unsigned n) {
unsigned i;
void custom_mount_free_all(CustomMount *l, size_t n) {
size_t i;
for (i = 0; i < n; i++) {
CustomMount *m = l + i;
@ -110,8 +110,8 @@ static char *resolve_source_path(const char *dest, const char *source) {
return strdup(source);
}
int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
unsigned i;
int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
size_t i;
int r;
/* Prepare all custom mounts. This will make sure we know all temporary directories. This is called in the
@ -133,8 +133,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
if (!s)
return log_oom();
free(m->source);
m->source = s;
free_and_replace(m->source, s);
} else {
/* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
@ -165,8 +164,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
if (!s)
return log_oom();
free(*j);
*j = s;
free_and_replace(*j, s);
}
if (m->work_dir) {
@ -176,8 +174,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
if (!s)
return log_oom();
free(m->work_dir);
m->work_dir = s;
free_and_replace(m->work_dir, s);
} else {
assert(m->source);
@ -193,7 +190,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
return 0;
}
int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
_cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
const char *p = s;
CustomMount *m;
@ -239,7 +236,7 @@ int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only
return 0;
}
int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
_cleanup_free_ char *path = NULL, *opts = NULL;
const char *p = s;
CustomMount *m;
@ -275,7 +272,7 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
return 0;
}
int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
_cleanup_free_ char *upper = NULL, *destination = NULL;
_cleanup_strv_free_ char **lower = NULL;
CustomMount *m;
@ -511,6 +508,18 @@ int mount_all(const char *dest,
uid_t uid_shift, uid_t uid_range,
const char *selinux_apifs_context) {
/* Expands to two consecutive mount_table[] initializers that mask the node at (path). The first entry is a bind
 * mount whose source is left NULL and which carries MOUNT_INACCESSIBLE_REG; for entries with that flag the mount
 * loop creates an inaccessible file node (mode 0000) once and uses it as the bind mount source. The second
 * entry has a NULL source as well and remounts the result read-only with nosuid/noexec/nodev. Neither entry sets
 * MOUNT_FATAL. The expansion ends without a trailing comma, so the user of the macro supplies it. */
#define PROC_INACCESSIBLE(path) \
{ NULL, (path), NULL, NULL, MS_BIND, \
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \
{ NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
/* Expands to two consecutive mount_table[] initializers that make (path) read-only. The first entry bind mounts
 * (path) onto itself; the second has a NULL source and remounts it with MS_RDONLY plus nosuid/noexec/nodev.
 * Both entries carry MOUNT_IN_USERNS and MOUNT_APPLY_APIVFS_RO and neither sets MOUNT_FATAL. The expansion ends
 * without a trailing comma, so the user of the macro supplies it. */
#define PROC_READ_ONLY(path) \
{ (path), (path), NULL, NULL, MS_BIND, \
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
{ NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
typedef struct MountPoint {
const char *what;
const char *where;
@ -521,39 +530,72 @@ int mount_all(const char *dest,
} MountPoint;
static const MountPoint mount_table[] = {
/* inner child mounts */
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
{ "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
{ NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
{ "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
{ NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
/* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_IN_USERNS },
/* outer child mounts */
{ "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
{ "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
{ "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND,
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
{ NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
/* Make these files inaccessible to container payloads: they potentially leak information about kernel
* internals or the host's execution environment to the container */
PROC_INACCESSIBLE("/proc/kallsyms"),
PROC_INACCESSIBLE("/proc/kcore"),
PROC_INACCESSIBLE("/proc/keys"),
PROC_INACCESSIBLE("/proc/sysrq-trigger"),
PROC_INACCESSIBLE("/proc/timer_list"),
/* Make these directories read-only to container payloads: they show hardware information, and in some
* cases contain tunables the container really shouldn't have access to. */
PROC_READ_ONLY("/proc/acpi"),
PROC_READ_ONLY("/proc/apm"),
PROC_READ_ONLY("/proc/asound"),
PROC_READ_ONLY("/proc/bus"),
PROC_READ_ONLY("/proc/fs"),
PROC_READ_ONLY("/proc/irq"),
PROC_READ_ONLY("/proc/scsi"),
/* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
{ "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
MOUNT_FATAL },
{ "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL }, /* skipped if above was mounted */
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
MOUNT_FATAL },
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
MOUNT_FATAL },
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
MOUNT_FATAL },
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
#if HAVE_SELINUX
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
0 }, /* Bind mount first */
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
0 }, /* Then, make it r/o */
#endif
};
unsigned k;
int r;
_cleanup_(unlink_and_freep) char *inaccessible = NULL;
bool use_userns = (mount_settings & MOUNT_USE_USERNS);
bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
bool in_userns = (mount_settings & MOUNT_IN_USERNS);
size_t k;
int r;
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
_cleanup_free_ char *where = NULL, *options = NULL;
const char *o;
const char *o, *what;
bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
@ -569,12 +611,32 @@ int mount_all(const char *dest,
if (r < 0)
return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
if (mount_table[k].mount_settings & MOUNT_INACCESSIBLE_REG) {
if (!inaccessible) {
_cleanup_free_ char *np = NULL;
r = tempfn_random_child(NULL, "inaccessible", &np);
if (r < 0)
return log_error_errno(r, "Failed to generate inaccessible file node path: %m");
r = touch_file(np, false, USEC_INFINITY, UID_INVALID, GID_INVALID, 0000);
if (r < 0)
return log_error_errno(r, "Failed to create inaccessible file node '%s': %m", np);
inaccessible = TAKE_PTR(np);
}
what = inaccessible;
} else
what = mount_table[k].what;
r = path_is_mount_point(where, NULL, 0);
if (r < 0 && r != -ENOENT)
return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
/* Skip this entry if it is not a remount. */
if (mount_table[k].what && r > 0)
if (what && r > 0)
continue;
r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
@ -603,7 +665,7 @@ int mount_all(const char *dest,
}
r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
mount_table[k].what,
what,
where,
mount_table[k].type,
mount_table[k].flags,
@ -766,11 +828,11 @@ static int mount_overlay(const char *dest, CustomMount *m) {
int mount_custom(
const char *dest,
CustomMount *mounts, unsigned n,
CustomMount *mounts, size_t n,
bool userns, uid_t uid_shift, uid_t uid_range,
const char *selinux_apifs_context) {
unsigned i;
size_t i;
int r;
assert(dest);

View File

@ -13,12 +13,13 @@
#include "volatile-util.h"
typedef enum MountSettingsMask {
MOUNT_FATAL = 1 << 0, /* if set, a mount error is considered fatal */
MOUNT_USE_USERNS = 1 << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
MOUNT_IN_USERNS = 1 << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
MOUNT_APPLY_APIVFS_RO = 1 << 3, /* if set, /proc/sys, and /sysfs will be mounted read-only, otherwise read-write. */
MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write.
Works only if MOUNT_APPLY_APIVFS_RO is also set. */
MOUNT_FATAL = 1U << 0, /* if set, a mount error is considered fatal */
MOUNT_USE_USERNS = 1U << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
MOUNT_IN_USERNS = 1U << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
MOUNT_APPLY_APIVFS_RO = 1U << 3, /* if set, /proc/sys, and /sys will be mounted read-only, otherwise read-write. */
MOUNT_APPLY_APIVFS_NETNS = 1U << 4, /* if set, /proc/sys/net will be mounted read-write.
Works only if MOUNT_APPLY_APIVFS_RO is also set. */
MOUNT_INACCESSIBLE_REG = 1U << 5, /* if set, create an inaccessible regular file first and use as bind mount source */
} MountSettingsMask;
typedef enum CustomMountType {
@ -40,13 +41,13 @@ typedef struct CustomMount {
char *rm_rf_tmpdir;
} CustomMount;
CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t);
void custom_mount_free_all(CustomMount *l, unsigned n);
int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n);
CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t);
void custom_mount_free_all(CustomMount *l, size_t n);
int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n);
int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only);
int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s);
int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only);
int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);
int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s);
int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);
int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
@ -54,7 +55,7 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
int mount_custom(const char *dest, CustomMount *mounts, unsigned n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int setup_volatile(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int setup_volatile_state(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);

View File

@ -11,6 +11,7 @@
#include "bus-unit-util.h"
#include "bus-util.h"
#include "nspawn-register.h"
#include "special.h"
#include "stat-util.h"
#include "strv.h"
#include "util.h"
@ -309,7 +310,7 @@ int allocate_scope(
"PIDs", "au", 1, pid,
"Description", "s", description,
"Delegate", "b", 1,
"Slice", "s", isempty(slice) ? "machine.slice" : slice);
"Slice", "s", isempty(slice) ? SPECIAL_MACHINE_SLICE : slice);
if (r < 0)
return bus_log_create_error(r);

View File

@ -76,7 +76,7 @@ typedef struct Settings {
int read_only;
VolatileMode volatile_mode;
CustomMount *custom_mounts;
unsigned n_custom_mounts;
size_t n_custom_mounts;
int userns_chown;
/* [Network] */

View File

@ -165,7 +165,7 @@ static uint64_t arg_caps_retain =
(1ULL << CAP_SYS_RESOURCE) |
(1ULL << CAP_SYS_TTY_CONFIG);
static CustomMount *arg_custom_mounts = NULL;
static unsigned arg_n_custom_mounts = 0;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
@ -291,7 +291,7 @@ static void help(void) {
}
static int custom_mount_check_all(void) {
unsigned i;
size_t i;
for (i = 0; i < arg_n_custom_mounts; i++) {
CustomMount *m = &arg_custom_mounts[i];
@ -1470,31 +1470,35 @@ static int setup_resolv_conf(const char *dest) {
}
static int setup_boot_id(void) {
_cleanup_(unlink_and_freep) char *from = NULL;
_cleanup_free_ char *path = NULL;
sd_id128_t rnd = SD_ID128_NULL;
const char *from, *to;
const char *to;
int r;
/* Generate a new randomized boot ID, so that each boot-up of
* the container gets a new one */
from = "/run/proc-sys-kernel-random-boot-id";
to = "/proc/sys/kernel/random/boot_id";
r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
if (r < 0)
return log_error_errno(r, "Failed to generate random boot ID path: %m");
r = sd_id128_randomize(&rnd);
if (r < 0)
return log_error_errno(r, "Failed to generate random boot id: %m");
r = id128_write(from, ID128_UUID, rnd, false);
r = id128_write(path, ID128_UUID, rnd, false);
if (r < 0)
return log_error_errno(r, "Failed to write boot id: %m");
r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
if (r >= 0)
r = mount_verbose(LOG_ERR, NULL, to, NULL,
MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
from = TAKE_PTR(path);
to = "/proc/sys/kernel/random/boot_id";
(void) unlink(from);
return r;
r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
if (r < 0)
return r;
return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
}
static int copy_devnodes(const char *dest) {
@ -1662,26 +1666,32 @@ static int setup_keyring(void) {
}
static int setup_kmsg(int kmsg_socket) {
const char *from, *to;
_cleanup_(unlink_and_freep) char *from = NULL;
_cleanup_free_ char *fifo = NULL;
_cleanup_close_ int fd = -1;
_cleanup_umask_ mode_t u;
int fd, r;
const char *to;
int r;
assert(kmsg_socket >= 0);
u = umask(0000);
/* We create the kmsg FIFO as /run/kmsg, but immediately
* delete it after bind mounting it to /proc/kmsg. While FIFOs
* on the reading side behave very similar to /proc/kmsg,
* their writing side behaves differently from /dev/kmsg in
* that writing blocks when nothing is reading. In order to
* avoid any problems with containers deadlocking due to this
* we simply make /dev/kmsg unavailable to the container. */
from = "/run/kmsg";
/* We create the kmsg FIFO as a temporary file in /tmp, but immediately delete it after bind mounting it to
* /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
* differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
* with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
if (r < 0)
return log_error_errno(r, "Failed to generate kmsg path: %m");
if (mkfifo(fifo, 0600) < 0)
return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
from = TAKE_PTR(fifo);
to = "/proc/kmsg";
if (mkfifo(from, 0600) < 0)
return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
if (r < 0)
return r;
@ -1690,17 +1700,11 @@ static int setup_kmsg(int kmsg_socket) {
if (fd < 0)
return log_error_errno(errno, "Failed to open fifo: %m");
/* Store away the fd in the socket, so that it stays open as
* long as we run the child */
/* Store away the fd in the socket, so that it stays open as long as we run the child */
r = send_one_fd(kmsg_socket, fd, 0);
safe_close(fd);
if (r < 0)
return log_error_errno(r, "Failed to send FIFO fd: %m");
/* And now make the FIFO unavailable as /run/kmsg... */
(void) unlink(from);
return 0;
}
@ -2265,7 +2269,7 @@ static int inner_child(
_cleanup_free_ char *home = NULL;
char as_uuid[37];
unsigned n_env = 1;
size_t n_env = 1;
const char *envp[] = {
"PATH=" DEFAULT_PATH_COMPAT,
NULL, /* container */
@ -3639,11 +3643,9 @@ static int run(int master,
if (r < 0)
return r;
if (arg_keep_unit) {
r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
if (r < 0)
return r;
}
r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
if (r < 0)
return r;
r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
if (r < 0)