diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index 51db003a67..eb411102bc 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -238,6 +238,15 @@ together with , . + + + + Takes the path to an OCI runtime bundle to invoke, as specified in the OCI Runtime Specification. In + this case no .nspawn file is loaded, and the root directory and various settings are read + from the OCI runtime JSON data (but data passed on the command line takes precedence). + + @@ -952,6 +961,16 @@ make them read-only, using . + + + + Make the specified path inaccessible in the container. This over-mounts the specified path + (which must exist in the container) with a file node of the same type that is empty and has the most + restrictive access mode supported. This is an effective way to mask files, directories and other file system + objects from the container payload. This option may be used more than once in case all specified paths are + masked. + + @@ -1084,6 +1103,42 @@ same as the one reported on the host. + + MODE + + Configures how to set up standard input, output and error output for the container payload, as + well as the /dev/console device for the container. Takes one of + , , or . If + a pseudo-TTY is allocated and made available as /dev/console + in the container. It is then bi-directionally connected to the standard input and output passed to + systemd-nspawn. is similar but only the output of the container + is propagated and no input from the caller is read. In mode a pseudo TTY is allocated, + but it is not connected anywhere. Finally, in mode no pseudo TTY is allocated, but the + passed standard input, output and error output file descriptors are passed on — as they are — to the container + payload. In this mode /dev/console will not exist in the container. Note that in this mode + the container payload generally cannot be a full init system as init systems tend to require + /dev/console to be available. 
On the other hand, in this mode container invocations can be + used within shell pipelines. This is because intermediary pseudo TTYs do not permit independent bidirectional + propagation of the end-of-file (EOF) condition, which is necessary for shell pipelines to work + correctly. + + Note that the pipe mode should be used carefully, as passing arbitrary file descriptors + to less trusted container payloads might open up unwanted interfaces for access by the container payload. For + example, if a passed file descriptor refers to a TTY of some form, APIs such as TIOCSTI + may be used to synthesize input that might be used for escaping the container. Hence pipe mode + should only be used if the payload is sufficiently trusted or when the standard input/output/error output file + descriptors are known safe, for example pipes. Defaults to interactive if + systemd-nspawn is invoked from a terminal, and read-only + otherwise. + + + + + + + Equivalent to --console=pipe. + + diff --git a/man/systemd.nspawn.xml b/man/systemd.nspawn.xml index 39e1d6fe73..1485a26f02 100644 --- a/man/systemd.nspawn.xml +++ b/man/systemd.nspawn.xml @@ -425,6 +425,17 @@ is privileged (see above). + + Inaccessible= + + Masks the specified file or directory in the container, by over-mounting it with an empty file + node of the same type with the most restrictive access mode. Takes a file system path as argument. This option + may be used multiple times to mask multiple files or directories. This option is equivalent to the command line + switch --inaccessible=, see + systemd-nspawn(1) for details + about the specific options supported. This setting is privileged (see above).
+ + Overlay= OverlayReadOnly= diff --git a/src/basic/capability-util.c b/src/basic/capability-util.c index b944ee6ea1..45fadb9faa 100644 --- a/src/basic/capability-util.c +++ b/src/basic/capability-util.c @@ -47,6 +47,13 @@ unsigned long cap_last_cap(void) { if (r >= 0) { r = safe_atolu(content, &p); if (r >= 0) { + + if (p > 63) /* Safety for the future: if one day the kernel learns more than 64 caps, + * then we are in trouble (since we, as much userspace and kernel space + * store capability masks in uint64_t types. Let's hence protect + * ourselves against that and always cap at 63 for now. */ + p = 63; + saved = p; valid = true; return p; @@ -58,17 +65,15 @@ unsigned long cap_last_cap(void) { if (prctl(PR_CAPBSET_READ, p) < 0) { - /* Hmm, look downwards, until we find one that - * works */ + /* Hmm, look downwards, until we find one that works */ for (p--; p > 0; p --) if (prctl(PR_CAPBSET_READ, p) >= 0) break; } else { - /* Hmm, look upwards, until we find one that doesn't - * work */ - for (;; p++) + /* Hmm, look upwards, until we find one that doesn't work */ + for (; p < 63; p++) if (prctl(PR_CAPBSET_READ, p+1) < 0) break; } @@ -363,6 +368,7 @@ bool ambient_capabilities_supported(void) { int capability_quintet_enforce(const CapabilityQuintet *q) { _cleanup_cap_free_ cap_t c = NULL; + bool need_set_proc_again = false; int r; if (q->ambient != (uint64_t) -1) { @@ -393,7 +399,6 @@ int capability_quintet_enforce(const CapabilityQuintet *q) { if (cap_set_flag(c, CAP_INHERITABLE, 1, &cv, CAP_SET) < 0) return -errno; - if (cap_set_flag(c, CAP_PERMITTED, 1, &cv, CAP_SET) < 0) return -errno; @@ -426,8 +431,15 @@ int capability_quintet_enforce(const CapabilityQuintet *q) { if (q->inheritable != (uint64_t) -1) { cap_flag_value_t old_value, new_value; - if (cap_get_flag(c, cv, CAP_INHERITABLE, &old_value) < 0) + if (cap_get_flag(c, cv, CAP_INHERITABLE, &old_value) < 0) { + if (errno == EINVAL) /* If the kernel knows more caps than this + * version of libcap, then this 
will return + * EINVAL. In that case, simply ignore it, + * pretend it doesn't exist. */ + continue; + return -errno; + } new_value = (q->inheritable & m) ? CAP_SET : CAP_CLEAR; @@ -442,8 +454,12 @@ int capability_quintet_enforce(const CapabilityQuintet *q) { if (q->permitted != (uint64_t) -1) { cap_flag_value_t old_value, new_value; - if (cap_get_flag(c, cv, CAP_PERMITTED, &old_value) < 0) + if (cap_get_flag(c, cv, CAP_PERMITTED, &old_value) < 0) { + if (errno == EINVAL) + continue; + return -errno; + } new_value = (q->permitted & m) ? CAP_SET : CAP_CLEAR; @@ -458,8 +474,12 @@ int capability_quintet_enforce(const CapabilityQuintet *q) { if (q->effective != (uint64_t) -1) { cap_flag_value_t old_value, new_value; - if (cap_get_flag(c, cv, CAP_EFFECTIVE, &old_value) < 0) + if (cap_get_flag(c, cv, CAP_EFFECTIVE, &old_value) < 0) { + if (errno == EINVAL) + continue; + return -errno; + } new_value = (q->effective & m) ? CAP_SET : CAP_CLEAR; @@ -472,9 +492,39 @@ int capability_quintet_enforce(const CapabilityQuintet *q) { } } - if (changed) - if (cap_set_proc(c) < 0) + if (changed) { + _cleanup_cap_free_ cap_t modified = NULL; + + /* In order to change the bounding caps, we need to keep CAP_SETPCAP for a bit + * longer. Let's add it to our list hence for now. */ + if (q->bounding != (uint64_t) -1) { + cap_value_t cv = CAP_SETPCAP; + + modified = cap_dup(c); + if (!modified) + return -ENOMEM; + + if (cap_set_flag(modified, CAP_PERMITTED, 1, &cv, CAP_SET) < 0) + return -errno; + if (cap_set_flag(modified, CAP_EFFECTIVE, 1, &cv, CAP_SET) < 0) + return -errno; + + if (cap_compare(modified, c) == 0) { + /* No change? then drop this nonsense again */ + cap_free(modified); + modified = NULL; + } + } + + /* Now, let's enforce the caps for the first time. Note that this is where we acquire + * caps in any of the sets we currently don't have. 
We have to do this before + * droppoing the bounding caps below, since at that point we can never acquire new + * caps in inherited/permitted/effective anymore, but only lose them.*/ + if (cap_set_proc(modified ?: c) < 0) return -errno; + + need_set_proc_again = !!modified; + } } if (q->bounding != (uint64_t) -1) { @@ -483,5 +533,13 @@ int capability_quintet_enforce(const CapabilityQuintet *q) { return r; } + /* If needed, let's now set the caps again, this time in the final version, which differs from what + * we have already set only in the CAP_SETPCAP bit, which we needed for dropping the bounding + * bits. This call only undoes bits and doesn't acquire any which means the bounding caps don't + * matter. */ + if (need_set_proc_again) + if (cap_set_proc(c) < 0) + return -errno; + return 0; } diff --git a/src/basic/capability-util.h b/src/basic/capability-util.h index 02c7d5c3e2..e69b2fbb95 100644 --- a/src/basic/capability-util.h +++ b/src/basic/capability-util.h @@ -33,10 +33,12 @@ static inline void cap_free_charpp(char **p) { } #define _cleanup_cap_free_charp_ _cleanup_(cap_free_charpp) +static inline uint64_t all_capabilities(void) { + return UINT64_MAX >> (63 - cap_last_cap()); +} + static inline bool cap_test_all(uint64_t caps) { - uint64_t m; - m = (UINT64_C(1) << (cap_last_cap() + 1)) - 1; - return FLAGS_SET(caps, m); + return FLAGS_SET(caps, all_capabilities()); } bool ambient_capabilities_supported(void); diff --git a/src/nspawn/meson.build b/src/nspawn/meson.build index be54ba36c5..31217c7b46 100644 --- a/src/nspawn/meson.build +++ b/src/nspawn/meson.build @@ -10,6 +10,8 @@ libnspawn_core_sources = files(''' nspawn-mount.h nspawn-network.c nspawn-network.h + nspawn-oci.c + nspawn-oci.h nspawn-patch-uid.c nspawn-patch-uid.h nspawn-register.c diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index 97fa092cae..4a16f56d1c 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -263,7 +263,7 @@ static int 
mount_legacy_cgroup_hierarchy( if (r > 0) return 0; - mkdir_p(to, 0755); + (void) mkdir_p(to, 0755); /* The superblock mount options of the mount point need to be * identical to the hosts', and hence writable... */ diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf index dec53a06f3..79304d21ab 100644 --- a/src/nspawn/nspawn-gperf.gperf +++ b/src/nspawn/nspawn-gperf.gperf @@ -62,6 +62,7 @@ Files.Volatile, config_parse_volatile_mode, 0, of Files.Bind, config_parse_bind, 0, 0 Files.BindReadOnly, config_parse_bind, 1, 0 Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0 +Files.Inaccessible, config_parse_inaccessible, 0, 0 Files.Overlay, config_parse_overlay, 0, 0 Files.OverlayReadOnly, config_parse_overlay, 1, 0 Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown) diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 4878586c4e..cf093cd0a2 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -65,6 +65,7 @@ void custom_mount_free_all(CustomMount *l, size_t n) { } strv_free(m->lower); + free(m->type_argument); } free(l); @@ -116,32 +117,40 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) { for (i = 0; i < n; i++) { CustomMount *m = l + i; - if (m->source) { - char *s; + /* /proc we mount in the inner child, i.e. when we acquired CLONE_NEWPID. All other mounts we mount + * already in the outer child, so that the mounts are already established before CLONE_NEWPID and in + * particular CLONE_NEWUSER. This also means any custom mounts below /proc also need to be mounted in + * the inner child, not the outer one. Determine this here. */ + m->in_userns = path_startswith(m->destination, "/proc"); - s = resolve_source_path(dest, m->source); - if (!s) - return log_oom(); + if (m->type == CUSTOM_MOUNT_BIND) { + if (m->source) { + char *s; - free_and_replace(m->source, s); - } else { - /* No source specified? 
In that case, use a throw-away temporary directory in /var/tmp */ + s = resolve_source_path(dest, m->source); + if (!s) + return log_oom(); - m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX"); - if (!m->rm_rf_tmpdir) - return log_oom(); + free_and_replace(m->source, s); + } else { + /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */ - if (!mkdtemp(m->rm_rf_tmpdir)) { - m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir); - return log_error_errno(errno, "Failed to acquire temporary directory: %m"); + m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX"); + if (!m->rm_rf_tmpdir) + return log_oom(); + + if (!mkdtemp(m->rm_rf_tmpdir)) { + m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir); + return log_error_errno(errno, "Failed to acquire temporary directory: %m"); + } + + m->source = strjoin(m->rm_rf_tmpdir, "/src"); + if (!m->source) + return log_oom(); + + if (mkdir(m->source, 0755) < 0) + return log_error_errno(errno, "Failed to create %s: %m", m->source); } - - m->source = strjoin(m->rm_rf_tmpdir, "/src"); - if (!m->source) - return log_oom(); - - if (mkdir(m->source, 0755) < 0) - return log_error_errno(errno, "Failed to create %s: %m", m->source); } if (m->type == CUSTOM_MOUNT_OVERLAY) { @@ -223,6 +232,7 @@ int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) m->destination = TAKE_PTR(destination); m->read_only = read_only; m->options = TAKE_PTR(opts); + return 0; } @@ -327,6 +337,29 @@ int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_onl return 0; } +int inaccessible_mount_parse(CustomMount **l, size_t *n, const char *s) { + _cleanup_free_ char *path = NULL; + CustomMount *m; + + assert(l); + assert(n); + assert(s); + + if (!path_is_absolute(s)) + return -EINVAL; + + path = strdup(s); + if (!path) + return -ENOMEM; + + m = custom_mount_add(l, n, CUSTOM_MOUNT_INACCESSIBLE); + if (!m) + return -ENOMEM; + + m->destination = TAKE_PTR(path); + return 0; +} + int tmpfs_patch_options( 
const char *options, uid_t uid_shift, @@ -494,9 +527,9 @@ int mount_all(const char *dest, uid_t uid_shift, const char *selinux_apifs_context) { -#define PROC_INACCESSIBLE(path) \ - { NULL, (path), NULL, NULL, MS_BIND, \ - MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \ +#define PROC_INACCESSIBLE_REG(path) \ + { "/run/systemd/inaccessible/reg", (path), NULL, NULL, MS_BIND, \ + MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \ { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \ MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */ @@ -531,11 +564,11 @@ int mount_all(const char *dest, /* Make these files inaccessible to container payloads: they potentially leak information about kernel * internals or the host's execution environment to the container */ - PROC_INACCESSIBLE("/proc/kallsyms"), - PROC_INACCESSIBLE("/proc/kcore"), - PROC_INACCESSIBLE("/proc/keys"), - PROC_INACCESSIBLE("/proc/sysrq-trigger"), - PROC_INACCESSIBLE("/proc/timer_list"), + PROC_INACCESSIBLE_REG("/proc/kallsyms"), + PROC_INACCESSIBLE_REG("/proc/kcore"), + PROC_INACCESSIBLE_REG("/proc/keys"), + PROC_INACCESSIBLE_REG("/proc/sysrq-trigger"), + PROC_INACCESSIBLE_REG("/proc/timer_list"), /* Make these directories read-only to container payloads: they show hardware information, and in some * cases contain tunables the container really shouldn't have access to. 
*/ @@ -573,7 +606,6 @@ int mount_all(const char *dest, #endif }; - _cleanup_(unlink_and_freep) char *inaccessible = NULL; bool use_userns = (mount_settings & MOUNT_USE_USERNS); bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS); bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO); @@ -584,7 +616,7 @@ int mount_all(const char *dest, for (k = 0; k < ELEMENTSOF(mount_table); k++) { _cleanup_free_ char *where = NULL, *options = NULL; - const char *o, *what; + const char *o; bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL); if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS)) @@ -603,33 +635,14 @@ int mount_all(const char *dest, if (r < 0) return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where); - if (mount_table[k].mount_settings & MOUNT_INACCESSIBLE_REG) { - - if (!inaccessible) { - _cleanup_free_ char *np = NULL; - - r = tempfn_random_child(NULL, "inaccessible", &np); - if (r < 0) - return log_error_errno(r, "Failed to generate inaccessible file node path: %m"); - - r = touch_file(np, false, USEC_INFINITY, UID_INVALID, GID_INVALID, 0000); - if (r < 0) - return log_error_errno(r, "Failed to create inaccessible file node '%s': %m", np); - - inaccessible = TAKE_PTR(np); - } - - what = inaccessible; - } else - what = mount_table[k].what; - - r = path_is_mount_point(where, NULL, 0); - if (r < 0 && r != -ENOENT) - return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where); - /* Skip this entry if it is not a remount. */ - if (what && r > 0) - continue; + if (mount_table[k].what) { + r = path_is_mount_point(where, NULL, 0); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where); + if (r > 0) + continue; + } r = mkdir_userns_p(dest, where, 0755, (use_userns && !in_userns) ? uid_shift : UID_INVALID); if (r < 0 && r != -EEXIST) { @@ -654,7 +667,7 @@ int mount_all(const char *dest, } r = mount_verbose(fatal ? 
LOG_ERR : LOG_DEBUG, - what, + mount_table[k].what, where, mount_table[k].type, mount_table[k].flags, @@ -667,7 +680,6 @@ int mount_all(const char *dest, } static int mount_bind(const char *dest, CustomMount *m) { - _cleanup_free_ char *where = NULL; struct stat source_st, dest_st; int r; @@ -711,7 +723,6 @@ static int mount_bind(const char *dest, CustomMount *m) { r = touch(where); if (r < 0) return log_error_errno(r, "Failed to create mount point %s: %m", where); - } r = mount_verbose(LOG_ERR, m->source, where, NULL, MS_BIND | MS_REC, m->options); @@ -773,7 +784,6 @@ static char *joined_and_escaped_lower_dirs(char **lower) { } static int mount_overlay(const char *dest, CustomMount *m) { - _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL; const char *options; int r; @@ -815,11 +825,59 @@ static int mount_overlay(const char *dest, CustomMount *m) { return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options); } +static int mount_inaccessible(const char *dest, CustomMount *m) { + _cleanup_free_ char *where = NULL; + const char *source; + struct stat st; + int r; + + assert(dest); + assert(m); + + r = chase_symlinks_and_stat(m->destination, dest, CHASE_PREFIX_ROOT, &where, &st); + if (r < 0) { + log_full_errno(m->graceful ? LOG_DEBUG : LOG_ERR, r, "Failed to resolve %s/%s: %m", dest, m->destination); + return m->graceful ? 0 : r; + } + + assert_se(source = mode_to_inaccessible_node(st.st_mode)); + + r = mount_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, source, where, NULL, MS_BIND, NULL); + if (r < 0) + return m->graceful ? 0 : r; + + r = mount_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, NULL, where, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, NULL); + if (r < 0) + return m->graceful ? 
0 : r; + + return 0; +} + +static int mount_arbitrary(const char *dest, CustomMount *m) { + _cleanup_free_ char *where = NULL; + int r; + + assert(dest); + assert(m); + + r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where); + if (r < 0) + return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination); + if (r == 0) { /* Doesn't exist yet? */ + r = mkdir_p_label(where, 0755); + if (r < 0) + return log_error_errno(r, "Creating mount point for mount %s failed: %m", where); + } + + return mount_verbose(LOG_ERR, m->source, where, m->type_argument, 0, m->options); +} + int mount_custom( const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, - const char *selinux_apifs_context) { + const char *selinux_apifs_context, + bool in_userns) { size_t i; int r; @@ -829,6 +887,9 @@ int mount_custom( for (i = 0; i < n; i++) { CustomMount *m = mounts + i; + if (m->in_userns != in_userns) + continue; + switch (m->type) { case CUSTOM_MOUNT_BIND: @@ -843,6 +904,14 @@ int mount_custom( r = mount_overlay(dest, m); break; + case CUSTOM_MOUNT_INACCESSIBLE: + r = mount_inaccessible(dest, m); + break; + + case CUSTOM_MOUNT_ARBITRARY: + r = mount_arbitrary(dest, m); + break; + default: assert_not_reached("Unknown custom mount type"); } diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h index e060ca0e4d..ff6990c734 100644 --- a/src/nspawn/nspawn-mount.h +++ b/src/nspawn/nspawn-mount.h @@ -13,14 +13,15 @@ typedef enum MountSettingsMask { MOUNT_APPLY_APIVFS_RO = 1 << 3, /* if set, /proc/sys, and /sys will be mounted read-only, otherwise read-write. */ MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write. Works only if MOUNT_APPLY_APIVFS_RO is also set. 
*/ - MOUNT_INACCESSIBLE_REG = 1 << 5, /* if set, create an inaccessible regular file first and use as bind mount source */ - MOUNT_APPLY_TMPFS_TMP = 1 << 6, /* if set, /tmp will be mounted as tmpfs */ + MOUNT_APPLY_TMPFS_TMP = 1 << 5, /* if set, /tmp will be mounted as tmpfs */ } MountSettingsMask; typedef enum CustomMountType { CUSTOM_MOUNT_BIND, CUSTOM_MOUNT_TMPFS, CUSTOM_MOUNT_OVERLAY, + CUSTOM_MOUNT_INACCESSIBLE, + CUSTOM_MOUNT_ARBITRARY, _CUSTOM_MOUNT_TYPE_MAX, _CUSTOM_MOUNT_TYPE_INVALID = -1 } CustomMountType; @@ -34,6 +35,9 @@ typedef struct CustomMount { char *work_dir; char **lower; char *rm_rf_tmpdir; + char *type_argument; /* only for CUSTOM_MOUNT_ARBITRARY */ + bool graceful; + bool in_userns; } CustomMount; CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t); @@ -43,11 +47,12 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n); int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only); int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s); int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only); +int inaccessible_mount_parse(CustomMount **l, size_t *n, const char *s); int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, const char *selinux_apifs_context); int mount_sysfs(const char *dest, MountSettingsMask mount_settings); -int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); +int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool in_userns); int setup_volatile_mode(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); diff --git a/src/nspawn/nspawn-oci.c b/src/nspawn/nspawn-oci.c new file mode 100644 index 0000000000..5a41de3810 --- /dev/null +++ 
b/src/nspawn/nspawn-oci.c @@ -0,0 +1,2352 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include +#include +#include + +#include "bus-util.h" +#include "cap-list.h" +#include "cpu-set-util.h" +#include "env-util.h" +#include "fs-util.h" +#include "hostname-util.h" +#include "json.h" +#include "missing_sched.h" +#include "nspawn-oci.h" +#include "path-util.h" +#include "rlimit-util.h" +#include "seccomp-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +/* TODO: + * OCI runtime tool implementation + * hooks + * + * Spec issues: + * + * How is RLIM_INFINITY supposed to be encoded? + * configured effective caps is bullshit, as execv() corrupts it anyway + * pipes bind mounted is *very* different from pipes newly created, comments regarding bind mount or not are bogus + * annotation values structured? or string? + * configurable file system namespace path, but then also root path? wtf? + * apply sysctl inside of the container? or outside? + * how is unlimited pids tasks limit to be encoded? + * what are the defaults for caps if not specified? + * what are the default uid/gid mappings if one is missing but the other set, or when user ns is on but no namespace configured + * the source field of "mounts" is really weird, as it cannot realistically be relative to the bundle, since we never know if that's what the fs wants + * spec contradicts itself on the mount "type" field, as the example uses "bind" as type, but it's not listed in /proc/filesystem, and is something made up by /bin/mount + * if type of mount is left out, what shall be assumed? "bind"? + * readonly mounts is entirely redundant? + * should escaping be applied when joining mount options with ","? 
+ * devices cgroup support is bogus, "allow" and "deny" on the kernel level is about adding/removing entries, not about access + * spec needs to say that "rwm" devices cgroup combination can't be the empty string + * cgroupsv1 crap: kernel, kernelTCP, swappiness, disableOOMKiller, swap, devices, leafWeight + * general: it shouldn't leak lower level abstractions this obviously + * unmanageable cgroups stuff: realtimeRuntime/realtimePeriod + * needs to say what happens when some option is not specified, i.e. which defaults apply + * no architecture? no personality? + * seccomp example and logic is simply broken: there's no constant "SCMP_ACT_ERRNO". + * spec should say what to do with unknown props + * /bin/mount regarding NFS and FUSE required? + * what does terminal=false mean? + * sysctl inside or outside? whitelisting? + * + * Unsupported: + * + * apparmorProfile + * selinuxLabel + mountLabel + * hugepageLimits + * network + * rdma + * intelRdt + * swappiness, disableOOMKiller, kernel, kernelTCP, leafWeight (because it's dead, cgroupsv2 can't do it and hence systemd neither) + * + * Non-slice cgroup paths + * Propagation that is not slave + shared + * more than one uid/gid mapping, mappings with a container base != 0, or non-matching uid/gid mappings + * device cgroups access = false items that are not catchall + * device cgroups matches where minor is specified, but major isn't. similar where major is specified but char/block is not. also, any match that only has a type set that has less than "rwm" set. also, any entry that has none of rwm set.
+ * + */ + +static int oci_unexpected(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + json_log(v, flags, 0, "Unexpected OCI element '%s' of type '%s'.", name, json_variant_type_to_string(json_variant_type(v))); + return -EINVAL; +} + +static int oci_unsupported(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + json_log(v, flags, 0, "Unsupported OCI element '%s' of type '%s'.", name, json_variant_type_to_string(json_variant_type(v))); + return -EOPNOTSUPP; +} + +static int oci_terminal(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + + /* If not specified, or set to true, we'll default to either an interactive or a read-only + * console. If specified as false, we'll forcibly move to "pipe" mode though. */ + s->console_mode = json_variant_boolean(v) ? _CONSOLE_MODE_INVALID : CONSOLE_PIPE; + return 0; +} + +static int oci_console_dimension(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + unsigned *u = userdata; + uintmax_t k; + + assert(u); + + k = json_variant_unsigned(variant); + if (k == 0) { + json_log(variant, flags, 0, "Console size field '%s' is too small.", strna(name)); + return -ERANGE; + } + if (k > USHRT_MAX) { /* TIOCSWINSZ's struct winsize uses "unsigned short" for width and height */ + json_log(variant, flags, 0, "Console size field '%s' is too large.", strna(name)); + return -ERANGE; + } + + *u = (unsigned) k; + return 0; +} + +static int oci_console_size(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "height", JSON_VARIANT_UNSIGNED, oci_console_dimension, offsetof(Settings, console_height), JSON_MANDATORY }, + { "width", JSON_VARIANT_UNSIGNED, oci_console_dimension, offsetof(Settings, console_width), JSON_MANDATORY }, + {} + }; + + return json_dispatch(v, table, oci_unexpected, flags, userdata); +} + +static int
oci_absolute_path(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + char **p = userdata; + const char *n; + int r; + + assert(p); + + n = json_variant_string(v); + + if (!path_is_absolute(n)) { + json_log(v, flags, 0, "Path in JSON field '%s' is not absolute: %s", strna(name), n); + return -EINVAL; + } + + r = free_and_strdup(p, n); + if (r < 0) + return log_oom(); + + return 0; +} + +static int oci_env(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + char ***l = userdata; + JsonVariant *e; + int r; + + assert(l); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + const char *n; + + if (!json_variant_is_string(e)) { + json_log(e, flags, 0, "Environment array contains non-string."); + return -EINVAL; + } + + assert_se(n = json_variant_string(e)); + + if (!env_assignment_is_valid(n)) { + json_log(e, flags, 0, "Environment assignment not valid: %s", n); + return -EINVAL; + } + + r = strv_extend(l, n); + if (r < 0) + return log_oom(); + } + + return 0; +} + +static int oci_args(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + _cleanup_strv_free_ char **l = NULL; + char ***value = userdata; + JsonVariant *e; + int r; + + assert(value); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + const char *n; + + if (!json_variant_is_string(e)) { + json_log(v, flags, 0, "Argument is not a string."); + return -EINVAL; + } + + assert_se(n = json_variant_string(e)); + + r = strv_extend(&l, n); + if (r < 0) + return log_oom(); + } + + if (strv_isempty(l)) { + json_log(v, flags, 0, "Argument list empty, refusing."); + return -EINVAL; + } + + if (isempty(l[0])) { + json_log(v, flags, 0, "Executable name is empty, refusing."); + return -EINVAL; + } + + strv_free_and_replace(*value, l); + return 0; +} + +static int oci_rlimit_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + const char *z; + int t, *type = userdata; + + assert_se(type); + + z = startswith(json_variant_string(v), 
"RLIMIT_"); + if (!z) { + json_log(v, flags, 0, "rlimit entry's name does not begin with 'RLIMIT_', refusing: %s", json_variant_string(v)); + return -EINVAL; + } + + t = rlimit_from_string(z); + if (t < 0) { + json_log(v, flags, 0, "rlimit name unknown: %s", json_variant_string(v)); + return -EINVAL; + } + + *type = t; + return 0; +} + +static int oci_rlimit_value(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + rlim_t z, *value = userdata; + + assert(value); + + if (json_variant_is_negative(v)) + z = RLIM_INFINITY; + else { + if (!json_variant_is_unsigned(v)) { + json_log(v, flags, 0, "rlimits limit not unsigned, refusing."); + return -ERANGE; + } + + z = (rlim_t) json_variant_unsigned(v); + + if ((uintmax_t) z != json_variant_unsigned(v)) { + json_log(v, flags, 0, "rlimits limit out of range, refusing."); + return -EINVAL; + } + } + + *value = z; + return 0; +} + +static int oci_rlimits(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + Settings *s = userdata; + JsonVariant *e; + int r; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + + struct rlimit_data { + int type; + rlim_t soft; + rlim_t hard; + } data = { + .type = -1, + .soft = RLIM_INFINITY, + .hard = RLIM_INFINITY, + }; + + static const JsonDispatch table[] = { + { "soft", JSON_VARIANT_NUMBER, oci_rlimit_value, offsetof(struct rlimit_data, soft), JSON_MANDATORY }, + { "hard", JSON_VARIANT_NUMBER, oci_rlimit_value, offsetof(struct rlimit_data, hard), JSON_MANDATORY }, + { "type", JSON_VARIANT_STRING, oci_rlimit_type, offsetof(struct rlimit_data, type), JSON_MANDATORY }, + {} + }; + + + r = json_dispatch(e, table, oci_unexpected, flags, &data); + if (r < 0) + return r; + + assert(data.type >= 0); + assert(data.type < _RLIMIT_MAX); + + if (s->rlimit[data.type]) { + json_log(v, flags, 0, "rlimits array contains duplicate entry, refusing."); + return -EINVAL; + } + + s->rlimit[data.type] = new(struct rlimit, 1); + if (!s->rlimit[data.type]) + 
return log_oom(); + + *s->rlimit[data.type] = (struct rlimit) { + .rlim_cur = data.soft, + .rlim_max = data.hard, + }; + + } + return 0; +} + +static int oci_capability_array(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint64_t *mask = userdata, m = 0; + JsonVariant *e; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + const char *n; + int cap; + + if (!json_variant_is_string(e)) { + json_log(v, flags, 0, "Entry in capabilities array is not a string."); + return -EINVAL; + } + + assert_se(n = json_variant_string(e)); + + cap = capability_from_name(n); + if (cap < 0) { + json_log(v, flags, 0, "Unknown capability: %s", n); + return -EINVAL; + } + + m |= UINT64_C(1) << cap; + } + + if (*mask == (uint64_t) -1) + *mask = m; + else + *mask |= m; + + return 0; +} + +static int oci_capabilities(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "effective", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, effective) }, + { "bounding", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, bounding) }, + { "inheritable", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, inheritable) }, + { "permitted", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, permitted) }, + { "ambient", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, ambient) }, + {} + }; + + Settings *s = userdata; + int r; + + assert(s); + + r = json_dispatch(v, table, oci_unexpected, flags, &s->full_capabilities); + if (r < 0) + return r; + + if (s->full_capabilities.bounding != (uint64_t) -1) { + s->capability = s->full_capabilities.bounding; + s->drop_capability = ~s->full_capabilities.bounding; + } + + return 0; +} + +static int oci_oom_score_adj(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + intmax_t k; + + assert(s); + + k = json_variant_integer(v); + if (k 
< OOM_SCORE_ADJ_MIN || k > OOM_SCORE_ADJ_MAX) {
+                json_log(v, flags, 0, "oomScoreAdj value out of range: %ji", k);
+                return -EINVAL;
+        }
+
+        s->oom_score_adjust = (int) k;
+        s->oom_score_adjust_set = true;
+
+        return 0;
+}
+
+/* Parses one numeric UID or GID into the uid_t that userdata points to. Values that do not survive
+ * the round trip through uid_t, or that uid_is_valid() rejects, yield -EINVAL. */
+static int oci_uid_gid(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        uid_t *uid = userdata, u;
+        uintmax_t k;
+
+        assert(uid);
+        assert_cc(sizeof(uid_t) == sizeof(gid_t));
+
+        k = json_variant_unsigned(v);
+        u = (uid_t) k;
+        if ((uintmax_t) u != k) {
+                /* k is unsigned, hence %ju: %ji would print large values as negative numbers */
+                json_log(v, flags, 0, "UID/GID out of range: %ju", k);
+                return -EINVAL;
+        }
+
+        if (!uid_is_valid(u)) {
+                json_log(v, flags, 0, "UID/GID is not valid: " UID_FMT, u);
+                return -EINVAL;
+        }
+
+        *uid = u;
+        return 0;
+}
+
+/* Appends every entry of the "additionalGids" array to s->supplementary_gids, growing the array by
+ * one element per entry. */
+static int oci_supplementary_gids(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        Settings *s = userdata;
+        JsonVariant *e;
+        int r;
+
+        assert(s);
+
+        JSON_VARIANT_ARRAY_FOREACH(e, v) {
+                gid_t gid, *a;
+
+                if (!json_variant_is_unsigned(e)) {
+                        json_log(v, flags, 0, "Supplementary GID entry is not a GID.");
+                        return -EINVAL;
+                }
+
+                r = oci_uid_gid(name, e, flags, &gid);
+                if (r < 0)
+                        return r;
+
+                /* Assign through a temporary so the old array is not lost if the allocation fails */
+                a = reallocarray(s->supplementary_gids, s->n_supplementary_gids + 1, sizeof(gid_t));
+                if (!a)
+                        return log_oom();
+
+                s->supplementary_gids = a;
+                s->supplementary_gids[s->n_supplementary_gids++] = gid;
+        }
+
+        return 0;
+}
+
+/* Parses the "user" object: mandatory "uid" and "gid", optional "additionalGids". */
+static int oci_user(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        static const JsonDispatch table[] = {
+                { "uid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(Settings, uid), JSON_MANDATORY },
+                { "gid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(Settings, gid), JSON_MANDATORY },
+                { "additionalGids", JSON_VARIANT_ARRAY, oci_supplementary_gids, 0, 0 },
+                {}
+        };
+
+        return json_dispatch(v, table, oci_unexpected, flags, userdata);
+}
+
+/* Parses the "process" object by dispatching each known member to its handler. */
+static int oci_process(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+
+        static const
JsonDispatch table[] = {
+                { "terminal", JSON_VARIANT_BOOLEAN, oci_terminal, 0, 0 },
+                { "consoleSize", JSON_VARIANT_OBJECT, oci_console_size, 0, 0 },
+                { "cwd", JSON_VARIANT_STRING, oci_absolute_path, offsetof(Settings, working_directory), 0 },
+                { "env", JSON_VARIANT_ARRAY, oci_env, offsetof(Settings, environment), 0 },
+                { "args", JSON_VARIANT_ARRAY, oci_args, offsetof(Settings, parameters), 0 },
+                { "rlimits", JSON_VARIANT_ARRAY, oci_rlimits, 0, 0 },
+                { "apparmorProfile", JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE },
+                { "capabilities", JSON_VARIANT_OBJECT, oci_capabilities, 0, 0 },
+                { "noNewPrivileges", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(Settings, no_new_privileges), 0 },
+                { "oomScoreAdj", JSON_VARIANT_INTEGER, oci_oom_score_adj, 0, 0 },
+                { "selinuxLabel", JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE },
+                { "user", JSON_VARIANT_OBJECT, oci_user, 0, 0 },
+                {}
+        };
+
+        return json_dispatch(v, table, oci_unexpected, flags, userdata);
+}
+/* Parses the "root" object: "path" goes to Settings.root, "readonly" to Settings.read_only. */
+static int oci_root(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+
+        static const JsonDispatch table[] = {
+                { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(Settings, root) },
+                { "readonly", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(Settings, read_only) },
+                {}
+        };
+
+        return json_dispatch(v, table, oci_unexpected, flags, userdata);
+}
+/* Validates the "hostname" string with hostname_is_valid() and stores a copy in s->hostname.
+ * Returns -EINVAL for an invalid name, or the log_oom() result if the copy fails. */
+static int oci_hostname(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        Settings *s = userdata;
+        const char *n;
+
+        assert(s);
+
+        assert_se(n = json_variant_string(v));
+
+        if (!hostname_is_valid(n, false)) {
+                json_log(v, flags, 0, "Hostname string is not a valid hostname: %s", n);
+                return -EINVAL;
+        }
+
+        if (free_and_strdup(&s->hostname, n) < 0)
+                return log_oom();
+
+        return 0;
+}
+/* Tells oci_mounts() which mount destinations from the OCI data to skip. */
+static bool oci_exclude_mount(const char *path) {
+
+        /* Returns "true" for all mounts we insist to mount on our own, and hence ignore the OCI data.
*/
+
+        if (PATH_IN_SET(path,
+                        "/dev",
+                        "/dev/mqueue",
+                        "/dev/pts",
+                        "/dev/shm",
+                        "/proc",
+                        "/proc/acpi",
+                        "/proc/apm",
+                        "/proc/asound",
+                        "/proc/bus",
+                        "/proc/fs",
+                        "/proc/irq",
+                        "/proc/kallsyms",
+                        "/proc/kcore",
+                        "/proc/keys",
+                        "/proc/scsi",
+                        "/proc/sys",
+                        "/proc/sys/net",
+                        "/proc/sysrq-trigger",
+                        "/proc/timer_list",
+                        "/run",
+                        "/sys",
+                        "/sys/fs/selinux",
+                        "/tmp"))
+                return true;
+
+        /* Similar, skip the whole /sys/fs/cgroup subtree */
+        if (path_startswith(path, "/sys/fs/cgroup"))
+                return true;
+
+        return false;
+}
+
+/* Parses the "mounts" array. Every entry whose destination is not excluded by oci_exclude_mount()
+ * becomes a CustomMount in s->custom_mounts: a bind mount if "type" is absent or "bind" (relative
+ * sources are resolved against s->bundle), an arbitrary mount otherwise. The temporary strings of an
+ * entry are freed on both the skip and the failure path. */
+static int oci_mounts(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        Settings *s = userdata;
+        JsonVariant *e;
+        int r;
+
+        assert(s);
+
+        JSON_VARIANT_ARRAY_FOREACH(e, v) {
+
+                struct mount_data {
+                        char *destination;
+                        char *source;
+                        char *type;
+                        char **options;
+                } data = {};
+
+                static const JsonDispatch table[] = {
+                        { "destination", JSON_VARIANT_STRING, oci_absolute_path, offsetof(struct mount_data, destination), JSON_MANDATORY },
+                        { "source", JSON_VARIANT_STRING, json_dispatch_string, offsetof(struct mount_data, source), 0 },
+                        { "options", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(struct mount_data, options), 0, },
+                        { "type", JSON_VARIANT_STRING, json_dispatch_string, offsetof(struct mount_data, type), 0 },
+                        {}
+                };
+
+                _cleanup_free_ char *joined_options = NULL;
+                CustomMount *m;
+
+                r = json_dispatch(e, table, oci_unexpected, flags, &data);
+                if (r < 0)
+                        goto fail_item;
+
+                if (!path_is_absolute(data.destination)) {
+                        json_log(e, flags, 0, "Mount destination not an absolute path: %s", data.destination);
+                        r = -EINVAL;
+                        goto fail_item;
+                }
+
+                if (oci_exclude_mount(data.destination))
+                        goto skip_item;
+
+                if (data.options) {
+                        joined_options = strv_join(data.options, ",");
+                        if (!joined_options) {
+                                r = log_oom();
+                                goto fail_item;
+                        }
+                }
+
+                if (!data.type || streq(data.type, "bind")) {
+
+                        /* "source" is optional in the table above, hence it may still be NULL here
+                         * and must not be handed to path_is_absolute() */
+                        if (data.source && !path_is_absolute(data.source)) {
+                                char *joined;
+
+                                joined =
path_join(s->bundle, data.source);
+                                if (!joined) {
+                                        r = log_oom();
+                                        goto fail_item;
+                                }
+
+                                free_and_replace(data.source, joined);
+                        }
+                        /* Bind mounts are recorded without a type string. */
+                        data.type = mfree(data.type);
+
+                        m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_BIND);
+                } else
+                        m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_ARBITRARY);
+                if (!m) {
+                        r = log_oom();
+                        goto fail_item;
+                }
+                /* Hand the strings over to the new mount entry. */
+                m->destination = TAKE_PTR(data.destination);
+                m->source = TAKE_PTR(data.source);
+                m->options = TAKE_PTR(joined_options);
+                m->type_argument = TAKE_PTR(data.type);
+
+                strv_free(data.options);
+                continue;
+                /* Error exit for one entry: release its temporaries and abort parsing with r. */
+        fail_item:
+                free(data.destination);
+                free(data.source);
+                strv_free(data.options);
+                free(data.type);
+
+                return r;
+                /* Excluded destination: release the temporaries and move on to the next entry. */
+        skip_item:
+                free(data.destination);
+                free(data.source);
+                strv_free(data.options);
+                free(data.type);
+        }
+
+        return 0;
+}
+/* Maps an OCI namespace type name to the matching CLONE_NEW* flag, stored through userdata. */
+static int oci_namespace_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        unsigned long *nsflags = userdata;
+        const char *n;
+
+        assert(nsflags);
+        assert_se(n = json_variant_string(v));
+
+        /* We don't use namespace_flags_from_string() here, as the OCI spec uses slightly different names than the
+         * kernel here.
*/
+        if (streq(n, "pid"))
+                *nsflags = CLONE_NEWPID;
+        else if (streq(n, "network"))
+                *nsflags = CLONE_NEWNET;
+        else if (streq(n, "mount"))
+                *nsflags = CLONE_NEWNS;
+        else if (streq(n, "ipc"))
+                *nsflags = CLONE_NEWIPC;
+        else if (streq(n, "uts"))
+                *nsflags = CLONE_NEWUTS;
+        else if (streq(n, "user"))
+                *nsflags = CLONE_NEWUSER;
+        else if (streq(n, "cgroup"))
+                *nsflags = CLONE_NEWCGROUP;
+        else {
+                json_log(v, flags, 0, "Unknown namespace type, refusing: %s", n);
+                return -EINVAL;
+        }
+
+        return 0;
+}
+
+/* Parses the "namespaces" array. The namespace types are collected into a flag mask; a "path" is
+ * only accepted for the network namespace, and only once. A mount namespace is required. The mask
+ * is then translated into private_network, userns_mode, use_cgns and clone_ns_flags. */
+static int oci_namespaces(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        Settings *s = userdata;
+        unsigned long n = 0;
+        JsonVariant *e;
+        int r;
+
+        assert_se(s);
+
+        JSON_VARIANT_ARRAY_FOREACH(e, v) {
+
+                struct namespace_data {
+                        unsigned long type;
+                        char *path;
+                } data = {};
+
+                static const JsonDispatch table[] = {
+                        { "type", JSON_VARIANT_STRING, oci_namespace_type, offsetof(struct namespace_data, type), JSON_MANDATORY },
+                        { "path", JSON_VARIANT_STRING, oci_absolute_path, offsetof(struct namespace_data, path), 0 },
+                        {}
+                };
+
+                r = json_dispatch(e, table, oci_unexpected, flags, &data);
+                if (r < 0) {
+                        free(data.path);
+                        return r;
+                }
+
+                if (data.path) {
+                        if (data.type != CLONE_NEWNET) {
+                                json_log(e, flags, 0, "Specifying namespace path for non-network namespace is not supported.");
+                                free(data.path);
+                                return -EOPNOTSUPP;
+                        }
+
+                        if (s->network_namespace_path) {
+                                json_log(e, flags, 0, "Network namespace path specified more than once, refusing.");
+                                free(data.path);
+                                return -EINVAL;
+                        }
+
+                        /* The field is known to be NULL here, hence nothing to free: just take
+                         * over ownership of the parsed string */
+                        s->network_namespace_path = data.path;
+                }
+
+                if (FLAGS_SET(n, data.type)) {
+                        json_log(e, flags, 0, "Duplicate namespace specification, refusing.");
+                        return -EINVAL;
+                }
+
+                n |= data.type;
+        }
+
+        if (!FLAGS_SET(n, CLONE_NEWNS)) {
+                json_log(v, flags, 0, "Containers without file system namespace aren't supported.");
+                return -EOPNOTSUPP;
+        }
+
+        s->private_network = FLAGS_SET(n,
CLONE_NEWNET);
+        s->userns_mode = FLAGS_SET(n, CLONE_NEWUSER) ? USER_NAMESPACE_FIXED : USER_NAMESPACE_NO;
+        s->use_cgns = FLAGS_SET(n, CLONE_NEWCGROUP);
+
+        s->clone_ns_flags = n & (CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
+
+        return 0;
+}
+
+/* Parses the size of a UID/GID range into the uid_t userdata points to. Returns -ERANGE if the
+ * value does not fit into uid_t or is zero. */
+static int oci_uid_gid_range(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        uid_t *uid = userdata, u;
+        uintmax_t k;
+
+        assert(uid);
+        assert_cc(sizeof(uid_t) == sizeof(gid_t));
+
+        /* This is very much like oci_uid_gid(), except the checks are a bit different, as this is a UID range rather
+         * than a specific UID, and hence (uid_t) -1 has no special significance. OTOH a range of zero makes no
+         * sense. */
+
+        k = json_variant_unsigned(v);
+        u = (uid_t) k;
+        if ((uintmax_t) u != k) {
+                /* k is unsigned, hence %ju rather than %ji */
+                json_log(v, flags, 0, "UID/GID out of range: %ju", k);
+                return -ERANGE;
+        }
+        if (u == 0) {
+                json_log(v, flags, 0, "UID/GID range can't be zero.");
+                return -ERANGE;
+        }
+
+        *uid = u;
+        return 0;
+}
+
+/* Parses a UID or GID mapping array. An empty array is a no-op, and more than one entry is refused
+ * with -EOPNOTSUPP. The single entry is validated and stored in s->uid_shift/s->uid_range; if a
+ * range was already stored, the new entry must match it. */
+static int oci_uid_gid_mappings(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        struct mapping_data {
+                uid_t host_id;
+                uid_t container_id;
+                uid_t range;
+        } data = {
+                .host_id = UID_INVALID,
+                .container_id = UID_INVALID,
+                .range = 0,
+        };
+
+        static const JsonDispatch table[] = {
+                { "containerID", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(struct mapping_data, container_id), JSON_MANDATORY },
+                { "hostID", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(struct mapping_data, host_id), JSON_MANDATORY },
+                { "size", JSON_VARIANT_UNSIGNED, oci_uid_gid_range, offsetof(struct mapping_data, range), JSON_MANDATORY },
+                {}
+        };
+
+        Settings *s = userdata;
+        JsonVariant *e;
+        int r;
+
+        assert(s);
+
+        if (json_variant_elements(v) == 0)
+                return 0;
+
+        if (json_variant_elements(v) > 1) {
+                json_log(v, flags, 0, "UID/GID mappings with more than one entry are not supported.");
+                return -EOPNOTSUPP;
+        }
+
+        assert_se(e = json_variant_by_index(v, 0));
+
+        r = json_dispatch(e, table,
oci_unexpected, flags, &data);
+        if (r < 0)
+                return r;
+
+        /* uid_t arithmetic is unsigned, so a wrapped sum is smaller than its base */
+        if (data.host_id + data.range < data.host_id ||
+            data.container_id + data.range < data.container_id) {
+                json_log(v, flags, 0, "UID/GID range goes beyond UID/GID validity range, refusing.");
+                return -EINVAL;
+        }
+
+        if (data.container_id != 0) {
+                json_log(v, flags, 0, "UID/GID mappings with a non-zero container base are not supported.");
+                return -EOPNOTSUPP;
+        }
+
+        if (data.range < 0x10000)
+                json_log(v, flags|JSON_WARNING, 0, "UID/GID mapping with less than 65536 UID/GIDS set up, you are looking for trouble.");
+
+        if (s->uid_range != UID_INVALID &&
+            (s->uid_shift != data.host_id || s->uid_range != data.range)) {
+                json_log(v, flags, 0, "Non-matching UID and GID mappings are not supported.");
+                return -EOPNOTSUPP;
+        }
+
+        s->uid_shift = data.host_id;
+        s->uid_range = data.range;
+
+        return 0;
+}
+
+/* Translates the OCI device type ("c"/"u", "b", "p") into the S_IFMT bits of the mode_t that
+ * userdata points to, leaving the other mode bits untouched. */
+static int oci_device_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        mode_t *mode = userdata;
+        const char *t;
+
+        assert(mode);
+        assert_se(t = json_variant_string(v));
+
+        if (STR_IN_SET(t, "c", "u"))
+                *mode = (*mode & ~S_IFMT) | S_IFCHR;
+        else if (streq(t, "b"))
+                *mode = (*mode & ~S_IFMT) | S_IFBLK;
+        else if (streq(t, "p"))
+                *mode = (*mode & ~S_IFMT) | S_IFIFO;
+        else {
+                json_log(v, flags, 0, "Unknown device type: %s", t);
+                return -EINVAL;
+        }
+
+        return 0;
+}
+
+/* Parses a device major number into the unsigned userdata points to. Returns -ERANGE if
+ * DEVICE_MAJOR_VALID() rejects it. */
+static int oci_device_major(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        unsigned *u = userdata;
+        uintmax_t k;
+
+        assert_se(u);
+
+        k = json_variant_unsigned(v);
+        if (!DEVICE_MAJOR_VALID(k)) {
+                /* k is unsigned, hence %ju rather than %ji */
+                json_log(v, flags, 0, "Device major %ju out of range.", k);
+                return -ERANGE;
+        }
+
+        *u = (unsigned) k;
+        return 0;
+}
+
+/* Parses a device minor number into the unsigned userdata points to. Returns -ERANGE if
+ * DEVICE_MINOR_VALID() rejects it. */
+static int oci_device_minor(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        unsigned *u = userdata;
+        uintmax_t k;
+
+        assert_se(u);
+
+        k = json_variant_unsigned(v);
+        if (!DEVICE_MINOR_VALID(k)) {
+                json_log(v, flags,
0, "Device minor %ji out of range.", k); + return -ERANGE; + } + + *u = (unsigned) k; + return 0; +} + +static int oci_device_file_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + mode_t *mode = userdata, m; + uintmax_t k; + + assert(mode); + + k = json_variant_unsigned(v); + m = (mode_t) k; + + if ((m & ~07777) != 0 || (uintmax_t) m != k) { + json_log(v, flags, 0, "fileMode out of range, refusing."); + return -ERANGE; + } + + *mode = m; + return 0; +} + +static int oci_devices(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + JsonVariant *e; + int r; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + + static const JsonDispatch table[] = { + { "type", JSON_VARIANT_STRING, oci_device_type, offsetof(DeviceNode, mode), JSON_MANDATORY }, + { "path", JSON_VARIANT_STRING, oci_absolute_path, offsetof(DeviceNode, path), JSON_MANDATORY }, + { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(DeviceNode, major), 0 }, + { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(DeviceNode, minor), 0 }, + { "fileMode", JSON_VARIANT_UNSIGNED, oci_device_file_mode, offsetof(DeviceNode, mode), 0 }, + { "uid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(DeviceNode, uid), 0 }, + { "gid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(DeviceNode, gid), 0 }, + {} + }; + + DeviceNode *node, *nodes; + + nodes = reallocarray(s->extra_nodes, s->n_extra_nodes + 1, sizeof(DeviceNode)); + if (!nodes) + return log_oom(); + + s->extra_nodes = nodes; + + node = nodes + s->n_extra_nodes; + *node = (DeviceNode) { + .uid = UID_INVALID, + .gid = GID_INVALID, + .major = (unsigned) -1, + .minor = (unsigned) -1, + .mode = 0644, + }; + + r = json_dispatch(e, table, oci_unexpected, flags, node); + if (r < 0) + goto fail_element; + + if (S_ISCHR(node->mode) || S_ISBLK(node->mode)) { + _cleanup_free_ char *path = NULL; + + if (node->major == (unsigned) -1 || node->minor == (unsigned) -1) { + 
json_log(e, flags, 0, "Major/minor required when device node is device node");
+                                r = -EINVAL;
+                                goto fail_element;
+                        }
+
+                        /* Suppress a couple of implicit device nodes */
+                        r = device_path_make_canonical(node->mode, makedev(node->major, node->minor), &path);
+                        if (r < 0)
+                                /* Pass r along, so that %m expands to the actual error */
+                                json_log(e, flags|JSON_DEBUG, r, "Failed to resolve device node %u:%u, ignoring: %m", node->major, node->minor);
+                        else {
+                                if (PATH_IN_SET(path,
+                                                "/dev/null",
+                                                "/dev/zero",
+                                                "/dev/full",
+                                                "/dev/random",
+                                                "/dev/urandom",
+                                                "/dev/tty",
+                                                "/dev/net/tun",
+                                                "/dev/ptmx",
+                                                "/dev/pts/ptmx",
+                                                "/dev/console")) {
+
+                                        json_log(e, flags|JSON_DEBUG, 0, "Ignoring devices item for device '%s', as it is implicitly created anyway.", path);
+                                        /* The slot is not committed, only its path needs releasing */
+                                        free(node->path);
+                                        continue;
+                                }
+                        }
+                }
+
+                s->n_extra_nodes++;
+                continue;
+
+        fail_element:
+                free(node->path);
+                return r;
+        }
+
+        return 0;
+}
+
+/* Accepts the cgroup path only if it names a slice unit: the path is converted to a slice name
+ * and back, and must come out unchanged. The slice name is then stored in s->slice. */
+static int oci_cgroups_path(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        _cleanup_free_ char *slice = NULL, *backwards = NULL;
+        Settings *s = userdata;
+        const char *p;
+        int r;
+
+        assert(s);
+
+        assert_se(p = json_variant_string(v));
+
+        r = cg_path_get_slice(p, &slice);
+        if (r < 0)
+                return json_log(v, flags, r, "Couldn't derive slice unit name from path '%s': %m", p);
+
+        r = cg_slice_to_path(slice, &backwards);
+        if (r < 0)
+                return json_log(v, flags, r, "Couldn't convert slice unit name '%s' back to path: %m", slice);
+
+        if (!path_equal(backwards, p)) {
+                json_log(v, flags, 0, "Control group path '%s' does not refer to slice unit, refusing.", p);
+                return -EINVAL;
+        }
+
+        free_and_replace(s->slice, slice);
+        return 0;
+}
+
+/* Translates the device cgroup type ("c" or "b") into S_IFCHR/S_IFBLK. */
+static int oci_cgroup_device_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        mode_t *mode = userdata;
+        const char *n;
+
+        assert_se(n = json_variant_string(v));
+
+        if (streq(n, "c"))
+                *mode = S_IFCHR;
+        else if (streq(n, "b"))
+                *mode = S_IFBLK;
+        else {
+                json_log(v, flags, 0, "Control group device type unknown: %s",
n);
+                return -EINVAL;
+        }
+
+        return 0;
+}
+/* One parsed entry of the device cgroup list. major/minor of (unsigned) -1 and a type of 0 are
+ * what oci_cgroup_devices() treats as "not specified". */
+struct device_data {
+        bool allow;
+        bool r;
+        bool w;
+        bool m;
+        mode_t type;
+        unsigned major;
+        unsigned minor;
+};
+/* Parses an access string made of the characters 'r', 'w' and 'm' into the r/w/m booleans of the
+ * struct device_data passed as userdata. Any other character yields -EINVAL, in which case the
+ * struct is left unmodified. */
+static int oci_cgroup_device_access(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        struct device_data *d = userdata;
+        bool r = false, w = false, m = false;
+        const char *s;
+        size_t i;
+
+        assert_se(s = json_variant_string(v));
+
+        for (i = 0; s[i]; i++) {
+                if (s[i] == 'r')
+                        r = true;
+                else if (s[i] == 'w')
+                        w = true;
+                else if (s[i] == 'm')
+                        m = true;
+                else {
+                        json_log(v, flags, 0, "Unknown device access character '%c'.", s[i]);
+                        return -EINVAL;
+                }
+        }
+
+        d->r = r;
+        d->w = w;
+        d->m = m;
+
+        return 0;
+}
+/* Parses the device cgroup list of the "resources" object. The usable 'allow' entries are
+ * collected in a temporary list first and afterwards serialized into s->properties as a
+ * "DeviceAllow" property. */
+static int oci_cgroup_devices(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+
+        _cleanup_free_ struct device_data *list = NULL;
+        Settings *s = userdata;
+        size_t n_list = 0, i;
+        bool noop = false;
+        JsonVariant *e;
+        int r;
+
+        assert(s);
+
+        JSON_VARIANT_ARRAY_FOREACH(e, v) {
+
+                struct device_data data = {
+                        .major = (unsigned) -1,
+                        .minor = (unsigned) -1,
+                }, *a;
+
+                static const JsonDispatch table[] = {
+                        { "allow", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(struct device_data, allow), JSON_MANDATORY },
+                        { "type", JSON_VARIANT_STRING, oci_cgroup_device_type, offsetof(struct device_data, type), 0 },
+                        { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), 0 },
+                        { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), 0 },
+                        { "access", JSON_VARIANT_STRING, oci_cgroup_device_access, 0, 0 },
+                        {}
+                };
+
+                r = json_dispatch(e, table, oci_unexpected, flags, &data);
+                if (r < 0)
+                        return r;
+
+                if (!data.allow) {
+                        /* The fact that OCI allows 'deny' entries makes really no sense, as 'allow' vs.
'deny' for the
+                         * devices cgroup controller is really not about whitelisting and blacklisting but about adding
+                         * and removing entries from the whitelist. Since we always start out with an empty whitelist
+                         * we hence ignore the whole thing, as removing entries which don't exist make no sense. We'll
+                         * log about this, since this is really borked in the spec, with one exception: the entry
+                         * that's supposed to drop the kernel's default we ignore silently */
+
+                        /* This branch handles 'deny' entries, hence that is what the warning names */
+                        if (!data.r || !data.w || !data.m || data.type != 0 || data.major != (unsigned) -1 || data.minor != (unsigned) -1)
+                                json_log(v, flags|JSON_WARNING, 0, "Devices cgroup whitelist with arbitrary 'deny' entries not supported, ignoring.");
+
+                        /* We ignore the 'deny' entry as for us that's implied */
+                        continue;
+                }
+
+                if (!data.r && !data.w && !data.m) {
+                        /* JSON_WARNING is the dispatch flag; LOG_WARNING is a syslog level and must
+                         * not be OR'ed into JsonDispatchFlags */
+                        json_log(v, flags|JSON_WARNING, 0, "Device cgroup whitelist entry with no effect found, ignoring.");
+                        continue;
+                }
+
+                if (data.minor != (unsigned) -1 && data.major == (unsigned) -1) {
+                        json_log(v, flags, 0, "Device cgroup whitelist entries with minors but no majors not supported.");
+                        return -EOPNOTSUPP;
+                }
+
+                if (data.major != (unsigned) -1 && data.type == 0) {
+                        json_log(v, flags, 0, "Device cgroup whitelist entries with majors but no device node type not supported.");
+                        return -EOPNOTSUPP;
+                }
+
+                if (data.type == 0) {
+                        if (data.r && data.w && data.m) /* a catchall whitelist entry means we are looking at a noop */
+                                noop = true;
+                        else {
+                                json_log(v, flags, 0, "Device cgroup whitelist entries with no type not supported.");
+                                return -EOPNOTSUPP;
+                        }
+                }
+
+                /* Assign through a temporary so the list is not lost if the allocation fails */
+                a = reallocarray(list, n_list + 1, sizeof(struct device_data));
+                if (!a)
+                        return log_oom();
+
+                list = a;
+                list[n_list++] = data;
+        }
+
+        /* A catchall entry allows everything, hence no DeviceAllow property is generated */
+        if (noop)
+                return 0;
+
+        r = settings_allocate_properties(s);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(s->properties, 'r', "sv");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append(s->properties, "s",
"DeviceAllow"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(s->properties, 'v', "a(ss)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(s->properties, 'a', "(ss)"); + if (r < 0) + return bus_log_create_error(r); + + for (i = 0; i < n_list; i++) { + _cleanup_free_ char *pattern = NULL; + char access[4]; + size_t n = 0; + + if (list[i].minor == (unsigned) -1) { + const char *t; + + if (list[i].type == S_IFBLK) + t = "block"; + else { + assert(list[i].type == S_IFCHR); + t = "char"; + } + + if (list[i].major == (unsigned) -1) { + pattern = strjoin(t, "-*"); + if (!pattern) + return log_oom(); + } else { + if (asprintf(&pattern, "%s-%u", t, list[i].major) < 0) + return log_oom(); + } + + } else { + assert(list[i].major != (unsigned) -1); /* If a minor is specified, then a major also needs to be specified */ + + r = device_path_make_major_minor(list[i].type, makedev(list[i].major, list[i].minor), &pattern); + if (r < 0) + return log_oom(); + } + + if (list[i].r) + access[n++] = 'r'; + if (list[i].w) + access[n++] = 'w'; + if (list[i].m) + access[n++] = 'm'; + access[n] = 0; + + assert(n > 0); + + r = sd_bus_message_append(s->properties, "(ss)", pattern, access); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(s->properties); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(s->properties); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(s->properties); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +static int oci_cgroup_memory_limit(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint64_t *m = userdata; + uintmax_t k; + + assert(m); + + if (json_variant_is_negative(v)) { + *m = UINT64_MAX; + return 0; + } + + if (!json_variant_is_unsigned(v)) { + json_log(v, flags, 0, "Memory limit is not an unsigned integer"); + return 
-EINVAL;
+        }
+
+        k = json_variant_unsigned(v);
+        if (k >= UINT64_MAX) {
+                /* k is unsigned, hence %ju rather than %ji */
+                json_log(v, flags, 0, "Memory limit too large: %ju", k);
+                return -ERANGE;
+        }
+
+        *m = (uint64_t) k;
+        return 0;
+}
+
+/* Parses the "memory" object and turns "limit", "reservation" and "swap" into the MemoryMax,
+ * MemoryLow and MemorySwapMax properties. UINT64_MAX marks a value that was not given. The swap
+ * value is emitted as the difference between "swap" and "limit", so a swap value without a
+ * limit, or below it, is ignored with a warning. */
+static int oci_cgroup_memory(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+
+        struct memory_data {
+                uint64_t limit;
+                uint64_t reservation;
+                uint64_t swap;
+        } data = {
+                .limit = UINT64_MAX,
+                .reservation = UINT64_MAX,
+                .swap = UINT64_MAX,
+        };
+
+        static const JsonDispatch table[] = {
+                { "limit", JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, limit), 0 },
+                { "reservation", JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, reservation), 0 },
+                { "swap", JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, swap), 0 },
+                { "kernel", JSON_VARIANT_NUMBER, oci_unsupported, 0, JSON_PERMISSIVE },
+                { "kernelTCP", JSON_VARIANT_NUMBER, oci_unsupported, 0, JSON_PERMISSIVE },
+                /* The OCI key is spelled "swappiness"; the old misspelling stays accepted for compatibility */
+                { "swappiness", JSON_VARIANT_NUMBER, oci_unsupported, 0, JSON_PERMISSIVE },
+                { "swapiness", JSON_VARIANT_NUMBER, oci_unsupported, 0, JSON_PERMISSIVE },
+                /* "disableOOMKiller" is a boolean in the OCI spec, not a number */
+                { "disableOOMKiller", JSON_VARIANT_BOOLEAN, oci_unsupported, 0, JSON_PERMISSIVE },
+                {}
+        };
+
+        Settings *s = userdata;
+        int r;
+
+        r = json_dispatch(v, table, oci_unexpected, flags, &data);
+        if (r < 0)
+                return r;
+
+        if (data.swap != UINT64_MAX) {
+                /* JSON_WARNING is the dispatch flag; LOG_WARNING is a syslog level and must not be
+                 * OR'ed into JsonDispatchFlags */
+                if (data.limit == UINT64_MAX)
+                        json_log(v, flags|JSON_WARNING, 0, "swap limit without memory limit is not supported, ignoring.");
+                else if (data.swap < data.limit)
+                        json_log(v, flags|JSON_WARNING, 0, "swap limit is below memory limit, ignoring.");
+                else {
+                        r = settings_allocate_properties(s);
+                        if (r < 0)
+                                return r;
+
+                        r = sd_bus_message_append(s->properties, "(sv)", "MemorySwapMax", "t", data.swap - data.limit);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+        }
+
+        if (data.limit != UINT64_MAX) {
+                r = settings_allocate_properties(s);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_append(s->properties, "(sv)", "MemoryMax", "t", data.limit);
+                if (r < 0)
+                        return
bus_log_create_error(r);
+        }
+
+        if (data.reservation != UINT64_MAX) {
+                r = settings_allocate_properties(s);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_append(s->properties, "(sv)", "MemoryLow", "t", data.reservation);
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
+
+        return 0;
+}
+/* Intermediate result of parsing the "cpu" object; oci_cgroup_cpu() initializes the three
+ * numeric fields to UINT64_MAX to mark them as not specified. */
+struct cpu_data {
+        uint64_t shares;
+        uint64_t quota;
+        uint64_t period;
+        cpu_set_t *cpuset;
+        unsigned ncpus;
+};
+/* Parses "shares" into the uint64_t userdata points to. Returns -ERANGE if the value lies outside
+ * CGROUP_CPU_SHARES_MIN..CGROUP_CPU_SHARES_MAX. */
+static int oci_cgroup_cpu_shares(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        uint64_t *u = userdata;
+        uintmax_t k;
+
+        assert(u);
+
+        k = json_variant_unsigned(v);
+        if (k < CGROUP_CPU_SHARES_MIN || k > CGROUP_CPU_SHARES_MAX) {
+                json_log(v, flags, 0, "shares value out of range.");
+                return -ERANGE;
+        }
+
+        *u = (uint64_t) k;
+        return 0;
+}
+/* Shared parser for "quota" and "period": stores the value in the uint64_t userdata points to.
+ * Zero and values of UINT64_MAX or above are refused with -ERANGE. */
+static int oci_cgroup_cpu_quota(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        uint64_t *u = userdata;
+        uintmax_t k;
+
+        assert(u);
+
+        k = json_variant_unsigned(v);
+        if (k <= 0 || k >= UINT64_MAX) {
+                json_log(v, flags, 0, "period/quota value out of range.");
+                return -ERANGE;
+        }
+
+        *u = (uint64_t) k;
+        return 0;
+}
+/* Parses the "cpus" string with parse_cpu_set() and stores the resulting set and its count in the
+ * struct cpu_data passed as userdata, freeing any set stored there before. */
+static int oci_cgroup_cpu_cpus(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        struct cpu_data *data = userdata;
+        cpu_set_t *set;
+        const char *n;
+        int ncpus;
+
+        assert(data);
+
+        assert_se(n = json_variant_string(v));
+
+        ncpus = parse_cpu_set(n, &set);
+        if (ncpus < 0)
+                return json_log(v, flags, ncpus, "Failed to parse CPU set specification: %s", n);
+
+        CPU_FREE(data->cpuset);
+        data->cpuset = set;
+        data->ncpus = ncpus;
+
+        return 0;
+}
+/* Parses the "cpu" object of the resources section into a struct cpu_data and applies the result
+ * to the settings. */
+static int oci_cgroup_cpu(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+
+        static const JsonDispatch table[] = {
+                { "shares", JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_shares, offsetof(struct cpu_data, shares), 0 },
+                { "quota", JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_quota, offsetof(struct cpu_data, quota), 0 },
+                { "period",
JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_quota, offsetof(struct cpu_data, period), 0 }, + { "realtimeRuntime", JSON_VARIANT_UNSIGNED, oci_unsupported, 0, 0 }, + { "realtimePeriod", JSON_VARIANT_UNSIGNED, oci_unsupported, 0, 0 }, + { "cpus", JSON_VARIANT_STRING, oci_cgroup_cpu_cpus, 0, 0 }, + { "mems", JSON_VARIANT_STRING, oci_unsupported, 0, 0 }, + {} + }; + + struct cpu_data data = { + .shares = UINT64_MAX, + .quota = UINT64_MAX, + .period = UINT64_MAX, + }; + + Settings *s = userdata; + int r; + + r = json_dispatch(v, table, oci_unexpected, flags, &data); + if (r < 0) { + CPU_FREE(data.cpuset); + return r; + } + + CPU_FREE(s->cpuset); + s->cpuset = data.cpuset; + s->cpuset_ncpus = data.ncpus; + + if (data.shares != UINT64_MAX) { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "CPUShares", "t", data.shares); + if (r < 0) + return bus_log_create_error(r); + } + + if (data.quota != UINT64_MAX && data.period != UINT64_MAX) { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "CPUQuotaPerSecUSec", "t", (uint64_t) (data.quota * USEC_PER_SEC / data.period)); + if (r < 0) + return bus_log_create_error(r); + + } else if ((data.quota != UINT64_MAX) != (data.period != UINT64_MAX)) { + json_log(v, flags, 0, "CPU quota and period not used together."); + return -EINVAL; + } + + return 0; +} + +static int oci_cgroup_block_io_weight(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + uintmax_t k; + int r; + + assert(s); + + k = json_variant_unsigned(v); + if (k < CGROUP_BLKIO_WEIGHT_MIN || k > CGROUP_BLKIO_WEIGHT_MAX) { + json_log(v, flags, 0, "Block I/O weight out of range."); + return -ERANGE; + } + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "BlockIOWeight", "t", (uint64_t) k); + if (r < 0) + return 
bus_log_create_error(r);
+
+        return 0;
+}
+/* Parses the "weightDevice" array and appends one BlockIODeviceWeight property per entry that
+ * carries a "weight"; entries without one are skipped. */
+static int oci_cgroup_block_io_weight_device(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        Settings *s = userdata;
+        JsonVariant *e;
+        int r;
+
+        assert(s);
+
+        JSON_VARIANT_ARRAY_FOREACH(e, v) {
+                struct device_data {
+                        unsigned major;
+                        unsigned minor;
+                        uintmax_t weight;
+                } data = {
+                        .major = (unsigned) -1,
+                        .minor = (unsigned) -1,
+                        .weight = UINTMAX_MAX,
+                };
+
+                static const JsonDispatch table[] = {
+                        { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), JSON_MANDATORY },
+                        { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), JSON_MANDATORY },
+                        { "weight", JSON_VARIANT_UNSIGNED, json_dispatch_unsigned, offsetof(struct device_data, weight), 0 },
+                        { "leafWeight", JSON_VARIANT_INTEGER, oci_unsupported, 0, JSON_PERMISSIVE },
+                        {}
+                };
+
+                _cleanup_free_ char *path = NULL;
+
+                r = json_dispatch(e, table, oci_unexpected, flags, &data);
+                if (r < 0)
+                        return r;
+                /* UINTMAX_MAX is the initializer above, i.e. no "weight" was given for this device. */
+                if (data.weight == UINTMAX_MAX)
+                        continue;
+
+                if (data.weight < CGROUP_BLKIO_WEIGHT_MIN || data.weight > CGROUP_BLKIO_WEIGHT_MAX) {
+                        json_log(v, flags, 0, "Block I/O device weight out of range.");
+                        return -ERANGE;
+                }
+
+                r = device_path_make_major_minor(S_IFBLK, makedev(data.major, data.minor), &path);
+                if (r < 0)
+                        return json_log(v, flags, r, "Failed to build device path: %m");
+
+                r = settings_allocate_properties(s);
+                if (r < 0)
+                        return r;
+
+                r = sd_bus_message_append(s->properties, "(sv)", "BlockIODeviceWeight", "a(st)", 1, path, (uint64_t) data.weight);
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
+
+        return 0;
+}
+/* Shared parser for the four throttle*Device arrays; the property to emit is selected from the
+ * name of the JSON field being dispatched. */
+static int oci_cgroup_block_io_throttle(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        Settings *s = userdata;
+        const char *pname;
+        JsonVariant *e;
+        int r;
+
+        assert(s);
+
+        pname = streq(name, "throttleReadBpsDevice") ? "IOReadBandwidthMax" :
+                streq(name, "throttleWriteBpsDevice") ?
"IOWriteBandwidthMax" : + streq(name, "throttleReadIOPSDevice") ? "IOReadIOPSMax" : + "IOWriteIOPSMax"; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + struct device_data { + unsigned major; + unsigned minor; + uintmax_t rate; + } data = { + .major = (unsigned) -1, + .minor = (unsigned) -1, + }; + + static const JsonDispatch table[] = { + { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), JSON_MANDATORY }, + { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), JSON_MANDATORY }, + { "rate", JSON_VARIANT_UNSIGNED, json_dispatch_unsigned, offsetof(struct device_data, rate), JSON_MANDATORY }, + {} + }; + + _cleanup_free_ char *path = NULL; + + r = json_dispatch(e, table, oci_unexpected, flags, &data); + if (r < 0) + return r; + + if (data.rate >= UINT64_MAX) { + json_log(v, flags, 0, "Block I/O device rate out of range."); + return -ERANGE; + } + + r = device_path_make_major_minor(S_IFBLK, makedev(data.major, data.minor), &path); + if (r < 0) + return json_log(v, flags, r, "Failed to build device path: %m"); + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", pname, "a(st)", 1, path, (uint64_t) data.rate); + if (r < 0) + return bus_log_create_error(r); + } + + return 0; +} + +static int oci_cgroup_block_io(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "weight", JSON_VARIANT_UNSIGNED, oci_cgroup_block_io_weight, 0, 0 }, + { "leafWeight", JSON_VARIANT_UNSIGNED, oci_unsupported, 0, JSON_PERMISSIVE }, + { "weightDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_weight_device, 0, 0 }, + { "throttleReadBpsDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + { "throttleWriteBpsDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + { "throttleReadIOPSDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + { 
"throttleWriteIOPSDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + {} + }; + + return json_dispatch(v, table, oci_unexpected, flags, userdata); +} + +static int oci_cgroup_pids(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "limit", JSON_VARIANT_NUMBER, json_dispatch_variant, 0, JSON_MANDATORY }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *k = NULL; + Settings *s = userdata; + uint64_t m; + int r; + + assert(s); + + r = json_dispatch(v, table, oci_unexpected, flags, &k); + if (r < 0) + return r; + + if (json_variant_is_negative(k)) + m = UINT64_MAX; + else { + if (!json_variant_is_unsigned(k)) { + json_log(k, flags, 0, "pids limit not unsigned integer, refusing."); + return -EINVAL; + } + + m = (uint64_t) json_variant_unsigned(k); + + if ((uintmax_t) m != json_variant_unsigned(k)) { + json_log(v, flags, 0, "pids limit out of range, refusing."); + return -EINVAL; + } + } + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "TasksMax", "t", m); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +static int oci_resources(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "devices", JSON_VARIANT_ARRAY, oci_cgroup_devices, 0, 0 }, + { "memory", JSON_VARIANT_OBJECT, oci_cgroup_memory, 0, 0 }, + { "cpu", JSON_VARIANT_OBJECT, oci_cgroup_cpu, 0, 0 }, + { "blockIO", JSON_VARIANT_OBJECT, oci_cgroup_block_io, 0, 0 }, + { "hugepageLimits", JSON_VARIANT_ARRAY, oci_unsupported, 0, 0 }, + { "network", JSON_VARIANT_OBJECT, oci_unsupported, 0, 0 }, + { "pids", JSON_VARIANT_OBJECT, oci_cgroup_pids, 0, 0 }, + { "rdma", JSON_VARIANT_OBJECT, oci_unsupported, 0, 0 }, + {} + }; + + return json_dispatch(v, table, oci_unexpected, flags, userdata); +} + +static bool sysctl_key_valid(const char *s) { + bool dot = true; + + /* Note 
that we are a bit stricter here than in systemd-sysctl, as that inherited semantics from the old sysctl
+         * tool, which were really weird (as it swaps / and . in both ways) */
+
+        if (isempty(s))
+                return false;
+
+        for (; *s; s++) {
+
+                if (*s <= ' ' || *s >= 127)
+                        return false;
+                if (*s == '/')
+                        return false;
+                if (*s == '.') {
+
+                        if (dot) /* Don't allow two dots next to each other (or at the beginning) */
+                                return false;
+
+                        dot = true;
+                } else
+                        dot = false;
+        }
+
+        if (dot) /* don't allow a dot at the end */
+                return false;
+
+        return true;
+}
+
+/* Handles the "sysctl" object: every member must have a string value and a key accepted by
+ * sysctl_key_valid(). Key and value are appended to s->sysctl as two consecutive strv entries.
+ * Returns 0 on success, -EINVAL on a non-string value or an invalid key. */
+static int oci_sysctl(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        Settings *s = userdata;
+        JsonVariant *k, *w;
+        int r;
+
+        assert(s);
+
+        JSON_VARIANT_OBJECT_FOREACH(k, w, v) {
+                const char *n, *m;
+
+                if (!json_variant_is_string(w)) {
+                        json_log(v, flags, 0, "sysctl parameter is not a string, refusing.");
+                        return -EINVAL;
+                }
+
+                assert_se(n = json_variant_string(k));
+                assert_se(m = json_variant_string(w));
+
+                /* Refuse keys that do NOT pass validation (the check used to be inverted) */
+                if (!sysctl_key_valid(n)) {
+                        json_log(v, flags, 0, "sysctl key invalid, refusing: %s", n);
+                        return -EINVAL;
+                }
+
+                r = strv_extend_strv(&s->sysctl, STRV_MAKE(n, m), false);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
+
+/* Translates an OCI seccomp action name into the matching libseccomp action value. On a match the
+ * value is stored in *ret and 0 is returned; names not listed in the table yield -EINVAL. */
+static int oci_seccomp_action_from_string(const char *name, uint32_t *ret) {
+
+        static const struct {
+                const char *name;
+                uint32_t action;
+        } table[] = {
+                { "SCMP_ACT_ALLOW", SCMP_ACT_ALLOW },
+                { "SCMP_ACT_ERRNO", SCMP_ACT_ERRNO(EPERM) }, /* the OCI spec doesn't document the error, but it appears EPERM is supposed to be used */
+                { "SCMP_ACT_KILL", SCMP_ACT_KILL },
+#ifdef SCMP_ACT_LOG
+                { "SCMP_ACT_LOG", SCMP_ACT_LOG },
+#endif
+                { "SCMP_ACT_TRAP", SCMP_ACT_TRAP },
+
+                /* We don't support SCMP_ACT_TRACE because that requires a tracer, and that doesn't really make sense
+                 * here */
+        };
+
+        size_t i;
+
+        for (i = 0; i < ELEMENTSOF(table); i++)
+                if (streq_ptr(name, table[i].name)) {
+                        *ret = table[i].action;
+                        return 0;
+                }
+
+        return
-EINVAL; +} + +static int oci_seccomp_arch_from_string(const char *name, uint32_t *ret) { + + static const struct { + const char *name; + uint32_t arch; + } table[] = { + { "SCMP_ARCH_AARCH64", SCMP_ARCH_AARCH64 }, + { "SCMP_ARCH_ARM", SCMP_ARCH_ARM }, + { "SCMP_ARCH_MIPS", SCMP_ARCH_MIPS }, + { "SCMP_ARCH_MIPS64", SCMP_ARCH_MIPS64 }, + { "SCMP_ARCH_MIPS64N32", SCMP_ARCH_MIPS64N32 }, + { "SCMP_ARCH_MIPSEL", SCMP_ARCH_MIPSEL }, + { "SCMP_ARCH_MIPSEL64", SCMP_ARCH_MIPSEL64 }, + { "SCMP_ARCH_MIPSEL64N32", SCMP_ARCH_MIPSEL64N32 }, + { "SCMP_ARCH_NATIVE", SCMP_ARCH_NATIVE }, +#ifdef SCMP_ARCH_PARISC + { "SCMP_ARCH_PARISC", SCMP_ARCH_PARISC }, +#endif +#ifdef SCMP_ARCH_PARISC64 + { "SCMP_ARCH_PARISC64", SCMP_ARCH_PARISC64 }, +#endif + { "SCMP_ARCH_PPC", SCMP_ARCH_PPC }, + { "SCMP_ARCH_PPC64", SCMP_ARCH_PPC64 }, + { "SCMP_ARCH_PPC64LE", SCMP_ARCH_PPC64LE }, + { "SCMP_ARCH_S390", SCMP_ARCH_S390 }, + { "SCMP_ARCH_S390X", SCMP_ARCH_S390X }, + { "SCMP_ARCH_X32", SCMP_ARCH_X32 }, + { "SCMP_ARCH_X86", SCMP_ARCH_X86 }, + { "SCMP_ARCH_X86_64", SCMP_ARCH_X86_64 }, + }; + + size_t i; + + for (i = 0; i < ELEMENTSOF(table); i++) + if (streq_ptr(table[i].name, name)) { + *ret = table[i].arch; + return 0; + } + + return -EINVAL; +} + +static int oci_seccomp_compare_from_string(const char *name, enum scmp_compare *ret) { + + static const struct { + const char *name; + enum scmp_compare op; + } table[] = { + { "SCMP_CMP_NE", SCMP_CMP_NE }, + { "SCMP_CMP_LT", SCMP_CMP_LT }, + { "SCMP_CMP_LE", SCMP_CMP_LE }, + { "SCMP_CMP_EQ", SCMP_CMP_EQ }, + { "SCMP_CMP_GE", SCMP_CMP_GE }, + { "SCMP_CMP_GT", SCMP_CMP_GT }, + { "SCMP_CMP_MASKED_EQ", SCMP_CMP_MASKED_EQ }, + }; + + size_t i; + + for (i = 0; i < ELEMENTSOF(table); i++) + if (streq_ptr(table[i].name, name)) { + *ret = table[i].op; + return 0; + } + + return -EINVAL; +} + +static int oci_seccomp_archs(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + scmp_filter_ctx *sc = userdata; + JsonVariant *e; + int r; + + 
assert(sc); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + uint32_t a; + + if (!json_variant_is_string(e)) { + json_log(e, flags, 0, "Architecture entry is not a string"); + return -EINVAL; + } + + r = oci_seccomp_arch_from_string(json_variant_string(e), &a); + if (r < 0) + return json_log(e, flags, r, "Unknown architecture: %s", json_variant_string(e)); + + r = seccomp_arch_add(sc, a); + if (r == -EEXIST) + continue; + if (r < 0) + return json_log(e, flags, r, "Failed to add architecture to seccomp filter: %m"); + } + + return 0; +} + +struct syscall_rule { + char **names; + uint32_t action; + struct scmp_arg_cmp *arguments; + size_t n_arguments; +}; + +static void syscall_rule_free(struct syscall_rule *rule) { + assert(rule); + + strv_free(rule->names); + free(rule->arguments); +}; + +static int oci_seccomp_action(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint32_t *action = userdata; + int r; + + assert(action); + + r = oci_seccomp_action_from_string(json_variant_string(v), action); + if (r < 0) + return json_log(v, flags, r, "Unknown system call action '%s': %m", json_variant_string(v)); + + return 0; +} + +static int oci_seccomp_op(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + enum scmp_compare *op = userdata; + int r; + + assert(op); + + r = oci_seccomp_compare_from_string(json_variant_string(v), op); + if (r < 0) + return json_log(v, flags, r, "Unknown seccomp operator '%s': %m", json_variant_string(v)); + + return 0; +} + +static int oci_seccomp_args(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + struct syscall_rule *rule = userdata; + JsonVariant *e; + int r; + + assert(rule); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + static const struct JsonDispatch table[] = { + { "index", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(struct scmp_arg_cmp, arg), JSON_MANDATORY }, + { "value", JSON_VARIANT_UNSIGNED, json_dispatch_uint64, offsetof(struct scmp_arg_cmp, 
datum_a), JSON_MANDATORY },
+                        { "valueTwo", JSON_VARIANT_UNSIGNED, json_dispatch_uint64, offsetof(struct scmp_arg_cmp, datum_b), 0 },
+                        { "op", JSON_VARIANT_STRING, oci_seccomp_op, offsetof(struct scmp_arg_cmp, op), JSON_MANDATORY },
+                        {},
+                };
+
+                struct scmp_arg_cmp *a, *p;
+                int expected;
+
+                /* Grow the argument array by one element. Its elements are struct scmp_arg_cmp, so that is the
+                 * type the allocation must be sized by (not the enclosing struct syscall_rule, which may be
+                 * smaller and would then leave the array too short). */
+                a = reallocarray(rule->arguments, rule->n_arguments + 1, sizeof(struct scmp_arg_cmp));
+                if (!a)
+                        return log_oom();
+
+                rule->arguments = a;
+                p = rule->arguments + rule->n_arguments;
+
+                /* Start from an all-zero comparison, so that fields missing in the JSON data read as 0 */
+                *p = (struct scmp_arg_cmp) {
+                        .arg = 0,
+                        .datum_a = 0,
+                        .datum_b = 0,
+                        .op = 0,
+                };
+
+                r = json_dispatch(e, table, oci_unexpected, flags, p);
+                if (r < 0)
+                        return r;
+
+                expected = p->op == SCMP_CMP_MASKED_EQ ? 4 : 3;
+                if (r != expected)
+                        json_log(e, flags|JSON_WARNING, 0, "Wrong number of system call arguments for JSON data, ignoring.");
+
+                /* Note that we are a bit sloppy here and do not insist that SCMP_CMP_MASKED_EQ gets two datum values,
+                 * and the other only one. That's because buildah for example by default calls things with
+                 * SCMP_CMP_MASKED_EQ but only one argument. We use 0 when the value is not specified.
*/
+
+                rule->n_arguments++;
+        }
+
+        return 0;
+}
+
+/* Handles the "syscalls" array of the seccomp object. Each element is parsed into a temporary
+ * struct syscall_rule, and one libseccomp rule is added to the filter passed in userdata for every
+ * listed system call name that can be resolved; unknown names are skipped with a debug message.
+ * The temporary rule is released on both the success and the failure path. */
+static int oci_seccomp_syscalls(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        scmp_filter_ctx *sc = userdata;
+        JsonVariant *e;
+        int r;
+
+        assert(sc);
+
+        JSON_VARIANT_ARRAY_FOREACH(e, v) {
+                static const JsonDispatch table[] = {
+                        { "names", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(struct syscall_rule, names), JSON_MANDATORY },
+                        { "action", JSON_VARIANT_STRING, oci_seccomp_action, offsetof(struct syscall_rule, action), JSON_MANDATORY },
+                        { "args", JSON_VARIANT_ARRAY, oci_seccomp_args, 0, 0 },
+                        {} /* terminating entry, as in every other dispatch table of this file */
+                };
+                struct syscall_rule rule = {
+                        .action = (uint32_t) -1,
+                };
+                char **i;
+
+                r = json_dispatch(e, table, oci_unexpected, flags, &rule);
+                if (r < 0)
+                        goto fail_rule;
+
+                if (strv_isempty(rule.names)) {
+                        json_log(e, flags, 0, "System call name list is empty.");
+                        r = -EINVAL;
+                        goto fail_rule;
+                }
+
+                STRV_FOREACH(i, rule.names) {
+                        int nr;
+
+                        nr = seccomp_syscall_resolve_name(*i);
+                        if (nr == __NR_SCMP_ERROR) {
+                                log_debug("Unknown syscall %s, skipping.", *i);
+                                continue;
+                        }
+
+                        r = seccomp_rule_add_array(sc, rule.action, nr, rule.n_arguments, rule.arguments);
+                        if (r < 0)
+                                goto fail_rule;
+                }
+
+                syscall_rule_free(&rule);
+                continue;
+
+        fail_rule:
+                syscall_rule_free(&rule);
+                return r;
+        }
+
+        return 0;
+}
+
+/* Handles the "seccomp" object: looks up the mandatory "defaultAction" string, creates a seccomp
+ * filter with it as default action, and then dispatches the remaining members against that filter. */
+static int oci_seccomp(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+
+        static const JsonDispatch table[] = {
+                { "defaultAction", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY },
+                { "architectures", JSON_VARIANT_ARRAY, oci_seccomp_archs, 0, 0 },
+                { "syscalls", JSON_VARIANT_ARRAY, oci_seccomp_syscalls, 0, 0 },
+                {}
+        };
+
+        _cleanup_(seccomp_releasep) scmp_filter_ctx sc = NULL;
+        Settings *s = userdata;
+        JsonVariant *def;
+        uint32_t d;
+        int r;
+
+        assert(s);
+
+        def = json_variant_by_key(v, "defaultAction");
+        if (!def) {
+                json_log(v, flags, 0, "defaultAction element missing.");
+                return -EINVAL;
+        }
+
+        if
(!json_variant_is_string(def)) { + json_log(def, flags, 0, "defaultAction is not a string."); + return -EINVAL; + } + + r = oci_seccomp_action_from_string(json_variant_string(def), &d); + if (r < 0) + return json_log(def, flags, r, "Unknown default action: %s", json_variant_string(def)); + + sc = seccomp_init(d); + if (!sc) { + log_error("Couldn't allocate seccomp object."); + return -ENOMEM; + } + + r = json_dispatch(v, table, oci_unexpected, flags, sc); + if (r < 0) + return r; + + seccomp_release(s->seccomp); + s->seccomp = TAKE_PTR(sc); + + return 0; +} + +static int oci_rootfs_propagation(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + const char *s; + + s = json_variant_string(v); + + if (streq(s, "shared")) + return 0; + + json_log(v, flags|JSON_DEBUG, 0, "Ignoring rootfsPropagation setting '%s'.", s); + return 0; +} + +static int oci_masked_paths(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + JsonVariant *e; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + _cleanup_free_ char *destination = NULL; + CustomMount *m; + const char *p; + + if (!json_variant_is_string(e)) { + json_log(v, flags, 0, "Path is not a string, refusing."); + return -EINVAL; + } + + assert_se(p = json_variant_string(e)); + + if (!path_is_absolute(p)) { + json_log(v, flags, 0, "Path is not not absolute, refusing: %s", p); + return -EINVAL; + } + + if (oci_exclude_mount(p)) + continue; + + destination = strdup(p); + if (!destination) + return log_oom(); + + m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_INACCESSIBLE); + if (!m) + return log_oom(); + + m->destination = TAKE_PTR(destination); + + /* The spec doesn't say this, but apparently pre-existing implementations are lenient towards + * non-existing paths to mask. Let's hence be too. 
*/ + m->graceful = true; + } + + return 0; +} + +static int oci_readonly_paths(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + JsonVariant *e; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + _cleanup_free_ char *source = NULL, *destination = NULL; + CustomMount *m; + const char *p; + + if (!json_variant_is_string(e)) { + json_log(v, flags, 0, "Path is not a string, refusing."); + return -EINVAL; + } + + assert_se(p = json_variant_string(e)); + + if (!path_is_absolute(p)) { + json_log(v, flags, 0, "Path is not not absolute, refusing: %s", p); + return -EINVAL; + } + + if (oci_exclude_mount(p)) + continue; + + source = strjoin("+", p); + if (!source) + return log_oom(); + + destination = strdup(p); + if (!destination) + return log_oom(); + + m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_BIND); + if (!m) + return log_oom(); + + m->source = TAKE_PTR(source); + m->destination = TAKE_PTR(destination); + m->read_only = true; + } + + return 0; +} + +static int oci_linux(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "namespaces", JSON_VARIANT_ARRAY, oci_namespaces, 0, 0 }, + { "uidMappings", JSON_VARIANT_ARRAY, oci_uid_gid_mappings, 0, 0 }, + { "gidMappings", JSON_VARIANT_ARRAY, oci_uid_gid_mappings, 0, 0 }, + { "devices", JSON_VARIANT_ARRAY, oci_devices, 0, 0 }, + { "cgroupsPath", JSON_VARIANT_STRING, oci_cgroups_path, 0, 0 }, + { "resources", JSON_VARIANT_OBJECT, oci_resources, 0, 0 }, + { "intelRdt", JSON_VARIANT_OBJECT, oci_unsupported, 0, JSON_PERMISSIVE }, + { "sysctl", JSON_VARIANT_OBJECT, oci_sysctl, 0, 0 }, + { "seccomp", JSON_VARIANT_OBJECT, oci_seccomp, 0, 0 }, + { "rootfsPropagation", JSON_VARIANT_STRING, oci_rootfs_propagation, 0, 0 }, + { "maskedPaths", JSON_VARIANT_ARRAY, oci_masked_paths, 0, 0 }, + { "readonlyPaths", JSON_VARIANT_ARRAY, oci_readonly_paths, 0, 0 }, + { "mountLabel", 
JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE },
+                {}
+        };
+
+        return json_dispatch(v, table, oci_unexpected, flags, userdata);
+}
+
+/* Parses a hook "timeout" value (multiplied by USEC_PER_SEC below, i.e. given in seconds) and stores
+ * the result in the usec_t that userdata points to. Zero, and values whose product would not stay
+ * below UINT64_MAX, are refused with -EINVAL. */
+static int oci_hook_timeout(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        usec_t *u = userdata;
+        uintmax_t k;
+
+        k = json_variant_unsigned(v);
+        if (k == 0) {
+                json_log(v, flags, 0, "Hook timeout cannot be zero.");
+                return -EINVAL;
+        }
+
+        /* Guarantees k * USEC_PER_SEC <= UINT64_MAX - 1. The subtraction must be parenthesized: without
+         * the parentheses "1/USEC_PER_SEC" is evaluated first and is 0, which disables the check. */
+        if (k > (UINT64_MAX-1)/USEC_PER_SEC) {
+                json_log(v, flags, 0, "Hook timeout too large.");
+                return -EINVAL;
+        }
+
+        *u = k * USEC_PER_SEC;
+        return 0;
+}
+
+/* Handles one of the hook lists ("prestart", "poststart" or "poststop", selected by name): every
+ * element of the array is parsed into a new OciHook that is appended to the matching array in the
+ * Settings object passed as userdata. On a parse error the partially filled element is released,
+ * the element count is left untouched, and the error is returned. */
+static int oci_hooks_array(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        Settings *s = userdata;
+        JsonVariant *e;
+        int r;
+
+        assert(s);
+
+        JSON_VARIANT_ARRAY_FOREACH(e, v) {
+
+                static const JsonDispatch table[] = {
+                        { "path", JSON_VARIANT_STRING, oci_absolute_path, offsetof(OciHook, path), JSON_MANDATORY },
+                        { "args", JSON_VARIANT_ARRAY, oci_args, offsetof(OciHook, args), 0 },
+                        { "env", JSON_VARIANT_ARRAY, oci_env, offsetof(OciHook, env), 0 },
+                        { "timeout", JSON_VARIANT_UNSIGNED, oci_hook_timeout, offsetof(OciHook, timeout), 0 },
+                        {}
+                };
+
+                OciHook *a, **array, *new_item;
+                size_t *n_array;
+
+                if (streq(name, "prestart")) {
+                        array = &s->oci_hooks_prestart;
+                        n_array = &s->n_oci_hooks_prestart;
+                } else if (streq(name, "poststart")) {
+                        array = &s->oci_hooks_poststart;
+                        n_array = &s->n_oci_hooks_poststart;
+                } else {
+                        assert(streq(name, "poststop"));
+                        array = &s->oci_hooks_poststop;
+                        n_array = &s->n_oci_hooks_poststop;
+                }
+
+                a = reallocarray(*array, *n_array + 1, sizeof(OciHook));
+                if (!a)
+                        return log_oom();
+
+                *array = a;
+                new_item = a + *n_array;
+
+                *new_item = (OciHook) {
+                        .timeout = USEC_INFINITY,
+                };
+
+                /* The table offsets are relative to OciHook, hence the freshly added element (and not the
+                 * Settings object) has to be the dispatch target. */
+                r = json_dispatch(e, table, oci_unexpected, flags, new_item);
+                if (r < 0) {
+                        free(new_item->path);
+                        strv_free(new_item->args);
+                        strv_free(new_item->env);
+                        return r;
+                }
+
+                (*n_array) ++;
+        }
+
+        return 0;
+}
+
+static int
oci_hooks(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+
+        /* Each hook list is a JSON array of hook objects; oci_hooks_array() iterates it with
+         * JSON_VARIANT_ARRAY_FOREACH(), so the expected type here has to be an array as well. */
+        static const JsonDispatch table[] = {
+                { "prestart", JSON_VARIANT_ARRAY, oci_hooks_array, 0, 0 },
+                { "poststart", JSON_VARIANT_ARRAY, oci_hooks_array, 0, 0 },
+                { "poststop", JSON_VARIANT_ARRAY, oci_hooks_array, 0, 0 },
+                {}
+        };
+
+        return json_dispatch(v, table, oci_unexpected, flags, userdata);
+}
+
+/* Validates the "annotations" object: keys must be non-empty and values must be strings, otherwise
+ * -EINVAL is returned. Valid annotations are only logged at debug level and otherwise ignored. */
+static int oci_annotations(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        JsonVariant *k, *w;
+
+        JSON_VARIANT_OBJECT_FOREACH(k, w, v) {
+                const char *n;
+
+                assert_se(n = json_variant_string(k));
+
+                if (isempty(n)) {
+                        json_log(k, flags, 0, "Annotation with empty key, refusing.");
+                        return -EINVAL;
+                }
+
+                if (!json_variant_is_string(w)) {
+                        json_log(w, flags, 0, "Annotation has non-string value, refusing.");
+                        return -EINVAL;
+                }
+
+                json_log(k, flags|JSON_DEBUG, 0, "Ignoring annotation '%s' with value '%s'.", n, json_variant_string(w));
+        }
+
+        return 0;
+}
+
+/* Parses the OCI runtime configuration read from f (reported in messages as "<bundle>/config.json")
+ * and, on success, returns a newly allocated Settings object in *ret. Returns 0 on success and a
+ * negative errno-style value on failure. */
+int oci_load(FILE *f, const char *bundle, Settings **ret) {
+
+        static const JsonDispatch table[] = {
+                { "ociVersion", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY },
+                { "process", JSON_VARIANT_OBJECT, oci_process, 0, 0 },
+                { "root", JSON_VARIANT_OBJECT, oci_root, 0, 0 },
+                { "hostname", JSON_VARIANT_STRING, oci_hostname, 0, 0 },
+                { "mounts", JSON_VARIANT_ARRAY, oci_mounts, 0, 0 },
+                { "linux", JSON_VARIANT_OBJECT, oci_linux, 0, 0 },
+                { "hooks", JSON_VARIANT_OBJECT, oci_hooks, 0, 0 },
+                { "annotations", JSON_VARIANT_OBJECT, oci_annotations, 0, 0 },
+                {}
+        };
+
+        _cleanup_(json_variant_unrefp) JsonVariant *oci = NULL;
+        _cleanup_(settings_freep) Settings *s = NULL;
+        unsigned line = 0, column = 0;
+        JsonVariant *v;
+        const char *path;
+        int r;
+
+        assert_se(bundle);
+
+        path = strjoina(bundle, "/config.json");
+
+        r = json_parse_file(f, path, &oci, &line, &column);
+        if (r < 0) {
+                if (line != 0 && column != 0)
+                        return log_error_errno(r, "Failed to parse '%s' at %u:%u: %m",
path, line, column); + else + return log_error_errno(r, "Failed to parse '%s': %m", path); + } + + v = json_variant_by_key(oci, "ociVersion"); + if (!v) { + log_error("JSON file '%s' is not an OCI bundle configuration file. Refusing.", path); + return -EINVAL; + } + if (!streq_ptr(json_variant_string(v), "1.0.0")) { + log_error("OCI bundle version not supported: %s", strna(json_variant_string(v))); + return -EINVAL; + } + + // { + // _cleanup_free_ char *formatted = NULL; + // assert_se(json_variant_format(oci, JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR, &formatted) >= 0); + // fputs(formatted, stdout); + // } + + s = settings_new(); + if (!s) + return log_oom(); + + s->start_mode = START_PID1; + s->resolv_conf = RESOLV_CONF_OFF; + s->link_journal = LINK_NO; + s->timezone = TIMEZONE_OFF; + + s->bundle = strdup(bundle); + if (!s->bundle) + return log_oom(); + + r = json_dispatch(oci, table, oci_unexpected, 0, s); + if (r < 0) + return r; + + if (s->properties) { + r = sd_bus_message_seal(s->properties, 0, 0); + if (r < 0) + return log_error_errno(r, "Cannot seal properties bus message: %m"); + } + + *ret = TAKE_PTR(s); + return 0; +} diff --git a/src/nspawn/nspawn-oci.h b/src/nspawn/nspawn-oci.h new file mode 100644 index 0000000000..d23a2f3dae --- /dev/null +++ b/src/nspawn/nspawn-oci.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +#pragma once + +#include "nspawn-settings.h" + +int oci_load(FILE *f, const char *path, Settings **ret); diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c index 4ca5f7998d..8e2c329665 100644 --- a/src/nspawn/nspawn-register.c +++ b/src/nspawn/nspawn-register.c @@ -112,6 +112,7 @@ int register_machine( unsigned n_mounts, int kill_signal, char **properties, + sd_bus_message *properties_message, bool keep_unit, const char *service) { @@ -185,6 +186,12 @@ int register_machine( if (r < 0) return r; + if (properties_message) { + r = sd_bus_message_copy(m, properties_message, true); + if (r < 0) + return 
bus_log_create_error(r); + } + r = bus_append_unit_property_assignment_many(m, UNIT_SERVICE, properties); if (r < 0) return r; @@ -235,7 +242,8 @@ int allocate_scope( CustomMount *mounts, unsigned n_mounts, int kill_signal, - char **properties) { + char **properties, + sd_bus_message *properties_message) { _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; @@ -289,6 +297,12 @@ int allocate_scope( if (r < 0) return r; + if (properties_message) { + r = sd_bus_message_copy(m, properties_message, true); + if (r < 0) + return bus_log_create_error(r); + } + r = append_machine_properties( m, mounts, diff --git a/src/nspawn/nspawn-register.h b/src/nspawn/nspawn-register.h index 05f5776f23..65a3ae85a7 100644 --- a/src/nspawn/nspawn-register.h +++ b/src/nspawn/nspawn-register.h @@ -7,8 +7,8 @@ #include "nspawn-mount.h" -int register_machine(sd_bus *bus, const char *machine_name, pid_t pid, const char *directory, sd_id128_t uuid, int local_ifindex, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, bool keep_unit, const char *service); +int register_machine(sd_bus *bus, const char *machine_name, pid_t pid, const char *directory, sd_id128_t uuid, int local_ifindex, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, sd_bus_message *properties_message, bool keep_unit, const char *service); int terminate_machine(sd_bus *bus, const char *machine_name); -int allocate_scope(sd_bus *bus, const char *machine_name, pid_t pid, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties); +int allocate_scope(sd_bus *bus, const char *machine_name, pid_t pid, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, sd_bus_message *properties_message); int terminate_scope(sd_bus *bus, const char *machine_name); diff --git 
a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c index 505e8cac40..ab69f24c54 100644 --- a/src/nspawn/nspawn-settings.c +++ b/src/nspawn/nspawn-settings.c @@ -17,6 +17,50 @@ #include "user-util.h" #include "util.h" +Settings *settings_new(void) { + Settings *s; + + s = new(Settings, 1); + if (!s) + return NULL; + + *s = (Settings) { + .start_mode = _START_MODE_INVALID, + .personality = PERSONALITY_INVALID, + + .resolv_conf = _RESOLV_CONF_MODE_INVALID, + .link_journal = _LINK_JOURNAL_INVALID, + .timezone = _TIMEZONE_MODE_INVALID, + + .userns_mode = _USER_NAMESPACE_MODE_INVALID, + .userns_chown = -1, + .uid_shift = UID_INVALID, + .uid_range = UID_INVALID, + + .no_new_privileges = -1, + + .read_only = -1, + .volatile_mode = _VOLATILE_MODE_INVALID, + + .private_network = -1, + .network_veth = -1, + + .full_capabilities = CAPABILITY_QUINTET_NULL, + + .uid = UID_INVALID, + .gid = GID_INVALID, + + .console_mode = _CONSOLE_MODE_INVALID, + .console_width = (unsigned) -1, + .console_height = (unsigned) -1, + + .clone_ns_flags = (unsigned long) -1, + .use_cgns = -1, + }; + + return s; +} + int settings_load(FILE *f, const char *path, Settings **ret) { _cleanup_(settings_freep) Settings *s = NULL; int r; @@ -24,27 +68,10 @@ int settings_load(FILE *f, const char *path, Settings **ret) { assert(path); assert(ret); - s = new0(Settings, 1); + s = settings_new(); if (!s) return -ENOMEM; - s->start_mode = _START_MODE_INVALID; - s->personality = PERSONALITY_INVALID; - s->userns_mode = _USER_NAMESPACE_MODE_INVALID; - s->resolv_conf = _RESOLV_CONF_MODE_INVALID; - s->link_journal = _LINK_JOURNAL_INVALID; - s->timezone = _TIMEZONE_MODE_INVALID; - s->uid_shift = UID_INVALID; - s->uid_range = UID_INVALID; - s->no_new_privileges = -1; - - s->read_only = -1; - s->volatile_mode = _VOLATILE_MODE_INVALID; - s->userns_chown = -1; - - s->private_network = -1; - s->network_veth = -1; - r = config_parse(NULL, path, f, "Exec\0" "Network\0" @@ -66,12 +93,33 @@ int settings_load(FILE *f, 
const char *path, Settings **ret) {
                 s->userns_mode = USER_NAMESPACE_NO;
 
         *ret = TAKE_PTR(s);
-
         return 0;
 }
 
-Settings* settings_free(Settings *s) {
+static void free_oci_hooks(OciHook *h, size_t n) { /* frees path/args/env of n hooks, then the array h itself */
+        size_t i;
 
+        assert(h || n == 0);
+
+        for (i = 0; i < n; i++) {
+                free(h[i].path);
+                strv_free(h[i].args);
+                strv_free(h[i].env);
+        }
+
+        free(h);
+}
+
+void device_node_free_many(DeviceNode *node, size_t n) { /* frees the path of n nodes AND the array node itself */
+        size_t i;
+
+        for (i = 0; i < n; i++)
+                free(node[i].path);
+
+        free(node);
+}
+
+Settings* settings_free(Settings *s) {
         if (!s)
                 return NULL;
 
@@ -96,6 +144,28 @@ Settings* settings_free(Settings *s) {
         expose_port_free_all(s->expose_ports);
 
         custom_mount_free_all(s->custom_mounts, s->n_custom_mounts);
+
+        free(s->bundle);
+        free(s->root);
+
+        free_oci_hooks(s->oci_hooks_prestart, s->n_oci_hooks_prestart);
+        free_oci_hooks(s->oci_hooks_poststart, s->n_oci_hooks_poststart);
+        free_oci_hooks(s->oci_hooks_poststop, s->n_oci_hooks_poststop);
+
+        free(s->slice);
+        sd_bus_message_unref(s->properties);
+
+        free(s->supplementary_gids);
+        /* device_node_free_many() releases the array too; a separate free(s->extra_nodes) would be a double free */
+        device_node_free_many(s->extra_nodes, s->n_extra_nodes);
+        free(s->network_namespace_path);
+
+        strv_free(s->sysctl);
+
+#if HAVE_SECCOMP
+        seccomp_release(s->seccomp);
+#endif
+
         return mfree(s);
 }
 
@@ -122,6 +192,26 @@ bool settings_network_veth(Settings *s) {
                 s->network_zone;
 }
 
+int settings_allocate_properties(Settings *s) { /* lazily creates s->properties; returns 0 or a negative error */
+        _cleanup_(sd_bus_unrefp) sd_bus *bus = NULL;
+        int r;
+
+        assert(s);
+
+        if (s->properties) /* already allocated by an earlier call */
+                return 0;
+
+        r = sd_bus_default_system(&bus);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_new(bus, &s->properties, SD_BUS_MESSAGE_METHOD_CALL);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
 DEFINE_CONFIG_PARSE_ENUM(config_parse_volatile_mode, volatile_mode, VolatileMode, "Failed to parse volatile mode");
 
 int config_parse_expose_port(
@@ -315,6 +405,34 @@ int config_parse_tmpfs(
         return 0;
 }
 
+int config_parse_inaccessible(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char
*section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = inaccessible_mount_parse(&settings->custom_mounts, &settings->n_custom_mounts, rvalue); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid inaccessible file system specification %s: %m", rvalue); + return 0; + } + + return 0; +} + int config_parse_overlay( const char *unit, const char *filename, diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h index a63aa32e90..cc802f77af 100644 --- a/src/nspawn/nspawn-settings.h +++ b/src/nspawn/nspawn-settings.h @@ -4,8 +4,14 @@ #include #include +#if HAVE_SECCOMP +#include +#endif + +#include "sd-bus.h" #include "sd-id128.h" +#include "capability-util.h" #include "conf-parser.h" #include "macro.h" #include "missing_resource.h" @@ -60,6 +66,15 @@ typedef enum TimezoneMode { _TIMEZONE_MODE_INVALID = -1 } TimezoneMode; +typedef enum ConsoleMode { + CONSOLE_INTERACTIVE, + CONSOLE_READ_ONLY, + CONSOLE_PASSIVE, + CONSOLE_PIPE, + _CONSOLE_MODE_MAX, + _CONSOLE_MODE_INVALID = -1, +} ConsoleMode; + typedef enum SettingsMask { SETTING_START_MODE = UINT64_C(1) << 0, SETTING_ENVIRONMENT = UINT64_C(1) << 1, @@ -86,9 +101,14 @@ typedef enum SettingsMask { SETTING_LINK_JOURNAL = UINT64_C(1) << 22, SETTING_TIMEZONE = UINT64_C(1) << 23, SETTING_EPHEMERAL = UINT64_C(1) << 24, - SETTING_RLIMIT_FIRST = UINT64_C(1) << 25, /* we define one bit per resource limit here */ - SETTING_RLIMIT_LAST = UINT64_C(1) << (25 + _RLIMIT_MAX - 1), - _SETTINGS_MASK_ALL = (UINT64_C(1) << (25 + _RLIMIT_MAX)) -1, + SETTING_SLICE = UINT64_C(1) << 25, + SETTING_DIRECTORY = UINT64_C(1) << 26, + SETTING_USE_CGNS = UINT64_C(1) << 27, + SETTING_CLONE_NS_FLAGS = UINT64_C(1) << 28, + SETTING_CONSOLE_MODE = UINT64_C(1) << 29, + SETTING_RLIMIT_FIRST = UINT64_C(1) << 30, /* we define one bit per resource limit 
here */ + SETTING_RLIMIT_LAST = UINT64_C(1) << (30 + _RLIMIT_MAX - 1), + _SETTINGS_MASK_ALL = (UINT64_C(1) << (30 + _RLIMIT_MAX)) -1, _SETTING_FORCE_ENUM_WIDTH = UINT64_MAX } SettingsMask; @@ -101,6 +121,22 @@ assert_cc(sizeof(SettingsMask) == 8); assert_cc(sizeof(SETTING_RLIMIT_FIRST) == 8); assert_cc(sizeof(SETTING_RLIMIT_LAST) == 8); +typedef struct DeviceNode { + char *path; + unsigned major; + unsigned minor; + mode_t mode; + uid_t uid; + gid_t gid; +} DeviceNode; + +typedef struct OciHook { + char *path; + char **args; + char **env; + usec_t timeout; +} OciHook; + typedef struct Settings { /* [Run] */ StartMode start_mode; @@ -150,13 +186,39 @@ typedef struct Settings { char **network_ipvlan; char **network_veth_extra; ExposePort *expose_ports; + + /* Additional fields, that are specific to OCI runtime case */ + char *bundle; + char *root; + OciHook *oci_hooks_prestart, *oci_hooks_poststart, *oci_hooks_poststop; + size_t n_oci_hooks_prestart, n_oci_hooks_poststart, n_oci_hooks_poststop; + char *slice; + sd_bus_message *properties; + CapabilityQuintet full_capabilities; + uid_t uid; + gid_t gid; + gid_t *supplementary_gids; + size_t n_supplementary_gids; + unsigned console_width, console_height; + ConsoleMode console_mode; + DeviceNode *extra_nodes; + size_t n_extra_nodes; + unsigned long clone_ns_flags; + char *network_namespace_path; + int use_cgns; + char **sysctl; +#if HAVE_SECCOMP + scmp_filter_ctx seccomp; +#endif } Settings; +Settings *settings_new(void); int settings_load(FILE *f, const char *path, Settings **ret); Settings* settings_free(Settings *s); bool settings_network_veth(Settings *s); bool settings_private_network(Settings *s); +int settings_allocate_properties(Settings *s); DEFINE_TRIVIAL_CLEANUP_FUNC(Settings*, settings_free); @@ -170,6 +232,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_pivot_root); CONFIG_PARSER_PROTOTYPE(config_parse_bind); CONFIG_PARSER_PROTOTYPE(config_parse_tmpfs); CONFIG_PARSER_PROTOTYPE(config_parse_overlay); 
+CONFIG_PARSER_PROTOTYPE(config_parse_inaccessible); CONFIG_PARSER_PROTOTYPE(config_parse_veth_extra); CONFIG_PARSER_PROTOTYPE(config_parse_network_zone); CONFIG_PARSER_PROTOTYPE(config_parse_boot); @@ -190,3 +253,5 @@ const char *timezone_mode_to_string(TimezoneMode a) _const_; TimezoneMode timezone_mode_from_string(const char *s) _pure_; int parse_link_journal(const char *s, LinkJournal *ret_mode, bool *ret_try); + +void device_node_free_many(DeviceNode *node, size_t n); diff --git a/src/nspawn/nspawn-setuid.c b/src/nspawn/nspawn-setuid.c index f207bcec04..0b26f1a042 100644 --- a/src/nspawn/nspawn-setuid.c +++ b/src/nspawn/nspawn-setuid.c @@ -59,14 +59,41 @@ static int spawn_getent(const char *database, const char *key, pid_t *rpid) { return pipe_fds[0]; } +int change_uid_gid_raw( + uid_t uid, + gid_t gid, + const gid_t *supplementary_gids, + size_t n_supplementary_gids) { + + if (!uid_is_valid(uid)) + uid = 0; + if (!gid_is_valid(gid)) + gid = 0; + + (void) fchown(STDIN_FILENO, uid, gid); + (void) fchown(STDOUT_FILENO, uid, gid); + (void) fchown(STDERR_FILENO, uid, gid); + + if (setgroups(n_supplementary_gids, supplementary_gids) < 0) + return log_error_errno(errno, "Failed to set auxiliary groups: %m"); + + if (setresgid(gid, gid, gid) < 0) + return log_error_errno(errno, "setresgid() failed: %m"); + + if (setresuid(uid, uid, uid) < 0) + return log_error_errno(errno, "setresuid() failed: %m"); + + return 0; +} + int change_uid_gid(const char *user, char **_home) { char *x, *u, *g, *h; const char *word, *state; - _cleanup_free_ uid_t *uids = NULL; + _cleanup_free_ gid_t *gids = NULL; _cleanup_free_ char *home = NULL, *line = NULL; _cleanup_fclose_ FILE *f = NULL; _cleanup_close_ int fd = -1; - unsigned n_uids = 0; + unsigned n_gids = 0; size_t sz = 0, l; uid_t uid; gid_t gid; @@ -189,10 +216,10 @@ int change_uid_gid(const char *user, char **_home) { memcpy(c, word, l); c[l] = 0; - if (!GREEDY_REALLOC(uids, sz, n_uids+1)) + if (!GREEDY_REALLOC(gids, sz, 
n_gids+1)) return log_oom(); - r = parse_uid(c, &uids[n_uids++]); + r = parse_gid(c, &gids[n_gids++]); if (r < 0) return log_error_errno(r, "Failed to parse group data from getent: %m"); } @@ -205,18 +232,9 @@ int change_uid_gid(const char *user, char **_home) { if (r < 0 && !IN_SET(r, -EEXIST, -ENOTDIR)) return log_error_errno(r, "Failed to make home directory: %m"); - (void) fchown(STDIN_FILENO, uid, gid); - (void) fchown(STDOUT_FILENO, uid, gid); - (void) fchown(STDERR_FILENO, uid, gid); - - if (setgroups(n_uids, uids) < 0) - return log_error_errno(errno, "Failed to set auxiliary groups: %m"); - - if (setresgid(gid, gid, gid) < 0) - return log_error_errno(errno, "setresgid() failed: %m"); - - if (setresuid(uid, uid, uid) < 0) - return log_error_errno(errno, "setresuid() failed: %m"); + r = change_uid_gid_raw(uid, gid, gids, n_gids); + if (r < 0) + return r; if (_home) *_home = TAKE_PTR(home); diff --git a/src/nspawn/nspawn-setuid.h b/src/nspawn/nspawn-setuid.h index 0ae47cb6b3..9a2b05ebbb 100644 --- a/src/nspawn/nspawn-setuid.h +++ b/src/nspawn/nspawn-setuid.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1+ */ #pragma once -int change_uid_gid(const char *user, char **ret); +int change_uid_gid_raw(uid_t uid, gid_t gid, const gid_t *supplementary_gids, size_t n_supplementary_gids); +int change_uid_gid(const char *user, char **ret_home); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 05203057b6..b2cf419484 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -68,6 +68,7 @@ #include "nspawn-expose-ports.h" #include "nspawn-mount.h" #include "nspawn-network.h" +#include "nspawn-oci.h" #include "nspawn-patch-uid.h" #include "nspawn-register.h" #include "nspawn-seccomp.h" @@ -86,6 +87,9 @@ #include "raw-clone.h" #include "rlimit-util.h" #include "rm-rf.h" +#if HAVE_SECCOMP +#include "seccomp-util.h" +#endif #include "selinux-util.h" #include "signal-util.h" #include "socket-util.h" @@ -94,6 +98,7 @@ #include "string-table.h" #include 
"string-util.h" #include "strv.h" +#include "sysctl-util.h" #include "terminal-util.h" #include "tmpfile-util.h" #include "umask-util.h" @@ -124,12 +129,16 @@ static char *arg_chdir = NULL; static char *arg_pivot_root_new = NULL; static char *arg_pivot_root_old = NULL; static char *arg_user = NULL; +static uid_t arg_uid = UID_INVALID; +static gid_t arg_gid = GID_INVALID; +static gid_t* arg_supplementary_gids = NULL; +static size_t arg_n_supplementary_gids = 0; static sd_id128_t arg_uuid = {}; static char *arg_machine = NULL; /* The name used by the host to refer to this */ static char *arg_hostname = NULL; /* The name the payload sees by default */ static const char *arg_selinux_context = NULL; static const char *arg_selinux_apifs_context = NULL; -static const char *arg_slice = NULL; +static char *arg_slice = NULL; static bool arg_private_network = false; static bool arg_read_only = false; static StartMode arg_start_mode = START_PID1; @@ -163,6 +172,7 @@ static uint64_t arg_caps_retain = (1ULL << CAP_SYS_PTRACE) | (1ULL << CAP_SYS_RESOURCE) | (1ULL << CAP_SYS_TTY_CONFIG); +static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL; static CustomMount *arg_custom_mounts = NULL; static size_t arg_n_custom_mounts = 0; static char **arg_setenv = NULL; @@ -179,9 +189,11 @@ static char *arg_network_zone = NULL; static char *arg_network_namespace_path = NULL; static unsigned long arg_personality = PERSONALITY_INVALID; static char *arg_image = NULL; +static char *arg_oci_bundle = NULL; static VolatileMode arg_volatile_mode = VOLATILE_NO; static ExposePort *arg_expose_ports = NULL; static char **arg_property = NULL; +static sd_bus_message *arg_property_message = NULL; static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; static bool arg_userns_chown = false; @@ -199,6 +211,9 @@ static void *arg_root_hash = NULL; static size_t arg_root_hash_size = 0; static char **arg_syscall_whitelist 
= NULL; static char **arg_syscall_blacklist = NULL; +#if HAVE_SECCOMP +static scmp_filter_ctx arg_seccomp = NULL; +#endif static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {}; static bool arg_no_new_privileges = false; static int arg_oom_score_adjust = 0; @@ -207,6 +222,11 @@ static cpu_set_t *arg_cpuset = NULL; static unsigned arg_cpuset_ncpus = 0; static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO; static TimezoneMode arg_timezone = TIMEZONE_AUTO; +static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1; +static DeviceNode* arg_extra_nodes = NULL; +static size_t arg_n_extra_nodes = 0; +static char **arg_sysctl = NULL; +static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID; static int help(void) { _cleanup_free_ char *link = NULL; @@ -229,6 +249,7 @@ static int help(void) { " -x --ephemeral Run container with snapshot of root directory, and\n" " remove it after exit\n" " -i --image=PATH File system device or disk image for the container\n" + " --oci-bundle=PATH OCI bundle directory\n" " --root-hash=HASH Specify verity root hash\n" " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n" " -b --boot Boot up full system (i.e. 
invoke init)\n" @@ -297,6 +318,8 @@ static int help(void) { " the container\n" " --bind-ro=PATH[:PATH[:OPTIONS]\n" " Similar, but creates a read-only bind mount\n" + " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n" + " it\n" " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n" " --overlay=PATH[:PATH...]:PATH\n" " Create an overlay mount from the host to \n" @@ -310,6 +333,9 @@ static int help(void) { " --volatile[=MODE] Run the system in volatile mode\n" " --settings=BOOLEAN Load additional settings from .nspawn file\n" " --notify-ready=BOOLEAN Receive notifications from the child init process\n" + " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n" + " set up for the container.\n" + " -P --pipe Equivalent to --console=pipe\n" "\nSee the %s for details.\n" , program_invocation_short_name , link @@ -401,7 +427,9 @@ static void parse_share_ns_env(const char *name, unsigned long ns_flag) { return; if (r < 0) log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name); + arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag); + arg_settings_mask |= SETTING_CLONE_NS_FLAGS; } static void parse_mount_settings_env(void) { @@ -446,9 +474,20 @@ static void parse_environment(void) { /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use, * even if it is supported. If not supported, it has no effect. 
*/ - r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS"); - if (r == 0 || !cg_ns_supported()) + if (!cg_ns_supported()) arg_use_cgns = false; + else { + r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS"); + if (r < 0) { + if (r != -ENXIO) + log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS, ignoring: %m"); + + arg_use_cgns = true; + } else { + arg_use_cgns = r > 0; + arg_settings_mask |= SETTING_USE_CGNS; + } + } e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE"); if (e) @@ -471,6 +510,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_TMPFS, ARG_OVERLAY, ARG_OVERLAY_RO, + ARG_INACCESSIBLE, ARG_SHARE_SYSTEM, ARG_REGISTER, ARG_KEEP_UNIT, @@ -501,6 +541,9 @@ static int parse_argv(int argc, char *argv[]) { ARG_CPU_AFFINITY, ARG_RESOLV_CONF, ARG_TIMEZONE, + ARG_CONSOLE, + ARG_PIPE, + ARG_OCI_BUNDLE, }; static const struct option options[] = { @@ -524,6 +567,7 @@ static int parse_argv(int argc, char *argv[]) { { "tmpfs", required_argument, NULL, ARG_TMPFS }, { "overlay", required_argument, NULL, ARG_OVERLAY }, { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO }, + { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE }, { "machine", required_argument, NULL, 'M' }, { "hostname", required_argument, NULL, ARG_HOSTNAME }, { "slice", required_argument, NULL, 'S' }, @@ -561,6 +605,9 @@ static int parse_argv(int argc, char *argv[]) { { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY }, { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF }, { "timezone", required_argument, NULL, ARG_TIMEZONE }, + { "console", required_argument, NULL, ARG_CONSOLE }, + { "pipe", no_argument, NULL, ARG_PIPE }, + { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE }, {} }; @@ -572,7 +619,7 @@ static int parse_argv(int argc, char *argv[]) { assert(argc >= 0); assert(argv); - while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0) switch (c) { case 
'h': @@ -585,18 +632,31 @@ static int parse_argv(int argc, char *argv[]) { r = parse_path_argument_and_warn(optarg, false, &arg_directory); if (r < 0) return r; + + arg_settings_mask |= SETTING_DIRECTORY; break; case ARG_TEMPLATE: r = parse_path_argument_and_warn(optarg, false, &arg_template); if (r < 0) return r; + + arg_settings_mask |= SETTING_DIRECTORY; break; case 'i': r = parse_path_argument_and_warn(optarg, false, &arg_image); if (r < 0) return r; + + arg_settings_mask |= SETTING_DIRECTORY; + break; + + case ARG_OCI_BUNDLE: + r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle); + if (r < 0) + return r; + break; case 'x': @@ -704,6 +764,7 @@ static int parse_argv(int argc, char *argv[]) { if (r < 0) return r; + arg_settings_mask |= SETTING_NETWORK; break; case 'b': @@ -737,7 +798,11 @@ static int parse_argv(int argc, char *argv[]) { break; case 'S': - arg_slice = optarg; + r = free_and_strdup(&arg_slice, optarg); + if (r < 0) + return log_oom(); + + arg_settings_mask |= SETTING_SLICE; break; case 'M': @@ -792,7 +857,6 @@ static int parse_argv(int argc, char *argv[]) { r = extract_first_word(&p, &t, ",", 0); if (r < 0) return log_error_errno(r, "Failed to parse capability %s.", t); - if (r == 0) break; @@ -870,6 +934,14 @@ static int parse_argv(int argc, char *argv[]) { arg_settings_mask |= SETTING_CUSTOM_MOUNTS; break; + case ARG_INACCESSIBLE: + r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg); + + arg_settings_mask |= SETTING_CUSTOM_MOUNTS; + break; + case 'E': { char **n; @@ -1252,6 +1324,34 @@ static int parse_argv(int argc, char *argv[]) { arg_settings_mask |= SETTING_TIMEZONE; break; + case ARG_CONSOLE: + if (streq(optarg, "interactive")) + arg_console_mode = CONSOLE_INTERACTIVE; + else if (streq(optarg, "read-only")) + arg_console_mode = CONSOLE_READ_ONLY; + else if (streq(optarg, "passive")) + arg_console_mode 
= CONSOLE_PASSIVE; + else if (streq(optarg, "pipe")) + arg_console_mode = CONSOLE_PIPE; + else if (streq(optarg, "help")) + puts("interactive\n" + "read-only\n" + "passive\n" + "pipe"); + else { + log_error("Unknown console mode: %s", optarg); + return -EINVAL; + } + + arg_settings_mask |= SETTING_CONSOLE_MODE; + break; + + case 'P': + case ARG_PIPE: + arg_console_mode = CONSOLE_PIPE; + arg_settings_mask |= SETTING_CONSOLE_MODE; + break; + case '?': return -EINVAL; @@ -1275,7 +1375,10 @@ static int parse_argv(int argc, char *argv[]) { * --directory=". */ arg_directory = TAKE_PTR(arg_template); - arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; + arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus; + + /* Make sure to parse environment before we reset the settings mask below */ + parse_environment(); /* Load all settings from .nspawn files */ if (mask_no_settings) @@ -1337,7 +1440,8 @@ static int verify_arguments(void) { return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support."); if (arg_userns_chown && arg_read_only) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--read-only and --private-users-chown may not be combined."); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--read-only and --private-users-chown may not be combined."); /* We don't support --private-users-chown together with any of the volatile modes since we couldn't * change the read-only part of the tree (i.e. 
/usr) anyway, or because it would trigger a massive @@ -1345,17 +1449,18 @@ static int verify_arguments(void) { if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined."); - /* If --network-namespace-path is given with any other network-related option, - * we need to error out, to avoid conflicts between different network options. */ + /* If --network-namespace-path is given with any other network-related option, we need to error out, + * to avoid conflicts between different network options. */ if (arg_network_namespace_path && (arg_network_interfaces || arg_network_macvlan || arg_network_ipvlan || arg_network_veth_extra || arg_network_bridge || arg_network_zone || arg_network_veth || arg_private_network)) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path cannot be combined with other network options."); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options."); if (arg_network_bridge && arg_network_zone) - return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-bridge= and --network-zone= may not be combined."); + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "--network-bridge= and --network-zone= may not be combined."); if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. 
Mounting sysfs with --private-users requires --private-network."); @@ -1729,9 +1834,9 @@ static int copy_devnodes(const char *dest) { "tty\0" "net/tun\0"; + _cleanup_umask_ mode_t u; const char *d; int r = 0; - _cleanup_umask_ mode_t u; assert(dest); @@ -1811,6 +1916,32 @@ static int copy_devnodes(const char *dest) { return r; } +static int make_extra_nodes(const char *dest) { + _cleanup_umask_ mode_t u; + size_t i; + int r; + + u = umask(0000); + + for (i = 0; i < arg_n_extra_nodes; i++) { + _cleanup_free_ char *path = NULL; + DeviceNode *n = arg_extra_nodes + i; + + path = prefix_root(dest, n->path); + if (!path) + return log_oom(); + + if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0) + return log_error_errno(errno, "Failed to create device node '%s': %m", path); + + r = chmod_and_chown(path, n->mode, n->uid, n->gid); + if (r < 0) + return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path); + } + + return 0; +} + static int setup_pts(const char *dest) { _cleanup_free_ char *options = NULL; const char *p; @@ -1867,10 +1998,12 @@ static int setup_dev_console(const char *dest, const char *console) { int r; assert(dest); - assert(console); u = umask(0000); + if (!console) + return 0; + r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift); if (r < 0) return log_error_errno(r, "Failed to correct access mode for TTY: %m"); @@ -2108,8 +2241,40 @@ static int setup_journal(const char *directory) { return 0; } -static int drop_capabilities(void) { - return capability_bounding_set_drop(arg_caps_retain, false); +static int drop_capabilities(uid_t uid) { + CapabilityQuintet q; + + /* Let's initialize all five capability sets to something valid. If the quintet was configured via + * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from + * arg_caps_retain. 
*/ + + if (capability_quintet_is_set(&arg_full_capabilities)) { + q = arg_full_capabilities; + + if (q.bounding == (uint64_t) -1) + q.bounding = uid == 0 ? arg_caps_retain : 0; + + if (q.effective == (uint64_t) -1) + q.effective = uid == 0 ? q.bounding : 0; + + if (q.inheritable == (uint64_t) -1) + q.inheritable = uid == 0 ? q.bounding : 0; + + if (q.permitted == (uint64_t) -1) + q.permitted = uid == 0 ? q.bounding : 0; + + if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported()) + q.ambient = 0; + } else + q = (CapabilityQuintet) { + .bounding = arg_caps_retain, + .effective = uid == 0 ? arg_caps_retain : 0, + .inheritable = uid == 0 ? arg_caps_retain : 0, + .permitted = uid == 0 ? arg_caps_retain : 0, + .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1, + }; + + return capability_quintet_enforce(&q); } static int reset_audit_loginuid(void) { @@ -2496,6 +2661,77 @@ static int determine_uid_shift(const char *directory) { return 0; } +static unsigned long effective_clone_ns_flags(void) { + unsigned long flags = arg_clone_ns_flags; + + if (arg_private_network) + flags |= CLONE_NEWNET; + if (arg_use_cgns) + flags |= CLONE_NEWCGROUP; + if (arg_userns_mode != USER_NAMESPACE_NO) + flags |= CLONE_NEWUSER; + + return flags; +} + +static int patch_sysctl(void) { + + /* This table is inspired by runc's sysctl() function */ + static const struct { + const char *key; + bool prefix; + unsigned long clone_flags; + } safe_sysctl[] = { + { "kernel.hostname", false, CLONE_NEWUTS }, + { "kernel.domainname", false, CLONE_NEWUTS }, + { "kernel.msgmax", false, CLONE_NEWIPC }, + { "kernel.msgmnb", false, CLONE_NEWIPC }, + { "kernel.msgmni", false, CLONE_NEWIPC }, + { "kernel.sem", false, CLONE_NEWIPC }, + { "kernel.shmall", false, CLONE_NEWIPC }, + { "kernel.shmmax", false, CLONE_NEWIPC }, + { "kernel.shmmni", false, CLONE_NEWIPC }, + { "fs.mqueue.", true, CLONE_NEWIPC }, + { "net.", true, CLONE_NEWNET }, + }; + + unsigned long flags; + char **k, **v; + int r; + 
+ flags = effective_clone_ns_flags(); + + STRV_FOREACH_PAIR(k, v, arg_sysctl) { + bool good = false; + size_t i; + + for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) { + + if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags)) + continue; + + if (safe_sysctl[i].prefix) + good = startswith(*k, safe_sysctl[i].key); + else + good = streq(*k, safe_sysctl[i].key); + + if (good) + break; + } + + if (!good) { + log_error("Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k); + return -EPERM; + } + + r = sysctl_write(*k, *v); + if (r < 0) + return log_error_errno(r, "Failed to write sysctl '%s': %m", *k); + } + + return 0; +} + static int inner_child( Barrier *barrier, const char *directory, @@ -2522,7 +2758,7 @@ static int inner_child( }; const char *exec_target; _cleanup_strv_free_ char **env_use = NULL; - int r; + int r, which_failed; /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID, @@ -2539,6 +2775,8 @@ static int inner_child( assert(directory); assert(kmsg_socket >= 0); + log_debug("Inner child is initializing."); + if (arg_userns_mode != USER_NAMESPACE_NO) { /* Tell the parent, that it now can write the UID map. 
*/ (void) barrier_place(barrier); /* #1 */ @@ -2608,6 +2846,18 @@ static int inner_child( return r; kmsg_socket = safe_close(kmsg_socket); + r = mount_custom( + "/", + arg_custom_mounts, + arg_n_custom_mounts, + false, + 0, + 0, + arg_selinux_apifs_context, + true); + if (r < 0) + return r; + if (setsid() < 0) return log_error_errno(errno, "setsid() failed: %m"); @@ -2621,6 +2871,10 @@ static int inner_child( rtnl_socket = safe_close(rtnl_socket); } + r = patch_sysctl(); + if (r < 0) + return r; + if (arg_oom_score_adjust_set) { r = set_oom_score_adjust(arg_oom_score_adjust); if (r < 0) @@ -2631,10 +2885,6 @@ static int inner_child( if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0) return log_error_errno(errno, "Failed to set CPU affinity: %m"); - r = drop_capabilities(); - if (r < 0) - return log_error_errno(r, "drop_capabilities() failed: %m"); - (void) setup_hostname(); if (arg_personality != PERSONALITY_INVALID) { @@ -2647,16 +2897,51 @@ static int inner_child( return log_error_errno(r, "personality() failed: %m"); } + r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed); + if (r < 0) + return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed)); + +#if HAVE_SECCOMP + if (arg_seccomp) { + + if (is_seccomp_available()) { + + r = seccomp_load(arg_seccomp); + if (IN_SET(r, -EPERM, -EACCES)) + return log_error_errno(r, "Failed to install seccomp filter: %m"); + if (r < 0) + log_debug_errno(r, "Failed to install seccomp filter: %m"); + } + } else +#endif + { + r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist); + if (r < 0) + return r; + } + #if HAVE_SELINUX if (arg_selinux_context) if (setexeccon(arg_selinux_context) < 0) return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context); #endif - r = change_uid_gid(arg_user, &home); + /* Make sure we keep the caps across the uid/gid dropping, so that we can retain 
some selected caps + * if we need to later on. */ + if (prctl(PR_SET_KEEPCAPS, 1) < 0) + return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m"); + + if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid)) + r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids); + else + r = change_uid_gid(arg_user, &home); if (r < 0) return r; + r = drop_capabilities(getuid()); + if (r < 0) + return log_error_errno(r, "Dropping capabilities failed: %m"); + if (arg_no_new_privileges) if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) return log_error_errno(errno, "Failed to disable new privileges: %m"); @@ -2668,10 +2953,14 @@ static int inner_child( if (envp[n_env]) n_env++; - if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) || - (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) || - (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) - return log_oom(); + if (home || !uid_is_valid(arg_uid) || arg_uid == 0) + if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0) + return log_oom(); + + if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0) + if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 || + asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0) + return log_oom(); assert(!sd_id128_is_null(arg_uuid)); @@ -2711,6 +3000,8 @@ static int inner_child( return r; } + log_debug("Inner child completed, invoking payload."); + /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need * it again. Note that the other fds closed here are at least the locking and barrier fds. 
*/ @@ -2803,7 +3094,6 @@ static int outer_child( const char *directory, const char *console, DissectedImage *dissected_image, - bool interactive, bool secondary, int pid_socket, int uuid_socket, @@ -2816,9 +3106,9 @@ static int outer_child( int netns_fd) { _cleanup_close_ int fd = -1; - int r, which_failed; pid_t pid; ssize_t l; + int r; /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID, @@ -2827,18 +3117,21 @@ static int outer_child( assert(barrier); assert(directory); - assert(console); assert(pid_socket >= 0); assert(uuid_socket >= 0); assert(notify_socket >= 0); assert(kmsg_socket >= 0); + log_debug("Outer child is initializing."); + if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m"); - if (interactive) { + if (arg_console_mode != CONSOLE_PIPE) { int terminal; + assert(console); + terminal = open_terminal(console, O_RDWR); if (terminal < 0) return log_error_errno(terminal, "Failed to open console: %m"); @@ -2992,7 +3285,12 @@ static int outer_child( if (r < 0) return r; - dev_setup(directory, arg_uid_shift, arg_uid_shift); + r = make_extra_nodes(directory); + if (r < 0) + return r; + + (void) dev_setup(directory, arg_uid_shift, arg_uid_shift); + (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift); r = setup_pts(directory); if (r < 0) @@ -3010,10 +3308,6 @@ static int outer_child( if (r < 0) return r; - r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist); - if (r < 0) - return r; - r = setup_timezone(directory); if (r < 0) return r; @@ -3037,7 +3331,8 @@ static int outer_child( arg_userns_mode != USER_NAMESPACE_NO, arg_uid_shift, arg_uid_range, - arg_selinux_apifs_context); + arg_selinux_apifs_context, + false); if (r < 0) return r; @@ -3062,10 +3357,6 @@ static int outer_child( if (fd < 0) return 
fd; - r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed); - if (r < 0) - return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed)); - pid = raw_clone(SIGCHLD|CLONE_NEWNS| arg_clone_ns_flags | (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0)); @@ -3318,6 +3609,15 @@ static int merge_settings(Settings *settings, const char *path) { if ((arg_settings_mask & SETTING_EPHEMERAL) == 0) arg_ephemeral = settings->ephemeral; + if ((arg_settings_mask & SETTING_DIRECTORY) == 0 && + settings->root) { + + if (!arg_settings_trusted) + log_warning("Ignoring root directory setting, file %s is not trusted.", path); + else + free_and_replace(arg_directory, settings->root); + } + if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 && settings->pivot_root_new) { free_and_replace(arg_pivot_root_new, settings->pivot_root_new); @@ -3332,16 +3632,36 @@ static int merge_settings(Settings *settings, const char *path) { settings->environment) strv_free_and_replace(arg_setenv, settings->environment); - if ((arg_settings_mask & SETTING_USER) == 0 && - settings->user) - free_and_replace(arg_user, settings->user); + if ((arg_settings_mask & SETTING_USER) == 0) { + + if (settings->user) + free_and_replace(arg_user, settings->user); + + if (uid_is_valid(settings->uid)) + arg_uid = settings->uid; + if (gid_is_valid(settings->gid)) + arg_gid = settings->gid; + if (settings->n_supplementary_gids > 0) { + free_and_replace(arg_supplementary_gids, settings->supplementary_gids); + arg_n_supplementary_gids = settings->n_supplementary_gids; + } + } if ((arg_settings_mask & SETTING_CAPABILITY) == 0) { - uint64_t plus; + uint64_t plus, minus; + + /* Note that we copy both the simple plus/minus caps here, and the full quintet from the + * Settings structure */ plus = settings->capability; - if (settings_private_network(settings)) - plus |= (1ULL << CAP_NET_ADMIN); + minus = settings->drop_capability; + + if 
((arg_settings_mask & SETTING_NETWORK) == 0) { + if (settings_private_network(settings)) + plus |= UINT64_C(1) << CAP_NET_ADMIN; + else + minus |= UINT64_C(1) << CAP_NET_ADMIN; + } if (!arg_settings_trusted && plus != 0) { if (settings->capability != 0) @@ -3349,7 +3669,15 @@ static int merge_settings(Settings *settings, const char *path) { } else arg_caps_retain |= plus; - arg_caps_retain &= ~settings->drop_capability; + arg_caps_retain &= ~minus; + + /* Copy the full capabilities over too */ + if (capability_quintet_is_set(&settings->full_capabilities)) { + if (!arg_settings_trusted) + log_warning("Ignoring capabilitiy settings, file %s is not trusted.", path); + else + arg_full_capabilities = settings->full_capabilities; + } } if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 && @@ -3398,7 +3726,8 @@ static int merge_settings(Settings *settings, const char *path) { settings->network_interfaces || settings->network_macvlan || settings->network_ipvlan || - settings->network_veth_extra)) { + settings->network_veth_extra || + settings->network_namespace_path)) { if (!arg_settings_trusted) log_warning("Ignoring network settings, file %s is not trusted.", path); @@ -3413,6 +3742,8 @@ static int merge_settings(Settings *settings, const char *path) { free_and_replace(arg_network_bridge, settings->network_bridge); free_and_replace(arg_network_zone, settings->network_zone); + + free_and_replace(arg_network_namespace_path, settings->network_namespace_path); } } @@ -3445,12 +3776,21 @@ static int merge_settings(Settings *settings, const char *path) { if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) { - if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist)) + if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist)) log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path); else { strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist); strv_free_and_replace(arg_syscall_blacklist, 
settings->syscall_blacklist); } + +#if HAVE_SECCOMP + if (!arg_settings_trusted && settings->seccomp) + log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path); + else { + seccomp_release(arg_seccomp); + arg_seccomp = TAKE_PTR(settings->seccomp); + } +#endif } for (rl = 0; rl < _RLIMIT_MAX; rl ++) { @@ -3519,6 +3859,55 @@ static int merge_settings(Settings *settings, const char *path) { settings->timezone != _TIMEZONE_MODE_INVALID) arg_timezone = settings->timezone; + if ((arg_settings_mask & SETTING_SLICE) == 0 && + settings->slice) { + + if (!arg_settings_trusted) + log_warning("Ignoring slice setting, file '%s' is not trusted.", path); + else + free_and_replace(arg_slice, settings->slice); + } + + if ((arg_settings_mask & SETTING_USE_CGNS) == 0 && + settings->use_cgns >= 0) { + + if (!arg_settings_trusted) + log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path); + else + arg_use_cgns = settings->use_cgns; + } + + if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 && + settings->clone_ns_flags != (unsigned long) -1) { + + if (!arg_settings_trusted) + log_warning("Ignoring namespace setting, file '%s' is not trusted.", path); + else + arg_clone_ns_flags = settings->clone_ns_flags; + } + + if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 && + settings->console_mode >= 0) { + + if (!arg_settings_trusted) + log_warning("Ignoring console mode setting, file '%s' is not trusted.", path); + else + arg_console_mode = settings->console_mode; + } + + /* The following properties can only be set through the OCI settings logic, not from the command line, hence we + * don't consult arg_settings_mask for them. 
*/ + + sd_bus_message_unref(arg_property_message); + arg_property_message = TAKE_PTR(settings->properties); + + arg_console_width = settings->console_width; + arg_console_height = settings->console_height; + + device_node_free_many(arg_extra_nodes, arg_n_extra_nodes); + arg_extra_nodes = TAKE_PTR(settings->extra_nodes); + arg_n_extra_nodes = settings->n_extra_nodes; + return 0; } @@ -3529,6 +3918,9 @@ static int load_settings(void) { const char *fn, *i; int r; + if (arg_oci_bundle) + return 0; + /* If all settings are masked, there's no point in looking for * the settings file */ if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL) @@ -3596,10 +3988,27 @@ static int load_settings(void) { return merge_settings(settings, p); } +static int load_oci_bundle(void) { + _cleanup_(settings_freep) Settings *settings = NULL; + int r; + + if (!arg_oci_bundle) + return 0; + + /* By default let's trust OCI bundles */ + if (arg_settings_trusted < 0) + arg_settings_trusted = true; + + r = oci_load(NULL, arg_oci_bundle, &settings); + if (r < 0) + return r; + + return merge_settings(settings, arg_oci_bundle); +} + static int run(int master, const char* console, DissectedImage *dissected_image, - bool interactive, bool secondary, FDSet *fds, char veth_name[IFNAMSIZ], bool *veth_created, @@ -3630,7 +4039,6 @@ static int run(int master, _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; ContainerStatus container_status = 0; - char last_char = 0; int ifi = 0, r; ssize_t l; sigset_t mask_chld; @@ -3732,7 +4140,6 @@ static int run(int master, arg_directory, console, dissected_image, - interactive, secondary, pid_socket_pair[1], uuid_socket_pair[1], @@ -3944,6 +4351,7 @@ static int run(int master, arg_custom_mounts, arg_n_custom_mounts, arg_kill_signal, arg_property, + arg_property_message, arg_keep_unit, arg_container_service_name); if (r < 0) @@ -3957,7 +4365,8 @@ static int run(int master, arg_slice, 
arg_custom_mounts, arg_n_custom_mounts, arg_kill_signal, - arg_property); + arg_property, + arg_property_message); if (r < 0) return r; @@ -4046,22 +4455,32 @@ static int run(int master, rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); - r = pty_forward_new(event, master, - PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), - &forward); - if (r < 0) - return log_error_errno(r, "Failed to create PTY forwarder: %m"); + if (IN_SET(arg_console_mode, CONSOLE_INTERACTIVE, CONSOLE_READ_ONLY)) { /* a PTY forwarder is created for these two console modes only */ + assert(master >= 0); + + r = pty_forward_new(event, master, + PTY_FORWARD_IGNORE_VHANGUP | (arg_console_mode == CONSOLE_READ_ONLY ? PTY_FORWARD_READ_ONLY : 0), + &forward); + if (r < 0) + return log_error_errno(r, "Failed to create PTY forwarder: %m"); + + if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1) /* skipped when both are (unsigned) -1, presumably the unset marker */ + (void) pty_forward_set_width_height(forward, arg_console_width, arg_console_height); /* best effort: the result is deliberately ignored */ + } r = sd_event_loop(event); if (r < 0) return log_error_errno(r, "Failed to run event loop: %m"); - pty_forward_get_last_char(forward, &last_char); + if (forward) { /* presumably NULL when no forwarder was created above (its initializer is outside this view) */ + char last_char = 0; - forward = pty_forward_free(forward); + (void) pty_forward_get_last_char(forward, &last_char); + forward = pty_forward_free(forward); - if (!arg_quiet && last_char != '\n') - putc('\n', stdout); + if (!arg_quiet && last_char != '\n') /* finish the output with a newline unless the last forwarded character already was one */ + putc('\n', stdout); + } /* Kill if it is not dead yet anyway */ if (bus) { @@ -4185,7 +4604,7 @@ int main(int argc, char *argv[]) { pid_t pid = 0; union in_addr_union exposed = {}; _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT; - bool interactive, veth_created = false, remove_tmprootdir = false; + bool veth_created = false, remove_tmprootdir = false; char tmprootdir[] = "/tmp/nspawn-root-XXXXXX"; _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL; _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL; @@ -4210,6 +4629,10 @@ int main(int
argc, char *argv[]) { if (r < 0) goto finish; + r = load_oci_bundle(); /* returns 0 right away when arg_oci_bundle is unset */ + if (r < 0) + goto finish; + r = determine_names(); if (r < 0) goto finish; @@ -4218,8 +4641,6 @@ int main(int argc, char *argv[]) { if (r < 0) goto finish; - parse_environment(); - r = cg_unified_flush(); if (r < 0) { log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m"); @@ -4485,31 +4906,37 @@ int main(int argc, char *argv[]) { if (r < 0) goto finish; - interactive = - isatty(STDIN_FILENO) > 0 && - isatty(STDOUT_FILENO) > 0; + if (arg_console_mode < 0) /* no console mode selected so far: derive one from whether stdin and stdout are TTYs */ + arg_console_mode = + isatty(STDIN_FILENO) > 0 && + isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY; /* && binds tighter than ?:, so both isatty() checks must succeed for interactive mode */ - master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); - if (master < 0) { - r = log_error_errno(errno, "Failed to acquire pseudo tty: %m"); - goto finish; - } + if (arg_console_mode == CONSOLE_PIPE) /* in pipe mode our STDERR is passed on to the container, so force quiet mode to keep our own log output out of it */ + arg_quiet = true; - r = ptsname_malloc(master, &console); - if (r < 0) { - r = log_error_errno(r, "Failed to determine tty name: %m"); - goto finish; - } - - if (arg_selinux_apifs_context) { - r = mac_selinux_apply(console, arg_selinux_apifs_context); - if (r < 0) + if (arg_console_mode != CONSOLE_PIPE) { /* every mode other than pipe allocates a pseudo TTY; in pipe mode master and console are left untouched here */ + master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK); + if (master < 0) { + r = log_error_errno(errno, "Failed to acquire pseudo tty: %m"); goto finish; - } + } - if (unlockpt(master) < 0) { - r = log_error_errno(errno, "Failed to unlock tty: %m"); - goto finish; + r = ptsname_malloc(master, &console); + if (r < 0) { + r = log_error_errno(r, "Failed to determine tty name: %m"); + goto finish; + } + + if (arg_selinux_apifs_context) { + r = mac_selinux_apply(console, arg_selinux_apifs_context); + if (r < 0) + goto finish; + } + + if (unlockpt(master) < 0) { + r = log_error_errno(errno, "Failed to unlock tty: %m"); + goto finish; + } } if (!arg_quiet) @@ -4527,7 +4954,7 @@ int main(int argc,
char *argv[]) { r = run(master, console, dissected_image, - interactive, secondary, + secondary, fds, veth_name, &veth_created, &exposed, @@ -4592,6 +5019,7 @@ finish: free(arg_machine); free(arg_hostname); free(arg_user); + free(arg_supplementary_gids); free(arg_pivot_root_new); free(arg_pivot_root_old); free(arg_chdir); @@ -4605,13 +5033,21 @@ finish: free(arg_network_zone); free(arg_network_namespace_path); strv_free(arg_property); + sd_bus_message_unref(arg_property_message); /* releases the message merge_settings() took over from settings->properties, if any */ custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts); expose_port_free_all(arg_expose_ports); free(arg_root_hash); rlimit_free_all(arg_rlimit); strv_free(arg_syscall_whitelist); strv_free(arg_syscall_blacklist); +#if HAVE_SECCOMP + seccomp_release(arg_seccomp); /* only compiled in when HAVE_SECCOMP is set, matching the guarded assignment in merge_settings() */ +#endif arg_cpuset = cpu_set_mfree(arg_cpuset); + free(arg_oci_bundle); + device_node_free_many(arg_extra_nodes, arg_n_extra_nodes); /* array and count are the pair assigned together in merge_settings() */ + strv_free(arg_sysctl); + free(arg_slice); return r < 0 ? EXIT_FAILURE : ret; }