Merge pull request #9762 from poettering/nspawn-oci

OCI runtime support for nspawn
This commit is contained in:
Zbigniew Jędrzejewski-Szmek 2019-03-21 11:01:53 +01:00 committed by GitHub
commit d0b6a10c00
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 3423 additions and 210 deletions

View File

@ -238,6 +238,15 @@
together with <option>--directory=</option>, <option>--template=</option>.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--oci-bundle=</option></term>
<listitem><para>Takes the path to an OCI runtime bundle to invoke, as specified in the <ulink
url="https://github.com/opencontainers/runtime-spec/blob/master/spec.md">OCI Runtime Specification</ulink>. In
this case no <filename>.nspawn</filename> file is loaded, and the root directory and various settings are read
from the OCI runtime JSON data (but data passed on the command line takes precedence).</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--root-hash=</option></term>
@ -952,6 +961,16 @@
make them read-only, using <option>--bind-ro=</option>.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--inaccessible=</option></term>
<listitem><para>Make the specified path inaccessible in the container. This over-mounts the specified path
(which must exist in the container) with a file node of the same type that is empty and has the most
restrictive access mode supported. This is an effective way to mask files, directories and other file system
objects from the container payload. This option may be used more than once, in which case all specified paths are
masked.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--tmpfs=</option></term>
@ -1084,6 +1103,42 @@
same as the one reported on the host.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--console=</option><replaceable>MODE</replaceable></term>
<listitem><para>Configures how to set up standard input, output and error output for the container payload, as
well as the <filename>/dev/console</filename> device for the container. Takes one of
<option>interactive</option>, <option>read-only</option>, <option>passive</option> or <option>pipe</option>. If
<option>interactive</option> a pseudo-TTY is allocated and made available as <filename>/dev/console</filename>
in the container. It is then bi-directionally connected to the standard input and output passed to
<command>systemd-nspawn</command>. <option>read-only</option> is similar but only the output of the container
is propagated and no input from the caller is read. In <option>passive</option> mode a pseudo TTY is allocated,
but it is not connected anywhere. Finally, in <option>pipe</option> mode no pseudo TTY is allocated, but the
passed standard input, output and error output file descriptors are passed on — as they are — to the container
payload. In this mode <filename>/dev/console</filename> will not exist in the container. Note that in this mode
the container payload generally cannot be a full init system as init systems tend to require
<filename>/dev/console</filename> to be available. On the other hand, in this mode container invocations can be
used within shell pipelines. This is because intermediary pseudo TTYs do not permit independent bidirectional
propagation of the end-of-file (EOF) condition, which is necessary for shell pipelines to work
correctly.</para>
<para>Note that the <option>pipe</option> mode should be used carefully, as passing arbitrary file descriptors
to less trusted container payloads might open up unwanted interfaces for access by the container payload. For
example, if a passed file descriptor refers to a TTY of some form, APIs such as <constant>TIOCSTI</constant>
may be used to synthesize input that might be used for escaping the container. Hence <option>pipe</option> mode
should only be used if the payload is sufficiently trusted or when the standard input/output/error output file
descriptors are known safe, for example pipes. Defaults to <option>interactive</option> if
<command>systemd-nspawn</command> is invoked from a terminal, and <option>read-only</option>
otherwise.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--pipe</option></term>
<term><option>-P</option></term>
<listitem><para>Equivalent to <option>--console=pipe</option>.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>-q</option></term>
<term><option>--quiet</option></term>

View File

@ -425,6 +425,17 @@
is privileged (see above).</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>Inaccessible=</varname></term>
<listitem><para>Masks the specified file or directory in the container, by over-mounting it with an empty file
node of the same type with the most restrictive access mode. Takes a file system path as argument. This option
may be used multiple times to mask multiple files or directories. This option is equivalent to the command line
switch <option>--inaccessible=</option>, see
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> for details
about the specific options supported. This setting is privileged (see above).</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>Overlay=</varname></term>
<term><varname>OverlayReadOnly=</varname></term>

View File

@ -47,6 +47,13 @@ unsigned long cap_last_cap(void) {
if (r >= 0) {
r = safe_atolu(content, &p);
if (r >= 0) {
if (p > 63) /* Safety for the future: if one day the kernel learns more than 64 caps,
* then we are in trouble (since we, like much of userspace and kernel space,
* store capability masks in uint64_t types). Let's hence protect
* ourselves against that and always cap at 63 for now. */
p = 63;
saved = p;
valid = true;
return p;
@ -58,17 +65,15 @@ unsigned long cap_last_cap(void) {
if (prctl(PR_CAPBSET_READ, p) < 0) {
/* Hmm, look downwards, until we find one that
* works */
/* Hmm, look downwards, until we find one that works */
for (p--; p > 0; p --)
if (prctl(PR_CAPBSET_READ, p) >= 0)
break;
} else {
/* Hmm, look upwards, until we find one that doesn't
* work */
for (;; p++)
/* Hmm, look upwards, until we find one that doesn't work */
for (; p < 63; p++)
if (prctl(PR_CAPBSET_READ, p+1) < 0)
break;
}
@ -363,6 +368,7 @@ bool ambient_capabilities_supported(void) {
int capability_quintet_enforce(const CapabilityQuintet *q) {
_cleanup_cap_free_ cap_t c = NULL;
bool need_set_proc_again = false;
int r;
if (q->ambient != (uint64_t) -1) {
@ -393,7 +399,6 @@ int capability_quintet_enforce(const CapabilityQuintet *q) {
if (cap_set_flag(c, CAP_INHERITABLE, 1, &cv, CAP_SET) < 0)
return -errno;
if (cap_set_flag(c, CAP_PERMITTED, 1, &cv, CAP_SET) < 0)
return -errno;
@ -426,8 +431,15 @@ int capability_quintet_enforce(const CapabilityQuintet *q) {
if (q->inheritable != (uint64_t) -1) {
cap_flag_value_t old_value, new_value;
if (cap_get_flag(c, cv, CAP_INHERITABLE, &old_value) < 0)
if (cap_get_flag(c, cv, CAP_INHERITABLE, &old_value) < 0) {
if (errno == EINVAL) /* If the kernel knows more caps than this
* version of libcap, then this will return
* EINVAL. In that case, simply ignore it,
* pretend it doesn't exist. */
continue;
return -errno;
}
new_value = (q->inheritable & m) ? CAP_SET : CAP_CLEAR;
@ -442,8 +454,12 @@ int capability_quintet_enforce(const CapabilityQuintet *q) {
if (q->permitted != (uint64_t) -1) {
cap_flag_value_t old_value, new_value;
if (cap_get_flag(c, cv, CAP_PERMITTED, &old_value) < 0)
if (cap_get_flag(c, cv, CAP_PERMITTED, &old_value) < 0) {
if (errno == EINVAL)
continue;
return -errno;
}
new_value = (q->permitted & m) ? CAP_SET : CAP_CLEAR;
@ -458,8 +474,12 @@ int capability_quintet_enforce(const CapabilityQuintet *q) {
if (q->effective != (uint64_t) -1) {
cap_flag_value_t old_value, new_value;
if (cap_get_flag(c, cv, CAP_EFFECTIVE, &old_value) < 0)
if (cap_get_flag(c, cv, CAP_EFFECTIVE, &old_value) < 0) {
if (errno == EINVAL)
continue;
return -errno;
}
new_value = (q->effective & m) ? CAP_SET : CAP_CLEAR;
@ -472,9 +492,39 @@ int capability_quintet_enforce(const CapabilityQuintet *q) {
}
}
if (changed)
if (cap_set_proc(c) < 0)
if (changed) {
_cleanup_cap_free_ cap_t modified = NULL;
/* In order to change the bounding caps, we need to keep CAP_SETPCAP for a bit
* longer. Let's add it to our list hence for now. */
if (q->bounding != (uint64_t) -1) {
cap_value_t cv = CAP_SETPCAP;
modified = cap_dup(c);
if (!modified)
return -ENOMEM;
if (cap_set_flag(modified, CAP_PERMITTED, 1, &cv, CAP_SET) < 0)
return -errno;
if (cap_set_flag(modified, CAP_EFFECTIVE, 1, &cv, CAP_SET) < 0)
return -errno;
if (cap_compare(modified, c) == 0) {
/* No change? then drop this nonsense again */
cap_free(modified);
modified = NULL;
}
}
/* Now, let's enforce the caps for the first time. Note that this is where we acquire
* caps in any of the sets we currently don't have. We have to do this before
* dropping the bounding caps below, since at that point we can never acquire new
* caps in inherited/permitted/effective anymore, but only lose them.*/
if (cap_set_proc(modified ?: c) < 0)
return -errno;
need_set_proc_again = !!modified;
}
}
if (q->bounding != (uint64_t) -1) {
@ -483,5 +533,13 @@ int capability_quintet_enforce(const CapabilityQuintet *q) {
return r;
}
/* If needed, let's now set the caps again, this time in the final version, which differs from what
* we have already set only in the CAP_SETPCAP bit, which we needed for dropping the bounding
* bits. This call only undoes bits and doesn't acquire any which means the bounding caps don't
* matter. */
if (need_set_proc_again)
if (cap_set_proc(c) < 0)
return -errno;
return 0;
}

View File

@ -33,10 +33,12 @@ static inline void cap_free_charpp(char **p) {
}
#define _cleanup_cap_free_charp_ _cleanup_(cap_free_charpp)
/* Returns a 64-bit mask in which bits 0..cap_last_cap() (inclusive) are set and all higher bits are
 * clear. NOTE: the shift count is only well-defined while cap_last_cap() does not exceed 63. */
static inline uint64_t all_capabilities(void) {
        unsigned long unused_high_bits = 63 - cap_last_cap();

        return UINT64_MAX >> unused_high_bits;
}
static inline bool cap_test_all(uint64_t caps) {
uint64_t m;
m = (UINT64_C(1) << (cap_last_cap() + 1)) - 1;
return FLAGS_SET(caps, m);
return FLAGS_SET(caps, all_capabilities());
}
bool ambient_capabilities_supported(void);

View File

@ -10,6 +10,8 @@ libnspawn_core_sources = files('''
nspawn-mount.h
nspawn-network.c
nspawn-network.h
nspawn-oci.c
nspawn-oci.h
nspawn-patch-uid.c
nspawn-patch-uid.h
nspawn-register.c

View File

@ -263,7 +263,7 @@ static int mount_legacy_cgroup_hierarchy(
if (r > 0)
return 0;
mkdir_p(to, 0755);
(void) mkdir_p(to, 0755);
/* The superblock mount options of the mount point need to be
* identical to the hosts', and hence writable... */

View File

@ -62,6 +62,7 @@ Files.Volatile, config_parse_volatile_mode, 0, of
Files.Bind, config_parse_bind, 0, 0
Files.BindReadOnly, config_parse_bind, 1, 0
Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0
Files.Inaccessible, config_parse_inaccessible, 0, 0
Files.Overlay, config_parse_overlay, 0, 0
Files.OverlayReadOnly, config_parse_overlay, 1, 0
Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown)

View File

@ -65,6 +65,7 @@ void custom_mount_free_all(CustomMount *l, size_t n) {
}
strv_free(m->lower);
free(m->type_argument);
}
free(l);
@ -116,32 +117,40 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
for (i = 0; i < n; i++) {
CustomMount *m = l + i;
if (m->source) {
char *s;
/* /proc we mount in the inner child, i.e. when we acquired CLONE_NEWPID. All other mounts we mount
* already in the outer child, so that the mounts are already established before CLONE_NEWPID and in
* particular CLONE_NEWUSER. This also means any custom mounts below /proc also need to be mounted in
* the inner child, not the outer one. Determine this here. */
m->in_userns = path_startswith(m->destination, "/proc");
s = resolve_source_path(dest, m->source);
if (!s)
return log_oom();
if (m->type == CUSTOM_MOUNT_BIND) {
if (m->source) {
char *s;
free_and_replace(m->source, s);
} else {
/* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
s = resolve_source_path(dest, m->source);
if (!s)
return log_oom();
m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
if (!m->rm_rf_tmpdir)
return log_oom();
free_and_replace(m->source, s);
} else {
/* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
if (!mkdtemp(m->rm_rf_tmpdir)) {
m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
return log_error_errno(errno, "Failed to acquire temporary directory: %m");
m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
if (!m->rm_rf_tmpdir)
return log_oom();
if (!mkdtemp(m->rm_rf_tmpdir)) {
m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
return log_error_errno(errno, "Failed to acquire temporary directory: %m");
}
m->source = strjoin(m->rm_rf_tmpdir, "/src");
if (!m->source)
return log_oom();
if (mkdir(m->source, 0755) < 0)
return log_error_errno(errno, "Failed to create %s: %m", m->source);
}
m->source = strjoin(m->rm_rf_tmpdir, "/src");
if (!m->source)
return log_oom();
if (mkdir(m->source, 0755) < 0)
return log_error_errno(errno, "Failed to create %s: %m", m->source);
}
if (m->type == CUSTOM_MOUNT_OVERLAY) {
@ -223,6 +232,7 @@ int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only)
m->destination = TAKE_PTR(destination);
m->read_only = read_only;
m->options = TAKE_PTR(opts);
return 0;
}
@ -327,6 +337,29 @@ int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_onl
return 0;
}
/* Appends a CUSTOM_MOUNT_INACCESSIBLE entry for the path 's' to the custom mount array (*l, *n).
 *
 * Returns 0 on success, -EINVAL if 's' is not an absolute path, and -ENOMEM if either the path copy or
 * the new array entry cannot be allocated. On success the new entry owns a private copy of 's'. */
int inaccessible_mount_parse(CustomMount **l, size_t *n, const char *s) {
        _cleanup_free_ char *destination = NULL;
        CustomMount *entry;

        assert(l);
        assert(n);
        assert(s);

        /* Relative paths are refused outright */
        if (!path_is_absolute(s))
                return -EINVAL;

        destination = strdup(s);
        if (!destination)
                return -ENOMEM;

        entry = custom_mount_add(l, n, CUSTOM_MOUNT_INACCESSIBLE);
        if (!entry)
                return -ENOMEM;

        /* Hand ownership of the copy over to the entry, so that the cleanup handler leaves it alone */
        entry->destination = TAKE_PTR(destination);

        return 0;
}
int tmpfs_patch_options(
const char *options,
uid_t uid_shift,
@ -494,9 +527,9 @@ int mount_all(const char *dest,
uid_t uid_shift,
const char *selinux_apifs_context) {
#define PROC_INACCESSIBLE(path) \
{ NULL, (path), NULL, NULL, MS_BIND, \
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \
#define PROC_INACCESSIBLE_REG(path) \
{ "/run/systemd/inaccessible/reg", (path), NULL, NULL, MS_BIND, \
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
{ NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
@ -531,11 +564,11 @@ int mount_all(const char *dest,
/* Make these files inaccessible to container payloads: they potentially leak information about kernel
* internals or the host's execution environment to the container */
PROC_INACCESSIBLE("/proc/kallsyms"),
PROC_INACCESSIBLE("/proc/kcore"),
PROC_INACCESSIBLE("/proc/keys"),
PROC_INACCESSIBLE("/proc/sysrq-trigger"),
PROC_INACCESSIBLE("/proc/timer_list"),
PROC_INACCESSIBLE_REG("/proc/kallsyms"),
PROC_INACCESSIBLE_REG("/proc/kcore"),
PROC_INACCESSIBLE_REG("/proc/keys"),
PROC_INACCESSIBLE_REG("/proc/sysrq-trigger"),
PROC_INACCESSIBLE_REG("/proc/timer_list"),
/* Make these directories read-only to container payloads: they show hardware information, and in some
* cases contain tunables the container really shouldn't have access to. */
@ -573,7 +606,6 @@ int mount_all(const char *dest,
#endif
};
_cleanup_(unlink_and_freep) char *inaccessible = NULL;
bool use_userns = (mount_settings & MOUNT_USE_USERNS);
bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
@ -584,7 +616,7 @@ int mount_all(const char *dest,
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
_cleanup_free_ char *where = NULL, *options = NULL;
const char *o, *what;
const char *o;
bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
@ -603,33 +635,14 @@ int mount_all(const char *dest,
if (r < 0)
return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
if (mount_table[k].mount_settings & MOUNT_INACCESSIBLE_REG) {
if (!inaccessible) {
_cleanup_free_ char *np = NULL;
r = tempfn_random_child(NULL, "inaccessible", &np);
if (r < 0)
return log_error_errno(r, "Failed to generate inaccessible file node path: %m");
r = touch_file(np, false, USEC_INFINITY, UID_INVALID, GID_INVALID, 0000);
if (r < 0)
return log_error_errno(r, "Failed to create inaccessible file node '%s': %m", np);
inaccessible = TAKE_PTR(np);
}
what = inaccessible;
} else
what = mount_table[k].what;
r = path_is_mount_point(where, NULL, 0);
if (r < 0 && r != -ENOENT)
return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
/* Skip this entry if it is not a remount. */
if (what && r > 0)
continue;
if (mount_table[k].what) {
r = path_is_mount_point(where, NULL, 0);
if (r < 0 && r != -ENOENT)
return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
if (r > 0)
continue;
}
r = mkdir_userns_p(dest, where, 0755, (use_userns && !in_userns) ? uid_shift : UID_INVALID);
if (r < 0 && r != -EEXIST) {
@ -654,7 +667,7 @@ int mount_all(const char *dest,
}
r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
what,
mount_table[k].what,
where,
mount_table[k].type,
mount_table[k].flags,
@ -667,7 +680,6 @@ int mount_all(const char *dest,
}
static int mount_bind(const char *dest, CustomMount *m) {
_cleanup_free_ char *where = NULL;
struct stat source_st, dest_st;
int r;
@ -711,7 +723,6 @@ static int mount_bind(const char *dest, CustomMount *m) {
r = touch(where);
if (r < 0)
return log_error_errno(r, "Failed to create mount point %s: %m", where);
}
r = mount_verbose(LOG_ERR, m->source, where, NULL, MS_BIND | MS_REC, m->options);
@ -773,7 +784,6 @@ static char *joined_and_escaped_lower_dirs(char **lower) {
}
static int mount_overlay(const char *dest, CustomMount *m) {
_cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
const char *options;
int r;
@ -815,11 +825,59 @@ static int mount_overlay(const char *dest, CustomMount *m) {
return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
}
/* Over-mounts m->destination (resolved below 'dest') with the inaccessible node matching the file type
 * of the destination, then remounts that bind mount read-only.
 *
 * If m->graceful is set, failures are logged at debug level only and 0 is returned; otherwise they are
 * logged as errors and the negative error code is returned. */
static int mount_inaccessible(const char *dest, CustomMount *m) {
        _cleanup_free_ char *target = NULL;
        const char *node;
        struct stat target_st;
        int level, r;

        assert(dest);
        assert(m);

        /* Pick the log level once, it is the same for every step below */
        level = m->graceful ? LOG_DEBUG : LOG_ERR;

        r = chase_symlinks_and_stat(m->destination, dest, CHASE_PREFIX_ROOT, &target, &target_st);
        if (r < 0) {
                log_full_errno(level, r, "Failed to resolve %s/%s: %m", dest, m->destination);
                return m->graceful ? 0 : r;
        }

        /* The node to bind mount is selected by the file type of the path being masked */
        assert_se(node = mode_to_inaccessible_node(target_st.st_mode));

        /* First the bind mount itself, and only if that worked, the read-only remount */
        r = mount_verbose(level, node, target, NULL, MS_BIND, NULL);
        if (r >= 0)
                r = mount_verbose(level, NULL, target, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, NULL);

        if (r < 0 && !m->graceful)
                return r;

        return 0;
}
/* Mounts m->source on m->destination (resolved below 'dest'), passing m->type_argument as file system
 * type, no mount flags, and m->options as mount options. If the destination is reported as not yet
 * existing, it is created as a directory (mode 0755) first. Errors are logged and returned. */
static int mount_arbitrary(const char *dest, CustomMount *m) {
        _cleanup_free_ char *target = NULL;
        int r;

        assert(dest);
        assert(m);

        r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &target);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);

        if (r == 0) {
                /* A zero return is taken to mean the mount point is missing, hence create it */
                int q;

                q = mkdir_p_label(target, 0755);
                if (q < 0)
                        return log_error_errno(q, "Creating mount point for mount %s failed: %m", target);
        }

        return mount_verbose(LOG_ERR, m->source, target, m->type_argument, 0, m->options);
}
int mount_custom(
const char *dest,
CustomMount *mounts, size_t n,
bool userns, uid_t uid_shift, uid_t uid_range,
const char *selinux_apifs_context) {
const char *selinux_apifs_context,
bool in_userns) {
size_t i;
int r;
@ -829,6 +887,9 @@ int mount_custom(
for (i = 0; i < n; i++) {
CustomMount *m = mounts + i;
if (m->in_userns != in_userns)
continue;
switch (m->type) {
case CUSTOM_MOUNT_BIND:
@ -843,6 +904,14 @@ int mount_custom(
r = mount_overlay(dest, m);
break;
case CUSTOM_MOUNT_INACCESSIBLE:
r = mount_inaccessible(dest, m);
break;
case CUSTOM_MOUNT_ARBITRARY:
r = mount_arbitrary(dest, m);
break;
default:
assert_not_reached("Unknown custom mount type");
}

View File

@ -13,14 +13,15 @@ typedef enum MountSettingsMask {
MOUNT_APPLY_APIVFS_RO = 1 << 3, /* if set, /proc/sys, and /sys will be mounted read-only, otherwise read-write. */
MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write.
Works only if MOUNT_APPLY_APIVFS_RO is also set. */
MOUNT_INACCESSIBLE_REG = 1 << 5, /* if set, create an inaccessible regular file first and use as bind mount source */
MOUNT_APPLY_TMPFS_TMP = 1 << 6, /* if set, /tmp will be mounted as tmpfs */
MOUNT_APPLY_TMPFS_TMP = 1 << 5, /* if set, /tmp will be mounted as tmpfs */
} MountSettingsMask;
typedef enum CustomMountType {
CUSTOM_MOUNT_BIND,
CUSTOM_MOUNT_TMPFS,
CUSTOM_MOUNT_OVERLAY,
CUSTOM_MOUNT_INACCESSIBLE,
CUSTOM_MOUNT_ARBITRARY,
_CUSTOM_MOUNT_TYPE_MAX,
_CUSTOM_MOUNT_TYPE_INVALID = -1
} CustomMountType;
@ -34,6 +35,9 @@ typedef struct CustomMount {
char *work_dir;
char **lower;
char *rm_rf_tmpdir;
char *type_argument; /* only for CUSTOM_MOUNT_ARBITRARY */
bool graceful;
bool in_userns;
} CustomMount;
CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t);
@ -43,11 +47,12 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n);
int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);
int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s);
int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);
int inaccessible_mount_parse(CustomMount **l, size_t *n, const char *s);
int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, const char *selinux_apifs_context);
int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool in_userns);
int setup_volatile_mode(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);

2352
src/nspawn/nspawn-oci.c Normal file

File diff suppressed because it is too large Load Diff

6
src/nspawn/nspawn-oci.h Normal file
View File

@ -0,0 +1,6 @@
/* SPDX-License-Identifier: LGPL-2.1+ */
#pragma once
#include "nspawn-settings.h"
int oci_load(FILE *f, const char *path, Settings **ret);

View File

@ -112,6 +112,7 @@ int register_machine(
unsigned n_mounts,
int kill_signal,
char **properties,
sd_bus_message *properties_message,
bool keep_unit,
const char *service) {
@ -185,6 +186,12 @@ int register_machine(
if (r < 0)
return r;
if (properties_message) {
r = sd_bus_message_copy(m, properties_message, true);
if (r < 0)
return bus_log_create_error(r);
}
r = bus_append_unit_property_assignment_many(m, UNIT_SERVICE, properties);
if (r < 0)
return r;
@ -235,7 +242,8 @@ int allocate_scope(
CustomMount *mounts,
unsigned n_mounts,
int kill_signal,
char **properties) {
char **properties,
sd_bus_message *properties_message) {
_cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL;
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
@ -289,6 +297,12 @@ int allocate_scope(
if (r < 0)
return r;
if (properties_message) {
r = sd_bus_message_copy(m, properties_message, true);
if (r < 0)
return bus_log_create_error(r);
}
r = append_machine_properties(
m,
mounts,

View File

@ -7,8 +7,8 @@
#include "nspawn-mount.h"
int register_machine(sd_bus *bus, const char *machine_name, pid_t pid, const char *directory, sd_id128_t uuid, int local_ifindex, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, bool keep_unit, const char *service);
int register_machine(sd_bus *bus, const char *machine_name, pid_t pid, const char *directory, sd_id128_t uuid, int local_ifindex, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, sd_bus_message *properties_message, bool keep_unit, const char *service);
int terminate_machine(sd_bus *bus, const char *machine_name);
int allocate_scope(sd_bus *bus, const char *machine_name, pid_t pid, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties);
int allocate_scope(sd_bus *bus, const char *machine_name, pid_t pid, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, sd_bus_message *properties_message);
int terminate_scope(sd_bus *bus, const char *machine_name);

View File

@ -17,6 +17,50 @@
#include "user-util.h"
#include "util.h"
Settings *settings_new(void) {
Settings *s;
s = new(Settings, 1);
if (!s)
return NULL;
*s = (Settings) {
.start_mode = _START_MODE_INVALID,
.personality = PERSONALITY_INVALID,
.resolv_conf = _RESOLV_CONF_MODE_INVALID,
.link_journal = _LINK_JOURNAL_INVALID,
.timezone = _TIMEZONE_MODE_INVALID,
.userns_mode = _USER_NAMESPACE_MODE_INVALID,
.userns_chown = -1,
.uid_shift = UID_INVALID,
.uid_range = UID_INVALID,
.no_new_privileges = -1,
.read_only = -1,
.volatile_mode = _VOLATILE_MODE_INVALID,
.private_network = -1,
.network_veth = -1,
.full_capabilities = CAPABILITY_QUINTET_NULL,
.uid = UID_INVALID,
.gid = GID_INVALID,
.console_mode = _CONSOLE_MODE_INVALID,
.console_width = (unsigned) -1,
.console_height = (unsigned) -1,
.clone_ns_flags = (unsigned long) -1,
.use_cgns = -1,
};
return s;
}
int settings_load(FILE *f, const char *path, Settings **ret) {
_cleanup_(settings_freep) Settings *s = NULL;
int r;
@ -24,27 +68,10 @@ int settings_load(FILE *f, const char *path, Settings **ret) {
assert(path);
assert(ret);
s = new0(Settings, 1);
s = settings_new();
if (!s)
return -ENOMEM;
s->start_mode = _START_MODE_INVALID;
s->personality = PERSONALITY_INVALID;
s->userns_mode = _USER_NAMESPACE_MODE_INVALID;
s->resolv_conf = _RESOLV_CONF_MODE_INVALID;
s->link_journal = _LINK_JOURNAL_INVALID;
s->timezone = _TIMEZONE_MODE_INVALID;
s->uid_shift = UID_INVALID;
s->uid_range = UID_INVALID;
s->no_new_privileges = -1;
s->read_only = -1;
s->volatile_mode = _VOLATILE_MODE_INVALID;
s->userns_chown = -1;
s->private_network = -1;
s->network_veth = -1;
r = config_parse(NULL, path, f,
"Exec\0"
"Network\0"
@ -66,12 +93,33 @@ int settings_load(FILE *f, const char *path, Settings **ret) {
s->userns_mode = USER_NAMESPACE_NO;
*ret = TAKE_PTR(s);
return 0;
}
Settings* settings_free(Settings *s) {
/* Releases an array of 'n' OCI hooks: the path, argument vector and environment vector of each entry,
 * and finally the array itself. 'h' may be NULL only if 'n' is zero. */
static void free_oci_hooks(OciHook *h, size_t n) {
        size_t idx = 0;

        assert(h || n == 0);

        while (idx < n) {
                OciHook *hook = h + idx++;

                free(hook->path);
                strv_free(hook->args);
                strv_free(hook->env);
        }

        free(h);
}
/* Frees the path of each of the 'n' entries in 'node' and then the array itself. Since the array is
 * released here, callers must not free 'node' again afterwards. */
void device_node_free_many(DeviceNode *node, size_t n) {
        size_t idx = 0;

        while (idx < n) {
                free(node[idx].path);
                idx++;
        }

        free(node);
}
Settings* settings_free(Settings *s) {
if (!s)
return NULL;
@ -96,6 +144,28 @@ Settings* settings_free(Settings *s) {
expose_port_free_all(s->expose_ports);
custom_mount_free_all(s->custom_mounts, s->n_custom_mounts);
free(s->bundle);
free(s->root);
free_oci_hooks(s->oci_hooks_prestart, s->n_oci_hooks_prestart);
free_oci_hooks(s->oci_hooks_poststart, s->n_oci_hooks_poststart);
free_oci_hooks(s->oci_hooks_poststop, s->n_oci_hooks_poststop);
free(s->slice);
sd_bus_message_unref(s->properties);
free(s->supplementary_gids);
device_node_free_many(s->extra_nodes, s->n_extra_nodes);
free(s->extra_nodes);
free(s->network_namespace_path);
strv_free(s->sysctl);
#if HAVE_SECCOMP
seccomp_release(s->seccomp);
#endif
return mfree(s);
}
@ -122,6 +192,26 @@ bool settings_network_veth(Settings *s) {
s->network_zone;
}
/* Makes sure s->properties refers to a bus message object. If one is already set this is a NOP;
 * otherwise a new method call message is created on the default system bus connection. Returns 0 on
 * success and a negative error code if acquiring the bus or creating the message fails. */
int settings_allocate_properties(Settings *s) {
        _cleanup_(sd_bus_unrefp) sd_bus *system_bus = NULL;
        int r;

        assert(s);

        if (s->properties) /* Already allocated earlier */
                return 0;

        r = sd_bus_default_system(&system_bus);
        if (r < 0)
                return r;

        r = sd_bus_message_new(system_bus, &s->properties, SD_BUS_MESSAGE_METHOD_CALL);

        return r < 0 ? r : 0;
}
DEFINE_CONFIG_PARSE_ENUM(config_parse_volatile_mode, volatile_mode, VolatileMode, "Failed to parse volatile mode");
int config_parse_expose_port(
@ -315,6 +405,34 @@ int config_parse_tmpfs(
return 0;
}
/* Configuration parser callback that feeds 'rvalue' to inaccessible_mount_parse() in order to add an
 * entry to the custom mount list of the Settings object passed in 'data'. A value that cannot be
 * parsed is reported via log_syntax() and otherwise ignored: the function returns 0 in all cases. */
int config_parse_inaccessible(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        Settings *settings = data;
        int q;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        q = inaccessible_mount_parse(&settings->custom_mounts, &settings->n_custom_mounts, rvalue);
        if (q < 0) {
                /* Complain, but do not fail the load of the whole file */
                log_syntax(unit, LOG_ERR, filename, line, q, "Invalid inaccessible file system specification %s: %m", rvalue);
        }

        return 0;
}
int config_parse_overlay(
const char *unit,
const char *filename,

View File

@ -4,8 +4,14 @@
#include <sched.h>
#include <stdio.h>
#if HAVE_SECCOMP
#include <seccomp.h>
#endif
#include "sd-bus.h"
#include "sd-id128.h"
#include "capability-util.h"
#include "conf-parser.h"
#include "macro.h"
#include "missing_resource.h"
@ -60,6 +66,15 @@ typedef enum TimezoneMode {
_TIMEZONE_MODE_INVALID = -1
} TimezoneMode;
typedef enum ConsoleMode {
CONSOLE_INTERACTIVE,
CONSOLE_READ_ONLY,
CONSOLE_PASSIVE,
CONSOLE_PIPE,
_CONSOLE_MODE_MAX,
_CONSOLE_MODE_INVALID = -1,
} ConsoleMode;
typedef enum SettingsMask {
SETTING_START_MODE = UINT64_C(1) << 0,
SETTING_ENVIRONMENT = UINT64_C(1) << 1,
@ -86,9 +101,14 @@ typedef enum SettingsMask {
SETTING_LINK_JOURNAL = UINT64_C(1) << 22,
SETTING_TIMEZONE = UINT64_C(1) << 23,
SETTING_EPHEMERAL = UINT64_C(1) << 24,
SETTING_RLIMIT_FIRST = UINT64_C(1) << 25, /* we define one bit per resource limit here */
SETTING_RLIMIT_LAST = UINT64_C(1) << (25 + _RLIMIT_MAX - 1),
_SETTINGS_MASK_ALL = (UINT64_C(1) << (25 + _RLIMIT_MAX)) -1,
SETTING_SLICE = UINT64_C(1) << 25,
SETTING_DIRECTORY = UINT64_C(1) << 26,
SETTING_USE_CGNS = UINT64_C(1) << 27,
SETTING_CLONE_NS_FLAGS = UINT64_C(1) << 28,
SETTING_CONSOLE_MODE = UINT64_C(1) << 29,
SETTING_RLIMIT_FIRST = UINT64_C(1) << 30, /* we define one bit per resource limit here */
SETTING_RLIMIT_LAST = UINT64_C(1) << (30 + _RLIMIT_MAX - 1),
_SETTINGS_MASK_ALL = (UINT64_C(1) << (30 + _RLIMIT_MAX)) -1,
_SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
} SettingsMask;
@ -101,6 +121,22 @@ assert_cc(sizeof(SettingsMask) == 8);
assert_cc(sizeof(SETTING_RLIMIT_FIRST) == 8);
assert_cc(sizeof(SETTING_RLIMIT_LAST) == 8);
/* Description of one device node. Arrays of these are released with device_node_free_many(). */
typedef struct DeviceNode {
char *path; /* heap-allocated, freed by device_node_free_many() */
unsigned major; /* presumably the device major number — TODO confirm */
unsigned minor; /* presumably the device minor number — TODO confirm */
mode_t mode;
uid_t uid;
gid_t gid;
} DeviceNode;
/* One OCI hook entry. Arrays of these are released with free_oci_hooks(). */
typedef struct OciHook {
char *path; /* heap-allocated, freed by free_oci_hooks() */
char **args; /* string vector, released with strv_free() */
char **env; /* string vector, released with strv_free() */
usec_t timeout; /* presumably in microseconds, given the usec_t type — TODO confirm */
} OciHook;
typedef struct Settings {
/* [Run] */
StartMode start_mode;
@ -150,13 +186,39 @@ typedef struct Settings {
char **network_ipvlan;
char **network_veth_extra;
ExposePort *expose_ports;
/* Additional fields, that are specific to OCI runtime case */
char *bundle;
char *root;
OciHook *oci_hooks_prestart, *oci_hooks_poststart, *oci_hooks_poststop;
size_t n_oci_hooks_prestart, n_oci_hooks_poststart, n_oci_hooks_poststop;
char *slice;
sd_bus_message *properties;
CapabilityQuintet full_capabilities;
uid_t uid;
gid_t gid;
gid_t *supplementary_gids;
size_t n_supplementary_gids;
unsigned console_width, console_height;
ConsoleMode console_mode;
DeviceNode *extra_nodes;
size_t n_extra_nodes;
unsigned long clone_ns_flags;
char *network_namespace_path;
int use_cgns;
char **sysctl;
#if HAVE_SECCOMP
scmp_filter_ctx seccomp;
#endif
} Settings;
Settings *settings_new(void);
int settings_load(FILE *f, const char *path, Settings **ret);
Settings* settings_free(Settings *s);
bool settings_network_veth(Settings *s);
bool settings_private_network(Settings *s);
int settings_allocate_properties(Settings *s);
DEFINE_TRIVIAL_CLEANUP_FUNC(Settings*, settings_free);
@ -170,6 +232,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_pivot_root);
CONFIG_PARSER_PROTOTYPE(config_parse_bind);
CONFIG_PARSER_PROTOTYPE(config_parse_tmpfs);
CONFIG_PARSER_PROTOTYPE(config_parse_overlay);
CONFIG_PARSER_PROTOTYPE(config_parse_inaccessible);
CONFIG_PARSER_PROTOTYPE(config_parse_veth_extra);
CONFIG_PARSER_PROTOTYPE(config_parse_network_zone);
CONFIG_PARSER_PROTOTYPE(config_parse_boot);
@ -190,3 +253,5 @@ const char *timezone_mode_to_string(TimezoneMode a) _const_;
TimezoneMode timezone_mode_from_string(const char *s) _pure_;
int parse_link_journal(const char *s, LinkJournal *ret_mode, bool *ret_try);
void device_node_free_many(DeviceNode *node, size_t n);

View File

@ -59,14 +59,41 @@ static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
return pipe_fds[0];
}
/* Switches the calling process to the specified user and group.
 *
 * An invalid 'uid' or 'gid' is replaced by 0. Ownership of stdin, stdout and stderr is changed to the
 * resulting IDs on a best-effort basis (failures are ignored). Then the supplementary group list, the
 * real/effective/saved GID and the real/effective/saved UID are set, in this order.
 *
 * Returns 0 on success; on failure the error is logged and returned as negative errno. */
int change_uid_gid_raw(
                uid_t uid,
                gid_t gid,
                const gid_t *supplementary_gids,
                size_t n_supplementary_gids) {

        static const int stdio_fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };
        size_t i;

        if (!uid_is_valid(uid))
                uid = 0;
        if (!gid_is_valid(gid))
                gid = 0;

        /* Best effort only, the return value is deliberately ignored */
        for (i = 0; i < sizeof(stdio_fds) / sizeof(stdio_fds[0]); i++)
                (void) fchown(stdio_fds[i], uid, gid);

        /* Groups have to be changed before the UID is dropped */
        if (setgroups(n_supplementary_gids, supplementary_gids) < 0)
                return log_error_errno(errno, "Failed to set auxiliary groups: %m");

        if (setresgid(gid, gid, gid) < 0)
                return log_error_errno(errno, "setresgid() failed: %m");

        if (setresuid(uid, uid, uid) < 0)
                return log_error_errno(errno, "setresuid() failed: %m");

        return 0;
}
int change_uid_gid(const char *user, char **_home) {
char *x, *u, *g, *h;
const char *word, *state;
_cleanup_free_ uid_t *uids = NULL;
_cleanup_free_ gid_t *gids = NULL;
_cleanup_free_ char *home = NULL, *line = NULL;
_cleanup_fclose_ FILE *f = NULL;
_cleanup_close_ int fd = -1;
unsigned n_uids = 0;
unsigned n_gids = 0;
size_t sz = 0, l;
uid_t uid;
gid_t gid;
@ -189,10 +216,10 @@ int change_uid_gid(const char *user, char **_home) {
memcpy(c, word, l);
c[l] = 0;
if (!GREEDY_REALLOC(uids, sz, n_uids+1))
if (!GREEDY_REALLOC(gids, sz, n_gids+1))
return log_oom();
r = parse_uid(c, &uids[n_uids++]);
r = parse_gid(c, &gids[n_gids++]);
if (r < 0)
return log_error_errno(r, "Failed to parse group data from getent: %m");
}
@ -205,18 +232,9 @@ int change_uid_gid(const char *user, char **_home) {
if (r < 0 && !IN_SET(r, -EEXIST, -ENOTDIR))
return log_error_errno(r, "Failed to make home directory: %m");
(void) fchown(STDIN_FILENO, uid, gid);
(void) fchown(STDOUT_FILENO, uid, gid);
(void) fchown(STDERR_FILENO, uid, gid);
if (setgroups(n_uids, uids) < 0)
return log_error_errno(errno, "Failed to set auxiliary groups: %m");
if (setresgid(gid, gid, gid) < 0)
return log_error_errno(errno, "setresgid() failed: %m");
if (setresuid(uid, uid, uid) < 0)
return log_error_errno(errno, "setresuid() failed: %m");
r = change_uid_gid_raw(uid, gid, gids, n_gids);
if (r < 0)
return r;
if (_home)
*_home = TAKE_PTR(home);

View File

@ -1,4 +1,5 @@
/* SPDX-License-Identifier: LGPL-2.1+ */
#pragma once
int change_uid_gid(const char *user, char **ret);
int change_uid_gid_raw(uid_t uid, gid_t gid, const gid_t *supplementary_gids, size_t n_supplementary_gids);
int change_uid_gid(const char *user, char **ret_home);

File diff suppressed because it is too large Load Diff