de40a3037a
This is a pretty large patch, and adds support for OCI runtime bundles to nspawn. A new switch --oci-bundle= is added that takes a path to an OCI bundle. The JSON file included therein is read similarly to a .nspawn settings file, however with a different feature set. Implementation-wise this mostly extends the pre-existing Settings object to carry additional properties for OCI. However, OCI supports some concepts .nspawn files did not support yet, which this patch also adds: 1. Support for "masking" files and directories. This functionality is now also available via the new --inaccessible= cmdline switch, and Inaccessible= in .nspawn files. 2. Support for mounting arbitrary file systems. (not exposed through nspawn cmdline nor .nspawn files, because probably not a good idea) 3. Ability to configure the console settings for a container. This functionality is now also available on the nspawn cmdline in the new --console= switch (not added to .nspawn for now, as it is something specific to the invocation really, not a property of the container) 4. Console width/height configuration. Not exposed through .nspawn/cmdline, but this may be controlled through $COLUMNS and $LINES like in most other UNIX tools. 5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on the cmdline, since containers likely have different user tables, and the existing --user= switch appears to be the better option) 6. OCI hook commands (not exposed in .nspawn/cmdline, as very specific to OCI) 7. Creation of additional device nodes in /dev. Most likely not a good idea, hence not exposed in .nspawn/cmdline. There's already --bind= to achieve the same, which is the better alternative. 8. Explicit syscall filters. This is not a good idea, due to the skewed arch support, hence not exposed through .nspawn/cmdline. 9. Configuration of some sysctls on a whitelist. Questionable, not supported in .nspawn/cmdline for now. 10. Configuration of all 5 types of capabilities.
Not a useful concept, since the kernel will reduce the caps on execve() anyway. Hence not exposed through .nspawn/cmdline, as this is not very useful. Note that this only implements the OCI runtime logic itself. It does not provide a runc-compatible command line tool. This is left for a later PR. Only with that in place tools such as "buildah" can use the OCI support in nspawn as drop-in replacement. Currently still missing is OCI hook support, but it's already parsed and everything, and should be easy to add. Other than that, OCI is implemented pretty comprehensively. There's a list of incompatibilities in the nspawn-oci.c file. In a later PR I'd like to convert this into proper markdown and add it to the documentation directory.
258 lines
8.4 KiB
C
258 lines
8.4 KiB
C
/* SPDX-License-Identifier: LGPL-2.1+ */
|
|
#pragma once
|
|
|
|
#include <sched.h>
|
|
#include <stdio.h>
|
|
|
|
#if HAVE_SECCOMP
|
|
#include <seccomp.h>
|
|
#endif
|
|
|
|
#include "sd-bus.h"
|
|
#include "sd-id128.h"
|
|
|
|
#include "capability-util.h"
|
|
#include "conf-parser.h"
|
|
#include "macro.h"
|
|
#include "missing_resource.h"
|
|
#include "nspawn-expose-ports.h"
|
|
#include "nspawn-mount.h"
|
|
|
|
/* Selects how the payload is started inside the container. */
typedef enum StartMode {
        START_PID1, /* Run parameters as command line as process 1 */
        START_PID2, /* Use stub init process as PID 1, run parameters as command line as process 2 */
        START_BOOT, /* Search for init system, pass arguments as parameters */
        _START_MODE_MAX,          /* Sentinel: follows the last real value, hence equals their count */
        _START_MODE_INVALID = -1  /* Sentinel: not a valid start mode */
} StartMode;
|
|
|
|
/* User namespace mode of the container, stored in Settings.userns_mode.
 * NOTE(review): judging by the names, FIXED uses an explicitly configured UID range and PICK
 * selects one automatically -- confirm against the users of this enum. */
typedef enum UserNamespaceMode {
        USER_NAMESPACE_NO,
        USER_NAMESPACE_FIXED,
        USER_NAMESPACE_PICK,
        _USER_NAMESPACE_MODE_MAX,           /* Sentinel: follows the last real value, hence equals their count */
        _USER_NAMESPACE_MODE_INVALID = -1,  /* Sentinel: not a valid mode */
} UserNamespaceMode;
|
|
|
|
/* resolv.conf handling mode, stored in Settings.resolv_conf. Converted to and from strings by
 * resolv_conf_mode_to_string()/resolv_conf_mode_from_string().
 * NOTE(review): the value names suggest copy/bind-mount/delete strategies with either the host's
 * or a static file as source -- confirm the exact semantics where the mode is consumed. */
typedef enum ResolvConfMode {
        RESOLV_CONF_OFF,
        RESOLV_CONF_COPY_HOST,
        RESOLV_CONF_COPY_STATIC,
        RESOLV_CONF_BIND_HOST,
        RESOLV_CONF_BIND_STATIC,
        RESOLV_CONF_DELETE,
        RESOLV_CONF_AUTO,
        _RESOLV_CONF_MODE_MAX,          /* Sentinel: follows the last real value, hence equals their count */
        _RESOLV_CONF_MODE_INVALID = -1  /* Sentinel: not a valid mode */
} ResolvConfMode;
|
|
|
|
/* Journal linking mode, stored in Settings.link_journal and produced by parse_link_journal().
 * NOTE(review): HOST/GUEST presumably name the side that holds the journal data -- confirm
 * where the mode is consumed. */
typedef enum LinkJournal {
        LINK_NO,
        LINK_AUTO,
        LINK_HOST,
        LINK_GUEST,
        _LINK_JOURNAL_MAX,          /* Sentinel: follows the last real value, hence equals their count */
        _LINK_JOURNAL_INVALID = -1  /* Sentinel: not a valid mode */
} LinkJournal;
|
|
|
|
/* Timezone handling mode, stored in Settings.timezone. Converted to and from strings by
 * timezone_mode_to_string()/timezone_mode_from_string().
 * NOTE(review): the value names suggest copy/bind-mount/symlink/delete strategies -- confirm the
 * exact semantics where the mode is consumed. */
typedef enum TimezoneMode {
        TIMEZONE_OFF,
        TIMEZONE_COPY,
        TIMEZONE_BIND,
        TIMEZONE_SYMLINK,
        TIMEZONE_DELETE,
        TIMEZONE_AUTO,
        _TIMEZONE_MODE_MAX,          /* Sentinel: follows the last real value, hence equals their count */
        _TIMEZONE_MODE_INVALID = -1  /* Sentinel: not a valid mode */
} TimezoneMode;
|
|
|
|
/* Console setup of the container, stored in Settings.console_mode.
 * NOTE(review): the names suggest an interactive TTY, a read-only TTY, a passively allocated TTY
 * and plain pipes respectively -- confirm where the mode is consumed. */
typedef enum ConsoleMode {
        CONSOLE_INTERACTIVE,
        CONSOLE_READ_ONLY,
        CONSOLE_PASSIVE,
        CONSOLE_PIPE,
        _CONSOLE_MODE_MAX,           /* Sentinel: follows the last real value, hence equals their count */
        _CONSOLE_MODE_INVALID = -1,  /* Sentinel: not a valid mode */
} ConsoleMode;
|
|
|
|
typedef enum SettingsMask {
|
|
SETTING_START_MODE = UINT64_C(1) << 0,
|
|
SETTING_ENVIRONMENT = UINT64_C(1) << 1,
|
|
SETTING_USER = UINT64_C(1) << 2,
|
|
SETTING_CAPABILITY = UINT64_C(1) << 3,
|
|
SETTING_KILL_SIGNAL = UINT64_C(1) << 4,
|
|
SETTING_PERSONALITY = UINT64_C(1) << 5,
|
|
SETTING_MACHINE_ID = UINT64_C(1) << 6,
|
|
SETTING_NETWORK = UINT64_C(1) << 7,
|
|
SETTING_EXPOSE_PORTS = UINT64_C(1) << 8,
|
|
SETTING_READ_ONLY = UINT64_C(1) << 9,
|
|
SETTING_VOLATILE_MODE = UINT64_C(1) << 10,
|
|
SETTING_CUSTOM_MOUNTS = UINT64_C(1) << 11,
|
|
SETTING_WORKING_DIRECTORY = UINT64_C(1) << 12,
|
|
SETTING_USERNS = UINT64_C(1) << 13,
|
|
SETTING_NOTIFY_READY = UINT64_C(1) << 14,
|
|
SETTING_PIVOT_ROOT = UINT64_C(1) << 15,
|
|
SETTING_SYSCALL_FILTER = UINT64_C(1) << 16,
|
|
SETTING_HOSTNAME = UINT64_C(1) << 17,
|
|
SETTING_NO_NEW_PRIVILEGES = UINT64_C(1) << 18,
|
|
SETTING_OOM_SCORE_ADJUST = UINT64_C(1) << 19,
|
|
SETTING_CPU_AFFINITY = UINT64_C(1) << 20,
|
|
SETTING_RESOLV_CONF = UINT64_C(1) << 21,
|
|
SETTING_LINK_JOURNAL = UINT64_C(1) << 22,
|
|
SETTING_TIMEZONE = UINT64_C(1) << 23,
|
|
SETTING_EPHEMERAL = UINT64_C(1) << 24,
|
|
SETTING_SLICE = UINT64_C(1) << 25,
|
|
SETTING_DIRECTORY = UINT64_C(1) << 26,
|
|
SETTING_USE_CGNS = UINT64_C(1) << 27,
|
|
SETTING_CLONE_NS_FLAGS = UINT64_C(1) << 28,
|
|
SETTING_CONSOLE_MODE = UINT64_C(1) << 29,
|
|
SETTING_RLIMIT_FIRST = UINT64_C(1) << 30, /* we define one bit per resource limit here */
|
|
SETTING_RLIMIT_LAST = UINT64_C(1) << (30 + _RLIMIT_MAX - 1),
|
|
_SETTINGS_MASK_ALL = (UINT64_C(1) << (30 + _RLIMIT_MAX)) -1,
|
|
_SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
|
|
} SettingsMask;
|
|
|
|
/* We want to use SETTING_RLIMIT_FIRST in shifts, so make sure it is really 64 bits
|
|
* when used in expressions. */
|
|
#define SETTING_RLIMIT_FIRST ((uint64_t) SETTING_RLIMIT_FIRST)
|
|
#define SETTING_RLIMIT_LAST ((uint64_t) SETTING_RLIMIT_LAST)
|
|
|
|
assert_cc(sizeof(SettingsMask) == 8);
|
|
assert_cc(sizeof(SETTING_RLIMIT_FIRST) == 8);
|
|
assert_cc(sizeof(SETTING_RLIMIT_LAST) == 8);
|
|
|
|
/* Description of one additional device node (see Settings.extra_nodes). Arrays of these are
 * released with device_node_free_many(). */
typedef struct DeviceNode {
        char *path;      /* NOTE(review): presumably heap-allocated and owned by this entry -- confirm in device_node_free_many() */
        unsigned major;  /* device major number */
        unsigned minor;  /* device minor number */
        mode_t mode;     /* NOTE(review): presumably carries the node type (S_IFCHR/S_IFBLK/...) besides the access bits -- confirm */
        uid_t uid;       /* owning user of the node */
        gid_t gid;       /* owning group of the node */
} DeviceNode;
|
|
|
|
typedef struct OciHook {
|
|
char *path;
|
|
char **args;
|
|
char **env;
|
|
usec_t timeout;
|
|
} OciHook;
|
|
|
|
typedef struct Settings {
|
|
/* [Run] */
|
|
StartMode start_mode;
|
|
bool ephemeral;
|
|
char **parameters;
|
|
char **environment;
|
|
char *user;
|
|
uint64_t capability;
|
|
uint64_t drop_capability;
|
|
int kill_signal;
|
|
unsigned long personality;
|
|
sd_id128_t machine_id;
|
|
char *working_directory;
|
|
char *pivot_root_new;
|
|
char *pivot_root_old;
|
|
UserNamespaceMode userns_mode;
|
|
uid_t uid_shift, uid_range;
|
|
bool notify_ready;
|
|
char **syscall_whitelist;
|
|
char **syscall_blacklist;
|
|
struct rlimit *rlimit[_RLIMIT_MAX];
|
|
char *hostname;
|
|
int no_new_privileges;
|
|
int oom_score_adjust;
|
|
bool oom_score_adjust_set;
|
|
cpu_set_t *cpuset;
|
|
unsigned cpuset_ncpus;
|
|
ResolvConfMode resolv_conf;
|
|
LinkJournal link_journal;
|
|
bool link_journal_try;
|
|
TimezoneMode timezone;
|
|
|
|
/* [Image] */
|
|
int read_only;
|
|
VolatileMode volatile_mode;
|
|
CustomMount *custom_mounts;
|
|
size_t n_custom_mounts;
|
|
int userns_chown;
|
|
|
|
/* [Network] */
|
|
int private_network;
|
|
int network_veth;
|
|
char *network_bridge;
|
|
char *network_zone;
|
|
char **network_interfaces;
|
|
char **network_macvlan;
|
|
char **network_ipvlan;
|
|
char **network_veth_extra;
|
|
ExposePort *expose_ports;
|
|
|
|
/* Additional fields, that are specific to OCI runtime case */
|
|
char *bundle;
|
|
char *root;
|
|
OciHook *oci_hooks_prestart, *oci_hooks_poststart, *oci_hooks_poststop;
|
|
size_t n_oci_hooks_prestart, n_oci_hooks_poststart, n_oci_hooks_poststop;
|
|
char *slice;
|
|
sd_bus_message *properties;
|
|
CapabilityQuintet full_capabilities;
|
|
uid_t uid;
|
|
gid_t gid;
|
|
gid_t *supplementary_gids;
|
|
size_t n_supplementary_gids;
|
|
unsigned console_width, console_height;
|
|
ConsoleMode console_mode;
|
|
DeviceNode *extra_nodes;
|
|
size_t n_extra_nodes;
|
|
unsigned long clone_ns_flags;
|
|
char *network_namespace_path;
|
|
int use_cgns;
|
|
char **sysctl;
|
|
#if HAVE_SECCOMP
|
|
scmp_filter_ctx seccomp;
|
|
#endif
|
|
} Settings;
|
|
|
|
Settings *settings_new(void);
|
|
int settings_load(FILE *f, const char *path, Settings **ret);
|
|
Settings* settings_free(Settings *s);
|
|
|
|
bool settings_network_veth(Settings *s);
|
|
bool settings_private_network(Settings *s);
|
|
int settings_allocate_properties(Settings *s);
|
|
|
|
DEFINE_TRIVIAL_CLEANUP_FUNC(Settings*, settings_free);
|
|
|
|
/* Lookup function for configuration keys; takes the key and its length.
 * NOTE(review): presumably generated by gperf and passed to the config parser when loading
 * settings files -- confirm in the build files. */
const struct ConfigPerfItem* nspawn_gperf_lookup(const char *key, GPERF_LEN_TYPE length);

/* Parser callbacks, one per setting type. CONFIG_PARSER_PROTOTYPE() is not defined in this
 * header (presumably it comes from conf-parser.h and declares a function with the common
 * config parser signature -- confirm). */
CONFIG_PARSER_PROTOTYPE(config_parse_capability);
CONFIG_PARSER_PROTOTYPE(config_parse_id128);
CONFIG_PARSER_PROTOTYPE(config_parse_expose_port);
CONFIG_PARSER_PROTOTYPE(config_parse_volatile_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_pivot_root);
CONFIG_PARSER_PROTOTYPE(config_parse_bind);
CONFIG_PARSER_PROTOTYPE(config_parse_tmpfs);
CONFIG_PARSER_PROTOTYPE(config_parse_overlay);
CONFIG_PARSER_PROTOTYPE(config_parse_inaccessible);
CONFIG_PARSER_PROTOTYPE(config_parse_veth_extra);
CONFIG_PARSER_PROTOTYPE(config_parse_network_zone);
CONFIG_PARSER_PROTOTYPE(config_parse_boot);
CONFIG_PARSER_PROTOTYPE(config_parse_pid2);
CONFIG_PARSER_PROTOTYPE(config_parse_private_users);
CONFIG_PARSER_PROTOTYPE(config_parse_syscall_filter);
CONFIG_PARSER_PROTOTYPE(config_parse_hostname);
CONFIG_PARSER_PROTOTYPE(config_parse_oom_score_adjust);
CONFIG_PARSER_PROTOTYPE(config_parse_cpu_affinity);
CONFIG_PARSER_PROTOTYPE(config_parse_resolv_conf);
CONFIG_PARSER_PROTOTYPE(config_parse_link_journal);
CONFIG_PARSER_PROTOTYPE(config_parse_timezone);
|
|
|
|
const char *resolv_conf_mode_to_string(ResolvConfMode a) _const_;
|
|
ResolvConfMode resolv_conf_mode_from_string(const char *s) _pure_;
|
|
|
|
const char *timezone_mode_to_string(TimezoneMode a) _const_;
|
|
TimezoneMode timezone_mode_from_string(const char *s) _pure_;
|
|
|
|
int parse_link_journal(const char *s, LinkJournal *ret_mode, bool *ret_try);
|
|
|
|
void device_node_free_many(DeviceNode *node, size_t n);
|