From 960e4569e17abf7c84f07b697d57ac7d0418edfc Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 11 Sep 2017 17:45:21 +0200 Subject: [PATCH] nspawn: implement configurable syscall whitelisting/blacklisting Now that we have ported nspawn's seccomp code to the generic code in seccomp-util, let's extend it to support whitelisting and blacklisting of specific additional syscalls. This uses similar syntax as PID1's support for system call filtering, but in contrast to that always implements a blacklist (and not a whitelist), as we prepopulate the filter with a blacklist, and the unit's system call filter logic does not come with anything prepopulated. (Later on we might actually want to invert the logic here, and whitelist rather than blacklist things, but at this point let's not do that. In case we switch this over later, the syscall add/remove logic of this commit should be compatible conceptually.) Fixes: #5163 Replaces: #5944 --- man/systemd-nspawn.xml | 17 +++++++++++ man/systemd.nspawn.xml | 18 +++++++++--- src/nspawn/nspawn-gperf.gperf | 1 + src/nspawn/nspawn-seccomp.c | 24 ++++++++++++---- src/nspawn/nspawn-seccomp.h | 2 +- src/nspawn/nspawn-settings.c | 50 +++++++++++++++++++++++++++++++++ src/nspawn/nspawn-settings.h | 6 +++- src/nspawn/nspawn.c | 53 ++++++++++++++++++++++++++++++++++- src/shared/seccomp-util.c | 16 +++++++---- src/shared/seccomp-util.h | 2 +- 10 files changed, 169 insertions(+), 20 deletions(-) diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index 5d3212dec7..c4db6a3ada 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -713,6 +713,23 @@ above). + + + + Alter the system call filter applied to containers. Takes a space-separated list of system call + names or group names (the latter prefixed with @, as listed by the + syscall-filter command of systemd-analyze(1)). Passed + system calls will be permitted. The list may optionally be prefixed by ~, in which case all + listed system calls are prohibited. 
If this command line option is used multiple times the configured lists are + combined. If both a positive and a negative list (that is one system call list without and one with the + ~ prefix) are configured, the positive list takes precedence over the negative list. Note + that systemd-nspawn always implements a system call blacklist (as opposed to a whitelist), + and this command line option hence adds or removes entries from the default blacklist, depending on the + ~ prefix. Note that the applied system call filter is also altered implicitly if additional + capabilities are passed using --capability=. + + diff --git a/man/systemd.nspawn.xml b/man/systemd.nspawn.xml index 4f3f052911..58024a071d 100644 --- a/man/systemd.nspawn.xml +++ b/man/systemd.nspawn.xml @@ -274,11 +274,21 @@ NotifyReady= - Configures support for notifications from the container's init process. - This is equivalent to use command line switch, - and takes the same options. See systemd-nspawn1 - for details about the specific options supported. + Configures support for notifications from the container's init process. This is equivalent to + the --notify-ready= command line switch, and takes the same parameters. See + systemd-nspawn(1) for details + about the specific options supported. + + + SystemCallFilter= + + Configures the system call filter applied to containers. This is equivalent to the + --system-call-filter= command line switch, and takes the same list parameter. See + systemd-nspawn(1) for + details. 
+ + diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf index e5fdf63162..b61b347ee7 100644 --- a/src/nspawn/nspawn-gperf.gperf +++ b/src/nspawn/nspawn-gperf.gperf @@ -29,6 +29,7 @@ Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, Exec.PivotRoot, config_parse_pivot_root, 0, 0 Exec.PrivateUsers, config_parse_private_users, 0, 0 Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready) +Exec.SystemCallFilter, config_parse_syscall_filter,0, 0, Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) Files.Bind, config_parse_bind, 0, 0 diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c index 25851401f3..a6f7a7dabc 100644 --- a/src/nspawn/nspawn-seccomp.c +++ b/src/nspawn/nspawn-seccomp.c @@ -33,13 +33,16 @@ #include "seccomp-util.h" #endif #include "string-util.h" +#include "strv.h" #ifdef HAVE_SECCOMP static int seccomp_add_default_syscall_filter( scmp_filter_ctx ctx, uint32_t arch, - uint64_t cap_list_retain) { + uint64_t cap_list_retain, + char **syscall_whitelist, + char **syscall_blacklist) { static const struct { uint64_t capability; @@ -67,12 +70,13 @@ static int seccomp_add_default_syscall_filter( int r, c = 0; size_t i; + char **p; for (i = 0; i < ELEMENTSOF(blacklist); i++) { if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability))) continue; - r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM)); + r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist); if (r < 0) /* If the system call is not known on this architecture, then that's fine, let's ignore it */ log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name); @@ -80,15 +84,23 @@ static int seccomp_add_default_syscall_filter( c++; } + STRV_FOREACH(p, syscall_blacklist) { + r = 
seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist); + if (r < 0) + log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p); + else + c++; + } + return c; } -int setup_seccomp(uint64_t cap_list_retain) { +int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) { uint32_t arch; int r; if (!is_seccomp_available()) { - log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter"); + log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering"); return 0; } @@ -102,7 +114,7 @@ int setup_seccomp(uint64_t cap_list_retain) { if (r < 0) return log_error_errno(r, "Failed to allocate seccomp object: %m"); - n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain); + n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist); if (n < 0) return n; @@ -141,7 +153,7 @@ int setup_seccomp(uint64_t cap_list_retain) { #else -int setup_seccomp(uint64_t cap_list_retain) { +int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) { return 0; } diff --git a/src/nspawn/nspawn-seccomp.h b/src/nspawn/nspawn-seccomp.h index 5bde16faf9..5cf5ad1e14 100644 --- a/src/nspawn/nspawn-seccomp.h +++ b/src/nspawn/nspawn-seccomp.h @@ -21,4 +21,4 @@ #include -int setup_seccomp(uint64_t cap_list_retain); +int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist); diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c index 5217d10665..c02c1ea697 100644 --- a/src/nspawn/nspawn-settings.c +++ b/src/nspawn/nspawn-settings.c @@ -93,6 +93,8 @@ Settings* settings_free(Settings *s) { free(s->pivot_root_new); free(s->pivot_root_old); free(s->working_directory); + strv_free(s->syscall_whitelist); + strv_free(s->syscall_blacklist); strv_free(s->network_interfaces); strv_free(s->network_macvlan); @@ -568,3 +570,51 
@@ int config_parse_private_users( return 0; } + +int config_parse_syscall_filter( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + bool negative; + const char *items; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + negative = rvalue[0] == '~'; + items = negative ? rvalue + 1 : rvalue; + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&items, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse SystemCallFilter= parameter %s, ignoring: %m", rvalue); + return 0; + } + + if (negative) + r = strv_extend(&settings->syscall_blacklist, word); + else + r = strv_extend(&settings->syscall_whitelist, word); + if (r < 0) + return log_oom(); + } + + return 0; +} diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h index 021403258f..75d68ce4cf 100644 --- a/src/nspawn/nspawn-settings.h +++ b/src/nspawn/nspawn-settings.h @@ -58,7 +58,8 @@ typedef enum SettingsMask { SETTING_USERNS = 1 << 13, SETTING_NOTIFY_READY = 1 << 14, SETTING_PIVOT_ROOT = 1 << 15, - _SETTINGS_MASK_ALL = (1 << 16) -1 + SETTING_SYSCALL_FILTER = 1 << 16, + _SETTINGS_MASK_ALL = (1 << 17) -1 } SettingsMask; typedef struct Settings { @@ -78,6 +79,8 @@ typedef struct Settings { UserNamespaceMode userns_mode; uid_t uid_shift, uid_range; bool notify_ready; + char **syscall_whitelist; + char **syscall_blacklist; /* [Image] */ int read_only; @@ -121,3 +124,4 @@ int config_parse_network_zone(const char *unit, const char *filename, unsigned l int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_pid2(const 
char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_syscall_filter(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 24a3da68ca..cf804ed1b3 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -208,6 +208,8 @@ static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO; static void *arg_root_hash = NULL; static size_t arg_root_hash_size = 0; +static char **arg_syscall_whitelist = NULL; +static char **arg_syscall_blacklist = NULL; static void help(void) { printf("%s [OPTIONS...] 
[PATH] [ARGUMENTS...]\n\n" @@ -267,6 +269,8 @@ static void help(void) { " --capability=CAP In addition to the default, retain specified\n" " capability\n" " --drop-capability=CAP Drop the specified capability from the default set\n" + " --system-call-filter=LIST|~LIST\n" + " Permit/prohibit specific system calls\n" " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n" " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n" " host, try-guest, try-host\n" @@ -431,6 +435,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_PRIVATE_USERS_CHOWN, ARG_NOTIFY_READY, ARG_ROOT_HASH, + ARG_SYSTEM_CALL_FILTER, }; static const struct option options[] = { @@ -482,6 +487,7 @@ static int parse_argv(int argc, char *argv[]) { { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT }, { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, { "root-hash", required_argument, NULL, ARG_ROOT_HASH }, + { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER }, {} }; @@ -1051,6 +1057,36 @@ static int parse_argv(int argc, char *argv[]) { break; } + case ARG_SYSTEM_CALL_FILTER: { + bool negative; + const char *items; + + negative = optarg[0] == '~'; + items = negative ? 
optarg + 1 : optarg; + + for (;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&items, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to parse system call filter: %m"); + + if (negative) + r = strv_extend(&arg_syscall_blacklist, word); + else + r = strv_extend(&arg_syscall_whitelist, word); + if (r < 0) + return log_oom(); + } + + arg_settings_mask |= SETTING_SYSCALL_FILTER; + break; + } + case '?': return -EINVAL; @@ -2606,7 +2642,7 @@ static int outer_child( if (r < 0) return r; - r = setup_seccomp(arg_caps_retain); + r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist); if (r < 0) return r; @@ -3111,6 +3147,21 @@ static int load_settings(void) { if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0) arg_notify_ready = settings->notify_ready; + if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) { + + if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist)) + log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p); + else { + strv_free(arg_syscall_whitelist); + strv_free(arg_syscall_blacklist); + + arg_syscall_whitelist = settings->syscall_whitelist; + arg_syscall_blacklist = settings->syscall_blacklist; + + settings->syscall_whitelist = settings->syscall_blacklist = NULL; + } + } + return 0; } diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 1215f714f1..643dde6c4a 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -682,14 +682,17 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name) { return NULL; } -static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action); +static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude); -int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action) { 
+int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) { int r; assert(seccomp); assert(name); + if (strv_contains(exclude, name)) + return 0; + if (name[0] == '@') { const SyscallFilterSet *other; @@ -697,7 +700,7 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, if (!other) return -EINVAL; - r = seccomp_add_syscall_filter_set(seccomp, other, action); + r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude); if (r < 0) return r; } else { @@ -719,7 +722,8 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, static int seccomp_add_syscall_filter_set( scmp_filter_ctx seccomp, const SyscallFilterSet *set, - uint32_t action) { + uint32_t action, + char **exclude) { const char *sys; int r; @@ -728,7 +732,7 @@ static int seccomp_add_syscall_filter_set( assert(set); NULSTR_FOREACH(sys, set->value) { - r = seccomp_add_syscall_filter_item(seccomp, sys, action); + r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude); if (r < 0) return r; } @@ -754,7 +758,7 @@ int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilter if (r < 0) return r; - r = seccomp_add_syscall_filter_set(seccomp, set, action); + r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL); if (r < 0) { log_debug_errno(r, "Failed to add filter set, ignoring: %m"); continue; diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 894c53e6fd..c1612f5894 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -69,7 +69,7 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name); int seccomp_filter_set_add(Set *s, bool b, const SyscallFilterSet *set); -int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action); +int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action, char **exclude); int 
seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action); int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action);