nspawn: implement configurable syscall whitelisting/blacklisting

Now that we have ported nspawn's seccomp code to the generic code in
seccomp-util, let's extend it to support whitelisting and blacklisting
of specific additional syscalls.

This uses similar syntax as PID1's support for system call filtering,
but in contrast to that always implements a blacklist (and not a
whitelist), as we prepopulate the filter with a blacklist, and the
unit's system call filter logic does not come with anything
prepopulated.

(Later on we might actually want to invert the logic here, and
whitelist rather than blacklist things, but at this point let's not do
that. In case we switch this over later, the syscall add/remove logic of
this commit should be compatible conceptually.)

Fixes: #5163

Replaces: #5944
This commit is contained in:
Lennart Poettering 2017-09-11 17:45:21 +02:00
parent 7609340e2f
commit 960e4569e1
10 changed files with 169 additions and 20 deletions

View File

@ -713,6 +713,23 @@
above).</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--system-call-filter=</option></term>
<listitem><para>Alter the system call filter applied to containers. Takes a space-separated list of system call
names or group names (the latter prefixed with <literal>@</literal>, as listed by the
<command>syscall-filter</command> command of <citerefentry
project='man-pages'><refentrytitle>systemd-analyze</refentrytitle><manvolnum>1</manvolnum></citerefentry>). Passed
system calls will be permitted. The list may optionally be prefixed by <literal>~</literal>, in which case all
listed system calls are prohibited. If this command line option is used multiple times the configured lists are
combined. If both a positive and a negative list (that is one system call list without and one with the
<literal>~</literal> prefix) are configured, the positive list takes precedence over the negative list. Note
that <command>systemd-nspawn</command> always implements a system call blacklist (as opposed to a whitelist),
and this command line option hence adds or removes entries from the default blacklist, depending on the
<literal>~</literal> prefix. Note that the applied system call filter is also altered implicitly if additional
capabilities are passed using <option>--capability=</option>.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--kill-signal=</option></term>

View File

@ -274,11 +274,21 @@
<varlistentry>
<term><varname>NotifyReady=</varname></term>
<listitem><para>Configures support for notifications from the container's init process.
This is equivalent to use <option>--notify-ready=</option> command line switch,
and takes the same options. See <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
for details about the specific options supported.</para></listitem>
<listitem><para>Configures support for notifications from the container's init process. This is equivalent to
the <option>--notify-ready=</option> command line switch, and takes the same parameters. See
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> for details
about the specific options supported.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>SystemCallFilter=</varname></term>
<listitem><para>Configures the system call filter applied to containers. This is equivalent to the
<option>--system-call-filter=</option> command line switch, and takes the same list parameter. See
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> for
details.</para></listitem>
</varlistentry>
</variablelist>
</refsect1>

View File

@ -29,6 +29,7 @@ Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings,
Exec.PivotRoot, config_parse_pivot_root, 0, 0
Exec.PrivateUsers, config_parse_private_users, 0, 0
Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready)
Exec.SystemCallFilter, config_parse_syscall_filter,0, 0,
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
Files.Bind, config_parse_bind, 0, 0

View File

@ -33,13 +33,16 @@
#include "seccomp-util.h"
#endif
#include "string-util.h"
#include "strv.h"
#ifdef HAVE_SECCOMP
static int seccomp_add_default_syscall_filter(
scmp_filter_ctx ctx,
uint32_t arch,
uint64_t cap_list_retain) {
uint64_t cap_list_retain,
char **syscall_whitelist,
char **syscall_blacklist) {
static const struct {
uint64_t capability;
@ -67,12 +70,13 @@ static int seccomp_add_default_syscall_filter(
int r, c = 0;
size_t i;
char **p;
for (i = 0; i < ELEMENTSOF(blacklist); i++) {
if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
continue;
r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM));
r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
if (r < 0)
/* If the system call is not known on this architecture, then that's fine, let's ignore it */
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name);
@ -80,15 +84,23 @@ static int seccomp_add_default_syscall_filter(
c++;
}
STRV_FOREACH(p, syscall_blacklist) {
r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
if (r < 0)
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p);
else
c++;
}
return c;
}
int setup_seccomp(uint64_t cap_list_retain) {
int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
uint32_t arch;
int r;
if (!is_seccomp_available()) {
log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter");
/* Missing kernel support is not treated as an error here; it is only noted at debug level. */
log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filtering");
return 0;
}
@ -102,7 +114,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
if (r < 0)
return log_error_errno(r, "Failed to allocate seccomp object: %m");
n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain);
n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
if (n < 0)
return n;
@ -141,7 +153,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
#else
int setup_seccomp(uint64_t cap_list_retain) {
int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
return 0;
}

View File

@ -21,4 +21,4 @@
#include <sys/types.h>
int setup_seccomp(uint64_t cap_list_retain);
int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist);

View File

@ -93,6 +93,8 @@ Settings* settings_free(Settings *s) {
free(s->pivot_root_new);
free(s->pivot_root_old);
free(s->working_directory);
strv_free(s->syscall_whitelist);
strv_free(s->syscall_blacklist);
strv_free(s->network_interfaces);
strv_free(s->network_macvlan);
@ -568,3 +570,51 @@ int config_parse_private_users(
return 0;
}
int config_parse_syscall_filter(
const char *unit,
const char *filename,
unsigned line,
const char *section,
unsigned section_line,
const char *lvalue,
int ltype,
const char *rvalue,
void *data,
void *userdata) {
Settings *settings = data;
bool negative;
const char *items;
int r;
assert(filename);
assert(lvalue);
assert(rvalue);
negative = rvalue[0] == '~';
items = negative ? rvalue + 1 : rvalue;
for (;;) {
_cleanup_free_ char *word = NULL;
r = extract_first_word(&items, &word, NULL, 0);
if (r == 0)
break;
if (r == -ENOMEM)
return log_oom();
if (r < 0) {
log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse SystemCallFilter= parameter %s, ignoring: %m", rvalue);
return 0;
}
if (negative)
r = strv_extend(&settings->syscall_blacklist, word);
else
r = strv_extend(&settings->syscall_whitelist, word);
if (r < 0)
return log_oom();
}
return 0;
}

View File

@ -58,7 +58,8 @@ typedef enum SettingsMask {
SETTING_USERNS = 1 << 13,
SETTING_NOTIFY_READY = 1 << 14,
SETTING_PIVOT_ROOT = 1 << 15,
_SETTINGS_MASK_ALL = (1 << 16) -1
SETTING_SYSCALL_FILTER = 1 << 16,
_SETTINGS_MASK_ALL = (1 << 17) -1
} SettingsMask;
typedef struct Settings {
@ -78,6 +79,8 @@ typedef struct Settings {
UserNamespaceMode userns_mode;
uid_t uid_shift, uid_range;
bool notify_ready;
char **syscall_whitelist;
char **syscall_blacklist;
/* [Image] */
int read_only;
@ -121,3 +124,4 @@ int config_parse_network_zone(const char *unit, const char *filename, unsigned l
int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_syscall_filter(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);

View File

@ -208,6 +208,8 @@ static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
static void *arg_root_hash = NULL;
static size_t arg_root_hash_size = 0;
static char **arg_syscall_whitelist = NULL;
static char **arg_syscall_blacklist = NULL;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@ -267,6 +269,8 @@ static void help(void) {
" --capability=CAP In addition to the default, retain specified\n"
" capability\n"
" --drop-capability=CAP Drop the specified capability from the default set\n"
" --system-call-filter=LIST|~LIST\n"
" Permit/prohibit specific system calls\n"
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
" --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
" host, try-guest, try-host\n"
@ -431,6 +435,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_PRIVATE_USERS_CHOWN,
ARG_NOTIFY_READY,
ARG_ROOT_HASH,
ARG_SYSTEM_CALL_FILTER,
};
static const struct option options[] = {
@ -482,6 +487,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
{ "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{ "root-hash", required_argument, NULL, ARG_ROOT_HASH },
{ "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
{}
};
@ -1051,6 +1057,36 @@ static int parse_argv(int argc, char *argv[]) {
break;
}
case ARG_SYSTEM_CALL_FILTER: {
bool negative;
const char *items;
negative = optarg[0] == '~';
items = negative ? optarg + 1 : optarg;
for (;;) {
_cleanup_free_ char *word = NULL;
r = extract_first_word(&items, &word, NULL, 0);
if (r == 0)
break;
if (r == -ENOMEM)
return log_oom();
if (r < 0)
return log_error_errno(r, "Failed to parse system call filter: %m");
if (negative)
r = strv_extend(&arg_syscall_blacklist, word);
else
r = strv_extend(&arg_syscall_whitelist, word);
if (r < 0)
return log_oom();
}
arg_settings_mask |= SETTING_SYSCALL_FILTER;
break;
}
case '?':
return -EINVAL;
@ -2606,7 +2642,7 @@ static int outer_child(
if (r < 0)
return r;
r = setup_seccomp(arg_caps_retain);
r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
if (r < 0)
return r;
@ -3111,6 +3147,21 @@ static int load_settings(void) {
if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
arg_notify_ready = settings->notify_ready;
/* Take the system call filter from the settings file only if --system-call-filter= was not used on the
 * command line, i.e. the command line takes precedence. */
if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {

        /* Whitelisting system calls loosens the default filter, hence honour it only for trusted files.
         * The check must look at the list parsed from the settings file: the command line list is only
         * ever filled together with SETTING_SYSCALL_FILTER being set, so testing it in this branch would
         * never trigger and untrusted files would always be applied. A file that carries only a
         * blacklist merely adds further EPERM rules and is hence accepted regardless of trust. */
        if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
                log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
        else {
                strv_free(arg_syscall_whitelist);
                strv_free(arg_syscall_blacklist);

                /* Move ownership of both lists over, and reset the fields in the Settings object so
                 * that settings_free() does not release them a second time. */
                arg_syscall_whitelist = settings->syscall_whitelist;
                arg_syscall_blacklist = settings->syscall_blacklist;
                settings->syscall_whitelist = settings->syscall_blacklist = NULL;
        }
}
return 0;
}

View File

@ -682,14 +682,17 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name) {
return NULL;
}
static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action);
static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action) {
int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
int r;
assert(seccomp);
assert(name);
if (strv_contains(exclude, name))
return 0;
if (name[0] == '@') {
const SyscallFilterSet *other;
@ -697,7 +700,7 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name,
if (!other)
return -EINVAL;
r = seccomp_add_syscall_filter_set(seccomp, other, action);
r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
if (r < 0)
return r;
} else {
@ -719,7 +722,8 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name,
static int seccomp_add_syscall_filter_set(
scmp_filter_ctx seccomp,
const SyscallFilterSet *set,
uint32_t action) {
uint32_t action,
char **exclude) {
const char *sys;
int r;
@ -728,7 +732,7 @@ static int seccomp_add_syscall_filter_set(
assert(set);
NULSTR_FOREACH(sys, set->value) {
r = seccomp_add_syscall_filter_item(seccomp, sys, action);
r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
if (r < 0)
return r;
}
@ -754,7 +758,7 @@ int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilter
if (r < 0)
return r;
r = seccomp_add_syscall_filter_set(seccomp, set, action);
r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
if (r < 0) {
log_debug_errno(r, "Failed to add filter set, ignoring: %m");
continue;

View File

@ -69,7 +69,7 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name);
int seccomp_filter_set_add(Set *s, bool b, const SyscallFilterSet *set);
int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action);
int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action, char **exclude);
int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action);