nspawn: replace syscall blacklist by a whitelist

Let's lock things down a bit, and maintain a list of what's permitted
rather than a list of what's prohibited in nspawn (also to make things a
bit more like Docker and friends).

Note that this slightly alters the effect of --system-call-filter=, as
the negative list now takes precedence over the positive list.
However, given that the option is just a few days old and not included
in any released version it should be fine to change it at this point in
time.

Note that the whitelist is a good chunk more restrictive than the
previous blacklist. Specifically:

- fanotify is not permitted (given the buffer size issues it's
  problematic in containers)
- nfsservctl is not permitted (NFS server support is not virtualized)
- pkey_xyz stuff is not permitted (really new stuff I don't grok)
- @cpu-emulation is prohibited (untested legacy stuff mostly, and if
  people really want to run dosemu in nspawn, they should use
  --system-call-filter=@cpu-emulation and all should be good)
This commit is contained in:
Lennart Poettering 2017-09-14 10:18:57 +02:00
parent cff7bff880
commit 96bedbe2e5
2 changed files with 159 additions and 40 deletions

View File

@ -723,9 +723,9 @@
system calls will be permitted. The list may optionally be prefixed by <literal>~</literal>, in which case all
listed system calls are prohibited. If this command line option is used multiple times the configured lists are
combined. If both a positive and a negative list (that is one system call list without and one with the
<literal>~</literal> prefix) are configured, the positive list takes precedence over the negative list. Note
that <command>systemd-nspawn</command> always implements a system call blacklist (as opposed to a whitelist),
and this command line option hence adds or removes entries from the default blacklist, depending on the
<literal>~</literal> prefix) are configured, the negative list takes precedence over the positive list. Note
that <command>systemd-nspawn</command> always implements a system call whitelist (as opposed to a blacklist),
and this command line option hence adds or removes entries from the default whitelist, depending on the
<literal>~</literal> prefix. Note that the applied system call filter is also altered implicitly if additional
capabilities are passed using the <command>--capabilities=</command>.</para></listitem>
</varlistentry>

View File

@ -47,47 +47,154 @@ static int seccomp_add_default_syscall_filter(
static const struct {
uint64_t capability;
const char* name;
} blacklist[] = {
{ 0, "@obsolete" },
{ 0, "@keyring" }, /* keyring is not namespaced */
{ 0, "bpf" },
{ 0, "kexec_file_load" },
{ 0, "kexec_load" },
{ 0, "lookup_dcookie" },
{ 0, "open_by_handle_at" },
{ 0, "perf_event_open" },
{ 0, "quotactl" },
{ 0, "@swap" },
{ CAP_SYSLOG, "syslog" },
{ CAP_SYS_MODULE, "@module" },
{ CAP_SYS_PACCT, "acct" },
{ CAP_SYS_PTRACE, "process_vm_readv" },
{ CAP_SYS_PTRACE, "process_vm_writev" },
{ CAP_SYS_PTRACE, "ptrace" },
{ CAP_SYS_RAWIO, "@raw-io" },
{ CAP_SYS_TIME, "@clock" },
} whitelist[] = {
/* Let's use set names where we can */
{ 0, "@basic-io" },
{ 0, "@credentials" },
{ 0, "@default" },
{ 0, "@file-system" },
{ 0, "@io-event" },
{ 0, "@ipc" },
{ 0, "@mount" },
{ 0, "@network-io" },
{ 0, "@process" },
{ 0, "@resources" },
{ 0, "@setuid" },
{ 0, "@signal" },
{ 0, "@timer" },
/* The following four are sets we optionally enable, in case the caps have been configured for it */
{ CAP_SYS_TIME, "@clock" },
{ CAP_SYS_MODULE, "@module" },
{ CAP_SYS_RAWIO, "@raw-io" },
{ CAP_IPC_LOCK, "@memlock" },
/* Plus a good set of additional syscalls which are not part of any of the groups above */
{ 0, "brk" },
{ 0, "capset" },
{ 0, "chown" },
{ 0, "chown32" },
{ 0, "copy_file_range" },
{ 0, "fadvise64" },
{ 0, "fadvise64_64" },
{ 0, "fchown" },
{ 0, "fchown32" },
{ 0, "fchownat" },
{ 0, "fdatasync" },
{ 0, "flock" },
{ 0, "fsync" },
{ 0, "get_mempolicy" },
{ 0, "getcpu" },
{ 0, "getpriority" },
{ 0, "getrandom" },
{ 0, "io_cancel" },
{ 0, "io_destroy" },
{ 0, "io_getevents" },
{ 0, "io_setup" },
{ 0, "io_submit" },
{ 0, "ioctl" },
{ 0, "ioprio_get" },
{ 0, "kcmp" },
{ 0, "lchown" },
{ 0, "lchown32" },
{ 0, "madvise" },
{ 0, "mincore" },
{ 0, "mprotect" },
{ 0, "mremap" },
{ 0, "msync" },
{ 0, "name_to_handle_at" },
{ 0, "oldolduname" },
{ 0, "olduname" },
{ 0, "personality" },
{ 0, "preadv2" },
{ 0, "pwritev2" },
{ 0, "readahead" },
{ 0, "readdir" },
{ 0, "remap_file_pages" },
{ 0, "sched_get_priority_max" },
{ 0, "sched_get_priority_min" },
{ 0, "sched_getaffinity" },
{ 0, "sched_getattr" },
{ 0, "sched_getparam" },
{ 0, "sched_getscheduler" },
{ 0, "sched_rr_get_interval" },
{ 0, "sched_yield" },
{ 0, "seccomp" },
{ 0, "sendfile" },
{ 0, "sendfile64" },
{ 0, "setdomainname" },
{ 0, "setfsgid" },
{ 0, "setfsgid32" },
{ 0, "setfsuid" },
{ 0, "setfsuid32" },
{ 0, "sethostname" },
{ 0, "setpgid" },
{ 0, "setsid" },
{ 0, "splice" },
{ 0, "sync" },
{ 0, "sync_file_range" },
{ 0, "syncfs" },
{ 0, "sysinfo" },
{ 0, "tee" },
{ 0, "ugetrlimit" },
{ 0, "umask" },
{ 0, "uname" },
{ 0, "userfaultfd" },
{ 0, "vmsplice" },
/* The following individual syscalls are added depending on specified caps */
{ CAP_SYS_PACCT, "acct" },
{ CAP_SYS_PTRACE, "process_vm_readv" },
{ CAP_SYS_PTRACE, "process_vm_writev" },
{ CAP_SYS_PTRACE, "ptrace" },
{ CAP_SYS_BOOT, "reboot" },
{ CAP_SYSLOG, "syslog" },
{ CAP_SYS_TTY_CONFIG, "vhangup" },
/*
* The following syscalls and groups are knowingly excluded:
*
* @cpu-emulation
* @keyring (NB: keyring is not namespaced!)
* @obsolete
* @swap
*
* bpf (NB: bpffs is not namespaced!)
* fanotify_init
* fanotify_mark
* kexec_file_load
* kexec_load
* lookup_dcookie
* nfsservctl
* open_by_handle_at
* perf_event_open
* pkey_alloc
* pkey_free
* pkey_mprotect
* quotactl
*/
};
int r, c = 0;
size_t i;
char **p;
for (i = 0; i < ELEMENTSOF(blacklist); i++) {
if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
for (i = 0; i < ELEMENTSOF(whitelist); i++) {
if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0)
continue;
r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist);
if (r < 0)
/* If the system call is not known on this architecture, then that's fine, let's ignore it */
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name);
log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch));
else
c++;
}
STRV_FOREACH(p, syscall_blacklist) {
r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
STRV_FOREACH(p, syscall_whitelist) {
r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist);
if (r < 0)
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p);
log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch));
else
c++;
}
@ -106,18 +213,33 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
int n;
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch));
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM));
if (r < 0)
return log_error_errno(r, "Failed to allocate seccomp object: %m");
r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
if (r < 0)
return r;
r = seccomp_load(seccomp);
if (IN_SET(r, -EPERM, -EACCES))
return log_error_errno(r, "Failed to install seccomp filter: %m");
if (r < 0)
log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
}
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch));
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
if (r < 0)
return log_error_errno(r, "Failed to allocate seccomp object: %m");
n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
if (n < 0)
return n;
/*
Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
container. We don't care and just turn off creation of audit sockets.
@ -133,13 +255,10 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
2,
SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
if (r < 0)
if (r < 0) {
log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m");
else
n++;
if (n <= 0) /* no rule added? then skip this architecture */
continue;
}
r = seccomp_load(seccomp);
if (IN_SET(r, -EPERM, -EACCES))