diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index c4db6a3ada..3951e32e8f 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -723,9 +723,9 @@ system calls will be permitted. The list may optionally be prefixed by ~, in which case all listed system calls are prohibited. If this command line option is used multiple times the configured lists are combined. If both a positive and a negative list (that is one system call list without and one with the - ~ prefix) are configured, the positive list takes precedence over the negative list. Note - that systemd-nspawn always implements a system call blacklist (as opposed to a whitelist), - and this command line option hence adds or removes entries from the default blacklist, depending on the + ~ prefix) are configured, the negative list takes precedence over the positive list. Note + that systemd-nspawn always implements a system call whitelist (as opposed to a blacklist), + and this command line option hence adds or removes entries from the default whitelist, depending on the ~ prefix. Note that the applied system call filter is also altered implicitly if additional capabilities are passed using the --capabilities=. diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c index a6f7a7dabc..db3d098e7f 100644 --- a/src/nspawn/nspawn-seccomp.c +++ b/src/nspawn/nspawn-seccomp.c @@ -47,47 +47,154 @@ static int seccomp_add_default_syscall_filter( static const struct { uint64_t capability; const char* name; - } blacklist[] = { - { 0, "@obsolete" }, - { 0, "@keyring" }, /* keyring is not namespaced */ - { 0, "bpf" }, - { 0, "kexec_file_load" }, - { 0, "kexec_load" }, - { 0, "lookup_dcookie" }, - { 0, "open_by_handle_at" }, - { 0, "perf_event_open" }, - { 0, "quotactl" }, - { 0, "@swap" }, - { CAP_SYSLOG, "syslog" }, - { CAP_SYS_MODULE, "@module" }, - { CAP_SYS_PACCT, "acct" }, - { CAP_SYS_PTRACE, "process_vm_readv" }, - { CAP_SYS_PTRACE, "process_vm_writev" }, - { CAP_SYS_PTRACE, "ptrace" }, - { CAP_SYS_RAWIO, "@raw-io" }, - { CAP_SYS_TIME, "@clock" }, + } whitelist[] = { + /* Let's use set names where we can */ + { 0, "@basic-io" }, + { 0, "@credentials" }, + { 0, "@default" }, + { 0, "@file-system" }, + { 0, "@io-event" }, + { 0, "@ipc" }, + { 0, "@mount" }, + { 0, "@network-io" }, + { 0, "@process" }, + { 0, "@resources" }, + { 0, "@setuid" }, + { 0, "@signal" }, + { 0, "@timer" }, + + /* The following four are sets we optionally enable, in case the caps have been configured for it */ + { CAP_SYS_TIME, "@clock" }, + { CAP_SYS_MODULE, "@module" }, + { CAP_SYS_RAWIO, "@raw-io" }, + { CAP_IPC_LOCK, "@memlock" }, + + /* Plus a good set of additional syscalls which are not part of any of the groups above */ + { 0, "brk" }, + { 0, "capset" }, + { 0, "chown" }, + { 0, "chown32" }, + { 0, "copy_file_range" }, + { 0, "fadvise64" }, + { 0, "fadvise64_64" }, + { 0, "fchown" }, + { 0, "fchown32" }, + { 0, "fchownat" }, + { 0, "fdatasync" }, + { 0, "flock" }, + { 0, "fsync" }, + { 0, "get_mempolicy" }, + { 0, "getcpu" }, + { 0, "getpriority" }, + { 0, "getrandom" }, + { 0, "io_cancel" }, + { 0, "io_destroy" }, + { 0, "io_getevents" }, + { 0, "io_setup" }, + { 0, "io_submit" }, + { 0, "ioctl" }, + { 0, "ioprio_get" }, + { 0, "kcmp" }, + { 0, "lchown" }, + { 0, "lchown32" }, + { 0, "madvise" }, + { 0, "mincore" }, + { 0, "mprotect" }, + { 0, "mremap" }, + { 0, "msync" }, + { 0, "name_to_handle_at" }, + { 0, "oldolduname" }, + { 0, "olduname" }, + { 0, "personality" }, + { 0, "preadv2" }, + { 0, "pwritev2" }, + { 0, "readahead" }, + { 0, "readdir" }, + { 0, "remap_file_pages" }, + { 0, "sched_get_priority_max" }, + { 0, "sched_get_priority_min" }, + { 0, "sched_getaffinity" }, + { 0, "sched_getattr" }, + { 0, "sched_getparam" }, + { 0, "sched_getscheduler" }, + { 0, "sched_rr_get_interval" }, + { 0, "sched_yield" }, + { 0, "seccomp" }, + { 0, "sendfile" }, + { 0, "sendfile64" }, + { 0, "setdomainname" }, + { 0, "setfsgid" }, + { 0, "setfsgid32" }, + { 0, "setfsuid" }, + { 0, "setfsuid32" }, + { 0, "sethostname" }, + { 0, "setpgid" }, + { 0, "setsid" }, + { 0, "splice" }, + { 0, "sync" }, + { 0, "sync_file_range" }, + { 0, "syncfs" }, + { 0, "sysinfo" }, + { 0, "tee" }, + { 0, "ugetrlimit" }, + { 0, "umask" }, + { 0, "uname" }, + { 0, "userfaultfd" }, + { 0, "vmsplice" }, + + /* The following individual syscalls are added depending on specified caps */ + { CAP_SYS_PACCT, "acct" }, + { CAP_SYS_PTRACE, "process_vm_readv" }, + { CAP_SYS_PTRACE, "process_vm_writev" }, + { CAP_SYS_PTRACE, "ptrace" }, + { CAP_SYS_BOOT, "reboot" }, + { CAP_SYSLOG, "syslog" }, + { CAP_SYS_TTY_CONFIG, "vhangup" }, + + /* + * The following syscalls and groups are knowingly excluded: + * + * @cpu-emulation + * @keyring (NB: keyring is not namespaced!) + * @obsolete + * @swap + * + * bpf (NB: bpffs is not namespaced!) + * fanotify_init + * fanotify_mark + * kexec_file_load + * kexec_load + * lookup_dcookie + * nfsservctl + * open_by_handle_at + * perf_event_open + * pkey_alloc + * pkey_free + * pkey_mprotect + * quotactl + */ }; int r, c = 0; size_t i; char **p; - for (i = 0; i < ELEMENTSOF(blacklist); i++) { - if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability))) + for (i = 0; i < ELEMENTSOF(whitelist); i++) { + if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0) continue; - r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist); + r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist); if (r < 0) /* If the system call is not known on this architecture, then that's fine, let's ignore it */ - log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name); + log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch)); else c++; } - STRV_FOREACH(p, syscall_blacklist) { - r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist); + STRV_FOREACH(p, syscall_whitelist) { + r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist); if (r < 0) - log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p); + log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch)); else c++; } @@ -106,18 +213,33 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys SECCOMP_FOREACH_LOCAL_ARCH(arch) { _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; - int n; - log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch)); + log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM)); + if (r < 0) + return log_error_errno(r, "Failed to allocate seccomp object: %m"); + + r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist); + if (r < 0) + return r; + + r = seccomp_load(seccomp); + if (IN_SET(r, -EPERM, -EACCES)) + return log_error_errno(r, "Failed to install seccomp filter: %m"); + if (r < 0) + log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch)); r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); if (r < 0) return log_error_errno(r, "Failed to allocate seccomp object: %m"); - n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist); - if (n < 0) - return n; - /* Audit is broken in containers, much of the userspace audit hookup will fail if running inside a container. We don't care and just turn off creation of audit sockets. @@ -133,13 +255,10 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys 2, SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); - if (r < 0) + if (r < 0) { log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m"); - else - n++; - - if (n <= 0) /* no rule added? then skip this architecture */ continue; + } r = seccomp_load(seccomp); if (IN_SET(r, -EPERM, -EACCES))