seccomp: don't install filters for archs that can't use syscalls

When seccomp_restrict_archs is called, architectures that are blocked
are replaced by the SECCOMP_LOCAL_ARCH_BLOCKED marker so that they are
not disabled again and filters are not installed for them.

This can make some services that use SystemCallArchitectures= and
SystemCallFilter= start faster.
This commit is contained in:
Greg Depoire--Ferrer 2020-10-29 00:51:30 +01:00 committed by Lennart Poettering
parent 104fc4be11
commit 6597686865
3 changed files with 43 additions and 26 deletions

4
TODO
View File

@ -135,10 +135,6 @@ Features:
o move into separate libsystemd-shared-iptables.so .so o move into separate libsystemd-shared-iptables.so .so
- iptables-libs (only used by nspawn + networkd) - iptables-libs (only used by nspawn + networkd)
* seccomp: when SystemCallArchitectures=native is set then don't install any
other seccomp filters for any of the other archs, in order to reduce the
number of seccomp filters we install needlessly.
* seccomp: maybe use seccomp_merge() to merge our filters per-arch if we can. * seccomp: maybe use seccomp_merge() to merge our filters per-arch if we can.
Apparently kernel performance is much better with fewer larger seccomp Apparently kernel performance is much better with fewer larger seccomp
filters than with more smaller seccomp filters. filters than with more smaller seccomp filters.

View File

@ -23,7 +23,8 @@
#include "string-util.h" #include "string-util.h"
#include "strv.h" #include "strv.h"
const uint32_t seccomp_local_archs[] = { /* This array will be modified at runtime as seccomp_restrict_archs is called. */
uint32_t seccomp_local_archs[] = {
/* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */ /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
@ -94,7 +95,7 @@ const uint32_t seccomp_local_archs[] = {
#elif defined(__s390__) #elif defined(__s390__)
SCMP_ARCH_S390, SCMP_ARCH_S390,
#endif #endif
(uint32_t) -1 SECCOMP_LOCAL_ARCH_END
}; };
const char* seccomp_arch_to_string(uint32_t c) { const char* seccomp_arch_to_string(uint32_t c) {
@ -1758,8 +1759,8 @@ int seccomp_memory_deny_write_execute(void) {
int seccomp_restrict_archs(Set *archs) { int seccomp_restrict_archs(Set *archs) {
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
void *id;
int r; int r;
bool blocked_new = false;
/* This installs a filter with no rules, but that restricts the system call architectures to the specified /* This installs a filter with no rules, but that restricts the system call architectures to the specified
* list. * list.
@ -1775,24 +1776,36 @@ int seccomp_restrict_archs(Set *archs) {
if (!seccomp) if (!seccomp)
return -ENOMEM; return -ENOMEM;
SET_FOREACH(id, archs) { for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1); uint32_t arch = seccomp_local_archs[i];
if (r < 0 && r != -EEXIST)
return r; /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
continue;
bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
/* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
* x32 syscalls should basically match x86-64 for everything except the pointer type.
* The important thing is that you can block the old 32-bit x86 syscalls.
* https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
if (block) {
seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
blocked_new = true;
} else {
r = seccomp_arch_add(seccomp, arch);
if (r < 0 && r != -EEXIST)
return r;
}
} }
/* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32 /* All architectures that will be blocked by the seccomp program were
* x32 syscalls should basically match x86-64 for everything except the pointer type. * already blocked. */
* The important thing is that you can block the old 32-bit x86 syscalls. if (!blocked_new)
* https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */ return 0;
if (seccomp_arch_native() == SCMP_ARCH_X32 ||
set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
if (r < 0 && r != -EEXIST)
return r;
}
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0) if (r < 0)

View File

@ -100,12 +100,20 @@ int seccomp_lock_personality(unsigned long personality);
int seccomp_protect_hostname(void); int seccomp_protect_hostname(void);
int seccomp_restrict_suid_sgid(void); int seccomp_restrict_suid_sgid(void);
extern const uint32_t seccomp_local_archs[]; extern uint32_t seccomp_local_archs[];
#define SECCOMP_LOCAL_ARCH_END UINT32_MAX
/* Note: 0 is safe to use here because although SCMP_ARCH_NATIVE is 0, it would
* never be in the seccomp_local_archs array anyway so we can use it as a
* marker. */
#define SECCOMP_LOCAL_ARCH_BLOCKED 0
#define SECCOMP_FOREACH_LOCAL_ARCH(arch) \ #define SECCOMP_FOREACH_LOCAL_ARCH(arch) \
for (unsigned _i = ({ (arch) = seccomp_local_archs[0]; 0; }); \ for (unsigned _i = ({ (arch) = seccomp_local_archs[0]; 0; }); \
seccomp_local_archs[_i] != (uint32_t) -1; \ (arch) != SECCOMP_LOCAL_ARCH_END; \
(arch) = seccomp_local_archs[++_i]) (arch) = seccomp_local_archs[++_i]) \
if ((arch) != SECCOMP_LOCAL_ARCH_BLOCKED)
/* EACCES: does not have the CAP_SYS_ADMIN or no_new_privs == 1 /* EACCES: does not have the CAP_SYS_ADMIN or no_new_privs == 1
* ENOMEM: out of memory, failed to allocate space for a libseccomp structure, or would exceed a defined constant * ENOMEM: out of memory, failed to allocate space for a libseccomp structure, or would exceed a defined constant