From 6597686865ffcba7450b44814618b94321cfa3cf Mon Sep 17 00:00:00 2001 From: Greg Depoire--Ferrer Date: Thu, 29 Oct 2020 00:51:30 +0100 Subject: [PATCH] seccomp: don't install filters for archs that can't use syscalls When seccomp_restrict_archs is called, architectures that are blocked are replaced by the SECCOMP_LOCAL_ARCH_BLOCKED marker so that they are not disabled again and filters are not installed for them. This can make some service that use SystemCallArchitecture= and SystemCallFilter= start faster. --- TODO | 4 --- src/shared/seccomp-util.c | 51 ++++++++++++++++++++++++--------------- src/shared/seccomp-util.h | 14 ++++++++--- 3 files changed, 43 insertions(+), 26 deletions(-) diff --git a/TODO b/TODO index 4affa324dc..c4f000e20f 100644 --- a/TODO +++ b/TODO @@ -135,10 +135,6 @@ Features: o move into separate libsystemd-shared-iptables.so .so - iptables-libs (only used by nspawn + networkd) -* seccomp: when SystemCallArchitectures=native is set then don't install any - other seccomp filters for any of the other archs, in order to reduce the - number of seccomp filters we install needlessly. - * seccomp: maybe use seccomp_merge() to merge our filters per-arch if we can. Apparently kernel performance is much better with fewer larger seccomp filters than with more smaller seccomp filters. diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index ccae9d4f14..b5f9ad9adc 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -23,7 +23,8 @@ #include "string-util.h" #include "strv.h" -const uint32_t seccomp_local_archs[] = { +/* This array will be modified at runtime as seccomp_restrict_archs is called. */ +uint32_t seccomp_local_archs[] = { /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */ @@ -94,7 +95,7 @@ const uint32_t seccomp_local_archs[] = { #elif defined(__s390__) SCMP_ARCH_S390, #endif - (uint32_t) -1 + SECCOMP_LOCAL_ARCH_END }; const char* seccomp_arch_to_string(uint32_t c) { @@ -1758,8 +1759,8 @@ int seccomp_memory_deny_write_execute(void) { int seccomp_restrict_archs(Set *archs) { _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; - void *id; int r; + bool blocked_new = false; /* This installs a filter with no rules, but that restricts the system call architectures to the specified * list. @@ -1775,24 +1776,36 @@ int seccomp_restrict_archs(Set *archs) { if (!seccomp) return -ENOMEM; - SET_FOREACH(id, archs) { - r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1); - if (r < 0 && r != -EEXIST) - return r; + for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) { + uint32_t arch = seccomp_local_archs[i]; + + /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */ + if (arch == SECCOMP_LOCAL_ARCH_BLOCKED) + continue; + + bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1)); + + /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32 + * x32 syscalls should basically match x86-64 for everything except the pointer type. + * The important thing is that you can block the old 32-bit x86 syscalls. + * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */ + if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32) + block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1)); + + if (block) { + seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED; + blocked_new = true; + } else { + r = seccomp_arch_add(seccomp, arch); + if (r < 0 && r != -EEXIST) + return r; + } } - /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32 - * x32 syscalls should basically match x86-64 for everything except the pointer type. - * The important thing is that you can block the old 32-bit x86 syscalls. - * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */ - - if (seccomp_arch_native() == SCMP_ARCH_X32 || - set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) { - - r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64); - if (r < 0 && r != -EEXIST) - return r; - } + /* All architectures that will be blocked by the seccomp program were + * already blocked. */ + if (!blocked_new) + return 0; r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); if (r < 0) diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 610597127e..53b74bdc34 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -100,12 +100,20 @@ int seccomp_lock_personality(unsigned long personality); int seccomp_protect_hostname(void); int seccomp_restrict_suid_sgid(void); -extern const uint32_t seccomp_local_archs[]; +extern uint32_t seccomp_local_archs[]; + +#define SECCOMP_LOCAL_ARCH_END UINT32_MAX + +/* Note: 0 is safe to use here because although SCMP_ARCH_NATIVE is 0, it would + * never be in the seccomp_local_archs array anyway so we can use it as a + * marker. */ +#define SECCOMP_LOCAL_ARCH_BLOCKED 0 #define SECCOMP_FOREACH_LOCAL_ARCH(arch) \ for (unsigned _i = ({ (arch) = seccomp_local_archs[0]; 0; }); \ - seccomp_local_archs[_i] != (uint32_t) -1; \ - (arch) = seccomp_local_archs[++_i]) + (arch) != SECCOMP_LOCAL_ARCH_END; \ + (arch) = seccomp_local_archs[++_i]) \ + if ((arch) != SECCOMP_LOCAL_ARCH_BLOCKED) /* EACCES: does not have the CAP_SYS_ADMIN or no_new_privs == 1 * ENOMEM: out of memory, failed to allocate space for a libseccomp structure, or would exceed a defined constant