seccomp: don't install filters for archs that can't use syscalls

When seccomp_restrict_archs is called, architectures that are blocked
are replaced by the SECCOMP_LOCAL_ARCH_BLOCKED marker so that they are
not disabled again and filters are not installed for them.

This can make some service that use SystemCallArchitecture= and
SystemCallFilter= start faster.
This commit is contained in:
Greg Depoire--Ferrer 2020-10-29 00:51:30 +01:00 committed by Lennart Poettering
parent 104fc4be11
commit 6597686865
3 changed files with 43 additions and 26 deletions

4
TODO
View File

@ -135,10 +135,6 @@ Features:
o move into separate libsystemd-shared-iptables.so .so
- iptables-libs (only used by nspawn + networkd)
* seccomp: when SystemCallArchitectures=native is set then don't install any
other seccomp filters for any of the other archs, in order to reduce the
number of seccomp filters we install needlessly.
* seccomp: maybe use seccomp_merge() to merge our filters per-arch if we can.
Apparently kernel performance is much better with fewer larger seccomp
filters than with more smaller seccomp filters.

View File

@ -23,7 +23,8 @@
#include "string-util.h"
#include "strv.h"
const uint32_t seccomp_local_archs[] = {
/* This array will be modified at runtime as seccomp_restrict_archs is called. */
uint32_t seccomp_local_archs[] = {
/* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
@ -94,7 +95,7 @@ const uint32_t seccomp_local_archs[] = {
#elif defined(__s390__)
SCMP_ARCH_S390,
#endif
(uint32_t) -1
SECCOMP_LOCAL_ARCH_END
};
const char* seccomp_arch_to_string(uint32_t c) {
@ -1758,8 +1759,8 @@ int seccomp_memory_deny_write_execute(void) {
int seccomp_restrict_archs(Set *archs) {
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
void *id;
int r;
bool blocked_new = false;
/* This installs a filter with no rules, but that restricts the system call architectures to the specified
* list.
@ -1775,24 +1776,36 @@ int seccomp_restrict_archs(Set *archs) {
if (!seccomp)
return -ENOMEM;
SET_FOREACH(id, archs) {
r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
if (r < 0 && r != -EEXIST)
return r;
}
for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
uint32_t arch = seccomp_local_archs[i];
/* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
continue;
bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
/* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
* x32 syscalls should basically match x86-64 for everything except the pointer type.
* The important thing is that you can block the old 32-bit x86 syscalls.
* https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
if (seccomp_arch_native() == SCMP_ARCH_X32 ||
set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
if (block) {
seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
blocked_new = true;
} else {
r = seccomp_arch_add(seccomp, arch);
if (r < 0 && r != -EEXIST)
return r;
}
}
/* All architectures that will be blocked by the seccomp program were
* already blocked. */
if (!blocked_new)
return 0;
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0)

View File

@ -100,12 +100,20 @@ int seccomp_lock_personality(unsigned long personality);
int seccomp_protect_hostname(void);
int seccomp_restrict_suid_sgid(void);
extern const uint32_t seccomp_local_archs[];
extern uint32_t seccomp_local_archs[];
#define SECCOMP_LOCAL_ARCH_END UINT32_MAX
/* Note: 0 is safe to use here because although SCMP_ARCH_NATIVE is 0, it would
* never be in the seccomp_local_archs array anyway so we can use it as a
* marker. */
#define SECCOMP_LOCAL_ARCH_BLOCKED 0
#define SECCOMP_FOREACH_LOCAL_ARCH(arch) \
for (unsigned _i = ({ (arch) = seccomp_local_archs[0]; 0; }); \
seccomp_local_archs[_i] != (uint32_t) -1; \
(arch) = seccomp_local_archs[++_i])
(arch) != SECCOMP_LOCAL_ARCH_END; \
(arch) = seccomp_local_archs[++_i]) \
if ((arch) != SECCOMP_LOCAL_ARCH_BLOCKED)
/* EACCES: does not have the CAP_SYS_ADMIN or no_new_privs == 1
* ENOMEM: out of memory, failed to allocate space for a libseccomp structure, or would exceed a defined constant