2017-11-18 17:09:20 +01:00
|
|
|
/* SPDX-License-Identifier: LGPL-2.1+ */
|
2014-02-13 00:24:00 +01:00
|
|
|
|
2015-12-03 21:13:37 +01:00
|
|
|
#include <errno.h>
|
2019-03-20 19:00:28 +01:00
|
|
|
#include <fcntl.h>
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
#include <linux/seccomp.h>
|
2014-02-13 00:24:00 +01:00
|
|
|
#include <seccomp.h>
|
2015-12-03 21:13:37 +01:00
|
|
|
#include <stddef.h>
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
#include <sys/mman.h>
|
2016-08-31 15:00:35 +02:00
|
|
|
#include <sys/prctl.h>
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
#include <sys/shm.h>
|
2019-03-20 19:00:28 +01:00
|
|
|
#include <sys/stat.h>
|
2014-02-13 00:24:00 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
#include "af-list.h"
|
2016-11-02 03:25:19 +01:00
|
|
|
#include "alloc-util.h"
|
2020-08-16 22:57:41 +02:00
|
|
|
#include "env-util.h"
|
2019-03-14 13:14:33 +01:00
|
|
|
#include "errno-list.h"
|
2015-12-03 21:13:37 +01:00
|
|
|
#include "macro.h"
|
2016-11-02 03:25:19 +01:00
|
|
|
#include "nsflags.h"
|
2019-03-14 13:14:33 +01:00
|
|
|
#include "nulstr-util.h"
|
2017-07-04 14:48:18 +02:00
|
|
|
#include "process-util.h"
|
2015-11-16 22:09:36 +01:00
|
|
|
#include "seccomp-util.h"
|
2017-08-02 06:46:45 +02:00
|
|
|
#include "set.h"
|
2015-10-24 22:58:24 +02:00
|
|
|
#include "string-util.h"
|
2017-08-02 06:46:45 +02:00
|
|
|
#include "strv.h"
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
|
|
|
|
const uint32_t seccomp_local_archs[] = {
|
|
|
|
|
2020-06-23 08:31:16 +02:00
|
|
|
/* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
|
2017-02-10 23:47:50 +01:00
|
|
|
|
|
|
|
#if defined(__x86_64__) && defined(__ILP32__)
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_X86,
|
|
|
|
SCMP_ARCH_X86_64,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_X32, /* native */
|
|
|
|
#elif defined(__x86_64__) && !defined(__ILP32__)
|
|
|
|
SCMP_ARCH_X86,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_X32,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_X86_64, /* native */
|
|
|
|
#elif defined(__i386__)
|
|
|
|
SCMP_ARCH_X86,
|
|
|
|
#elif defined(__aarch64__)
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_ARM,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_AARCH64, /* native */
|
|
|
|
#elif defined(__arm__)
|
|
|
|
SCMP_ARCH_ARM,
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
|
|
|
|
SCMP_ARCH_MIPSEL,
|
|
|
|
SCMP_ARCH_MIPS, /* native */
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_MIPS,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_MIPSEL, /* native */
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
|
|
|
|
SCMP_ARCH_MIPSEL,
|
|
|
|
SCMP_ARCH_MIPS,
|
|
|
|
SCMP_ARCH_MIPSEL64N32,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_MIPS64N32,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_MIPSEL64,
|
|
|
|
SCMP_ARCH_MIPS64, /* native */
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
|
|
|
|
SCMP_ARCH_MIPS,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_MIPSEL,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_MIPS64N32,
|
|
|
|
SCMP_ARCH_MIPSEL64N32,
|
|
|
|
SCMP_ARCH_MIPS64,
|
|
|
|
SCMP_ARCH_MIPSEL64, /* native */
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
|
|
|
|
SCMP_ARCH_MIPSEL,
|
|
|
|
SCMP_ARCH_MIPS,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_MIPSEL64,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_MIPS64,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_MIPSEL64N32,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_MIPS64N32, /* native */
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
|
|
|
|
SCMP_ARCH_MIPS,
|
|
|
|
SCMP_ARCH_MIPSEL,
|
|
|
|
SCMP_ARCH_MIPS64,
|
|
|
|
SCMP_ARCH_MIPSEL64,
|
|
|
|
SCMP_ARCH_MIPS64N32,
|
|
|
|
SCMP_ARCH_MIPSEL64N32, /* native */
|
|
|
|
#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_PPC,
|
|
|
|
SCMP_ARCH_PPC64LE,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_PPC64, /* native */
|
|
|
|
#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
|
|
|
|
SCMP_ARCH_PPC,
|
|
|
|
SCMP_ARCH_PPC64,
|
|
|
|
SCMP_ARCH_PPC64LE, /* native */
|
|
|
|
#elif defined(__powerpc__)
|
|
|
|
SCMP_ARCH_PPC,
|
2020-08-19 22:44:15 +02:00
|
|
|
#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
|
|
|
|
SCMP_ARCH_RISCV64,
|
2017-02-10 23:47:50 +01:00
|
|
|
#elif defined(__s390x__)
|
|
|
|
SCMP_ARCH_S390,
|
|
|
|
SCMP_ARCH_S390X, /* native */
|
|
|
|
#elif defined(__s390__)
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_S390,
|
|
|
|
#endif
|
|
|
|
(uint32_t) -1
|
|
|
|
};
|
2014-02-13 00:24:00 +01:00
|
|
|
|
|
|
|
const char* seccomp_arch_to_string(uint32_t c) {
|
2016-11-01 16:33:18 +01:00
|
|
|
/* Maintain order used in <seccomp.h>.
|
|
|
|
*
|
|
|
|
* Names used here should be the same as those used for ConditionArchitecture=,
|
|
|
|
* except for "subarchitectures" like x32. */
|
2014-02-13 00:24:00 +01:00
|
|
|
|
2016-11-01 16:33:18 +01:00
|
|
|
switch(c) {
|
|
|
|
case SCMP_ARCH_NATIVE:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "native";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_X86:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "x86";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_X86_64:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "x86-64";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_X32:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "x32";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_ARM:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "arm";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_AARCH64:
|
|
|
|
return "arm64";
|
|
|
|
case SCMP_ARCH_MIPS:
|
|
|
|
return "mips";
|
|
|
|
case SCMP_ARCH_MIPS64:
|
|
|
|
return "mips64";
|
|
|
|
case SCMP_ARCH_MIPS64N32:
|
|
|
|
return "mips64-n32";
|
|
|
|
case SCMP_ARCH_MIPSEL:
|
|
|
|
return "mips-le";
|
|
|
|
case SCMP_ARCH_MIPSEL64:
|
|
|
|
return "mips64-le";
|
|
|
|
case SCMP_ARCH_MIPSEL64N32:
|
|
|
|
return "mips64-le-n32";
|
|
|
|
case SCMP_ARCH_PPC:
|
|
|
|
return "ppc";
|
|
|
|
case SCMP_ARCH_PPC64:
|
|
|
|
return "ppc64";
|
|
|
|
case SCMP_ARCH_PPC64LE:
|
|
|
|
return "ppc64-le";
|
2020-08-19 22:44:15 +02:00
|
|
|
#ifdef SCMP_ARCH_RISCV64
|
|
|
|
case SCMP_ARCH_RISCV64:
|
|
|
|
return "riscv64";
|
|
|
|
#endif
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_S390:
|
2016-10-05 13:58:55 +02:00
|
|
|
return "s390";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_S390X:
|
2016-10-05 13:58:55 +02:00
|
|
|
return "s390x";
|
2016-11-01 16:33:18 +01:00
|
|
|
default:
|
|
|
|
return NULL;
|
|
|
|
}
|
2014-02-13 00:24:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
int seccomp_arch_from_string(const char *n, uint32_t *ret) {
|
|
|
|
if (!n)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
assert(ret);
|
|
|
|
|
|
|
|
if (streq(n, "native"))
|
|
|
|
*ret = SCMP_ARCH_NATIVE;
|
|
|
|
else if (streq(n, "x86"))
|
|
|
|
*ret = SCMP_ARCH_X86;
|
|
|
|
else if (streq(n, "x86-64"))
|
|
|
|
*ret = SCMP_ARCH_X86_64;
|
|
|
|
else if (streq(n, "x32"))
|
|
|
|
*ret = SCMP_ARCH_X32;
|
|
|
|
else if (streq(n, "arm"))
|
|
|
|
*ret = SCMP_ARCH_ARM;
|
2016-11-01 16:33:18 +01:00
|
|
|
else if (streq(n, "arm64"))
|
|
|
|
*ret = SCMP_ARCH_AARCH64;
|
|
|
|
else if (streq(n, "mips"))
|
|
|
|
*ret = SCMP_ARCH_MIPS;
|
|
|
|
else if (streq(n, "mips64"))
|
|
|
|
*ret = SCMP_ARCH_MIPS64;
|
|
|
|
else if (streq(n, "mips64-n32"))
|
|
|
|
*ret = SCMP_ARCH_MIPS64N32;
|
|
|
|
else if (streq(n, "mips-le"))
|
|
|
|
*ret = SCMP_ARCH_MIPSEL;
|
|
|
|
else if (streq(n, "mips64-le"))
|
|
|
|
*ret = SCMP_ARCH_MIPSEL64;
|
|
|
|
else if (streq(n, "mips64-le-n32"))
|
|
|
|
*ret = SCMP_ARCH_MIPSEL64N32;
|
|
|
|
else if (streq(n, "ppc"))
|
|
|
|
*ret = SCMP_ARCH_PPC;
|
|
|
|
else if (streq(n, "ppc64"))
|
|
|
|
*ret = SCMP_ARCH_PPC64;
|
|
|
|
else if (streq(n, "ppc64-le"))
|
|
|
|
*ret = SCMP_ARCH_PPC64LE;
|
2020-08-19 22:44:15 +02:00
|
|
|
#ifdef SCMP_ARCH_RISCV64
|
|
|
|
else if (streq(n, "riscv64"))
|
|
|
|
*ret = SCMP_ARCH_RISCV64;
|
|
|
|
#endif
|
2016-10-05 13:58:55 +02:00
|
|
|
else if (streq(n, "s390"))
|
|
|
|
*ret = SCMP_ARCH_S390;
|
|
|
|
else if (streq(n, "s390x"))
|
|
|
|
*ret = SCMP_ARCH_S390X;
|
2014-02-13 00:24:00 +01:00
|
|
|
else
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2014-02-18 22:14:00 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
        _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
        bool native_only;
        int r;

        /* Works like seccomp_init(), except that the resulting filter is set up for exactly one
         * architecture ('arch') and does not affect any others. The NNP fiddling is turned off, too.
         *
         * On success the new filter context is handed to the caller via *ret and 0 is returned. On
         * failure a negative error code is returned and *ret is not written. */

        seccomp = seccomp_init(default_action);
        if (!seccomp)
                return -ENOMEM;

        native_only = arch == SCMP_ARCH_NATIVE || arch == seccomp_arch_native();
        if (native_only) {
                /* The freshly initialized context is expected to carry the native architecture already. */
                assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
                assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
        } else {
                /* A non-native architecture was requested: swap the native one out for it. */
                r = seccomp_arch_remove(seccomp, seccomp_arch_native());
                if (r < 0)
                        return r;

                r = seccomp_arch_add(seccomp, arch);
                if (r < 0)
                        return r;

                /* Now only 'arch' may be present in the context, the native one must be gone. */
                assert(seccomp_arch_exist(seccomp, arch) >= 0);
                assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
                assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
        }

        /* Set the "bad architecture" action of this filter to "allow". */
        r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
        if (r < 0)
                return r;

        /* Set the NNP attribute of this filter to 0. */
        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0)
                return r;

#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
        /* Optional: if $SYSTEMD_LOG_SECCOMP is true, set the filter's log attribute. A failure to do so
         * is only logged at debug level and otherwise ignored. */
        if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
                r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
                if (r < 0)
                        log_debug_errno(r, "Failed to enable seccomp event logging: %m");
        }
#endif

        *ret = TAKE_PTR(seccomp);
        return 0;
}
|
|
|
|
|
2016-08-31 15:00:35 +02:00
|
|
|
static bool is_basic_seccomp_available(void) {
        /* Issue a PR_GET_SECCOMP query and report whether the call succeeded, i.e. returned a
         * non-negative value. */
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
|
|
|
|
|
|
|
|
static bool is_seccomp_filter_available(void) {
        /* Try to enable filter mode while passing NULL as the filter. Filter mode counts as available
         * only if this attempt fails and the failure is specifically EFAULT; an unexpected success or
         * any other error counts as "not available". */

        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
|
|
|
|
|
2016-08-22 21:40:58 +02:00
|
|
|
bool is_seccomp_available(void) {
        /* Reports whether both the basic seccomp probe and the seccomp filter probe succeed. The
         * outcome is stored in a function-local static, so the probes are executed on the first call
         * only; later calls return the remembered result. */
        static int cached_enabled = -1;

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();

        return cached_enabled;
}
|
|
|
|
|
2016-10-21 21:50:05 +02:00
|
|
|
const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
|
2016-11-02 17:01:04 +01:00
|
|
|
[SYSCALL_FILTER_SET_DEFAULT] = {
|
|
|
|
.name = "@default",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "System calls that are always permitted",
|
2016-11-02 17:01:04 +01:00
|
|
|
.value =
|
|
|
|
"clock_getres\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"clock_getres_time64\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"clock_gettime\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"clock_gettime64\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"clock_nanosleep\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"clock_nanosleep_time64\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"execve\0"
|
|
|
|
"exit\0"
|
|
|
|
"exit_group\0"
|
seccomp: update "@default" seccomp group a bit
Let's add more of the most basic operations to "@default" as absolute
baseline needed by glibc and such to operate. Specifically:
futex, get_robust_list, get_thread_area, membarrier, set_robust_list,
set_thread_area, set_tid_address are all required to properly implement
mutexes and other thread synchronization logic. Given that a ton of
datastructures are protected by mutexes (such as stdio and such), let's
just whitelist this by default, so that things can just work.
restart_syscall is used to implement EAGAIN SA_RESTART stuff in some
archs, and synthesized by the kernel without any explicit user logic,
hence let's make this work out of the box.
2017-09-13 19:27:51 +02:00
|
|
|
"futex\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"futex_time64\0"
|
seccomp: update "@default" seccomp group a bit
Let's add more of the most basic operations to "@default" as absolute
baseline needed by glibc and such to operate. Specifically:
futex, get_robust_list, get_thread_area, membarrier, set_robust_list,
set_thread_area, set_tid_address are all required to properly implement
mutexes and other thread synchronization logic. Given that a ton of
datastructures are protected by mutexes (such as stdio and such), let's
just whitelist this by default, so that things can just work.
restart_syscall is used to implement EAGAIN SA_RESTART stuff in some
archs, and synthesized by the kernel without any explicit user logic,
hence let's make this work out of the box.
2017-09-13 19:27:51 +02:00
|
|
|
"get_robust_list\0"
|
|
|
|
"get_thread_area\0"
|
2017-10-03 07:20:05 +02:00
|
|
|
"getegid\0"
|
|
|
|
"getegid32\0"
|
|
|
|
"geteuid\0"
|
|
|
|
"geteuid32\0"
|
|
|
|
"getgid\0"
|
|
|
|
"getgid32\0"
|
|
|
|
"getgroups\0"
|
|
|
|
"getgroups32\0"
|
|
|
|
"getpgid\0"
|
|
|
|
"getpgrp\0"
|
|
|
|
"getpid\0"
|
|
|
|
"getppid\0"
|
|
|
|
"getresgid\0"
|
|
|
|
"getresgid32\0"
|
|
|
|
"getresuid\0"
|
|
|
|
"getresuid32\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"getrlimit\0" /* make sure processes can query stack size and such */
|
2017-10-03 07:20:05 +02:00
|
|
|
"getsid\0"
|
|
|
|
"gettid\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"gettimeofday\0"
|
2017-10-03 07:20:05 +02:00
|
|
|
"getuid\0"
|
|
|
|
"getuid32\0"
|
seccomp: update "@default" seccomp group a bit
Let's add more of the most basic operations to "@default" as absolute
baseline needed by glibc and such to operate. Specifically:
futex, get_robust_list, get_thread_area, membarrier, set_robust_list,
set_thread_area, set_tid_address are all required to properly implement
mutexes and other thread synchronization logic. Given that a ton of
datastructures are protected by mutexes (such as stdio and such), let's
just whitelist this by default, so that things can just work.
restart_syscall is used to implement EAGAIN SA_RESTART stuff in some
archs, and synthesized by the kernel without any explicit user logic,
hence let's make this work out of the box.
2017-09-13 19:27:51 +02:00
|
|
|
"membarrier\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"nanosleep\0"
|
|
|
|
"pause\0"
|
2017-09-30 14:08:26 +02:00
|
|
|
"prlimit64\0"
|
seccomp: update "@default" seccomp group a bit
Let's add more of the most basic operations to "@default" as absolute
baseline needed by glibc and such to operate. Specifically:
futex, get_robust_list, get_thread_area, membarrier, set_robust_list,
set_thread_area, set_tid_address are all required to properly implement
mutexes and other thread synchronization logic. Given that a ton of
datastructures are protected by mutexes (such as stdio and such), let's
just whitelist this by default, so that things can just work.
restart_syscall is used to implement EAGAIN SA_RESTART stuff in some
archs, and synthesized by the kernel without any explicit user logic,
hence let's make this work out of the box.
2017-09-13 19:27:51 +02:00
|
|
|
"restart_syscall\0"
|
2019-03-28 10:01:09 +01:00
|
|
|
"rseq\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"rt_sigreturn\0"
|
2017-10-04 11:41:42 +02:00
|
|
|
"sched_yield\0"
|
seccomp: update "@default" seccomp group a bit
Let's add more of the most basic operations to "@default" as absolute
baseline needed by glibc and such to operate. Specifically:
futex, get_robust_list, get_thread_area, membarrier, set_robust_list,
set_thread_area, set_tid_address are all required to properly implement
mutexes and other thread synchronization logic. Given that a ton of
datastructures are protected by mutexes (such as stdio and such), let's
just whitelist this by default, so that things can just work.
restart_syscall is used to implement EAGAIN SA_RESTART stuff in some
archs, and synthesized by the kernel without any explicit user logic,
hence let's make this work out of the box.
2017-09-13 19:27:51 +02:00
|
|
|
"set_robust_list\0"
|
|
|
|
"set_thread_area\0"
|
|
|
|
"set_tid_address\0"
|
2017-11-12 16:34:43 +01:00
|
|
|
"set_tls\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"sigreturn\0"
|
|
|
|
"time\0"
|
2017-09-30 14:08:26 +02:00
|
|
|
"ugetrlimit\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
},
|
2017-09-30 14:34:50 +02:00
|
|
|
[SYSCALL_FILTER_SET_AIO] = {
|
|
|
|
.name = "@aio",
|
|
|
|
.help = "Asynchronous IO",
|
|
|
|
.value =
|
|
|
|
"io_cancel\0"
|
|
|
|
"io_destroy\0"
|
|
|
|
"io_getevents\0"
|
2018-11-14 19:53:49 +01:00
|
|
|
"io_pgetevents\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"io_pgetevents_time64\0"
|
2017-09-30 14:34:50 +02:00
|
|
|
"io_setup\0"
|
|
|
|
"io_submit\0"
|
2019-10-30 11:11:05 +01:00
|
|
|
"io_uring_enter\0"
|
|
|
|
"io_uring_register\0"
|
|
|
|
"io_uring_setup\0"
|
2017-09-30 14:34:50 +02:00
|
|
|
},
|
2016-11-02 15:46:18 +01:00
|
|
|
[SYSCALL_FILTER_SET_BASIC_IO] = {
|
|
|
|
.name = "@basic-io",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Basic IO",
|
2016-11-02 15:46:18 +01:00
|
|
|
.value =
|
2017-09-13 19:31:43 +02:00
|
|
|
"_llseek\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"close\0"
|
2017-09-13 19:31:43 +02:00
|
|
|
"dup\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"dup2\0"
|
|
|
|
"dup3\0"
|
|
|
|
"lseek\0"
|
|
|
|
"pread64\0"
|
|
|
|
"preadv\0"
|
2017-09-30 14:34:50 +02:00
|
|
|
"preadv2\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"pwrite64\0"
|
|
|
|
"pwritev\0"
|
2017-09-30 14:34:50 +02:00
|
|
|
"pwritev2\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"read\0"
|
|
|
|
"readv\0"
|
|
|
|
"write\0"
|
|
|
|
"writev\0"
|
|
|
|
},
|
2017-09-30 14:34:50 +02:00
|
|
|
[SYSCALL_FILTER_SET_CHOWN] = {
|
|
|
|
.name = "@chown",
|
|
|
|
.help = "Change ownership of files and directories",
|
|
|
|
.value =
|
|
|
|
"chown\0"
|
|
|
|
"chown32\0"
|
|
|
|
"fchown\0"
|
|
|
|
"fchown32\0"
|
|
|
|
"fchownat\0"
|
|
|
|
"lchown\0"
|
|
|
|
"lchown32\0"
|
|
|
|
},
|
2016-10-21 21:50:05 +02:00
|
|
|
[SYSCALL_FILTER_SET_CLOCK] = {
|
|
|
|
.name = "@clock",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Change the system time",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"adjtimex\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"clock_adjtime\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"clock_adjtime64\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"clock_settime\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"clock_settime64\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"settimeofday\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"stime\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_CPU_EMULATION] = {
|
|
|
|
.name = "@cpu-emulation",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "System calls for CPU emulation functionality",
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
.value =
|
|
|
|
"modify_ldt\0"
|
|
|
|
"subpage_prot\0"
|
|
|
|
"switch_endian\0"
|
|
|
|
"vm86\0"
|
|
|
|
"vm86old\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_DEBUG] = {
|
|
|
|
.name = "@debug",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Debugging, performance monitoring and tracing functionality",
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
.value =
|
|
|
|
"lookup_dcookie\0"
|
|
|
|
"perf_event_open\0"
|
2020-05-10 09:19:29 +02:00
|
|
|
"pidfd_getfd\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"ptrace\0"
|
|
|
|
"rtas\0"
|
2020-08-18 16:10:47 +02:00
|
|
|
#if defined __s390__ || defined __s390x__
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"s390_runtime_instr\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#endif
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"sys_debug_setcontext\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
2016-11-22 01:29:12 +01:00
|
|
|
[SYSCALL_FILTER_SET_FILE_SYSTEM] = {
|
|
|
|
.name = "@file-system",
|
|
|
|
.help = "File system operations",
|
|
|
|
.value =
|
|
|
|
"access\0"
|
|
|
|
"chdir\0"
|
|
|
|
"chmod\0"
|
|
|
|
"close\0"
|
|
|
|
"creat\0"
|
|
|
|
"faccessat\0"
|
2020-08-15 18:12:02 +02:00
|
|
|
"faccessat2\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"fallocate\0"
|
|
|
|
"fchdir\0"
|
|
|
|
"fchmod\0"
|
|
|
|
"fchmodat\0"
|
|
|
|
"fcntl\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"fcntl64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"fgetxattr\0"
|
|
|
|
"flistxattr\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"fremovexattr\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"fsetxattr\0"
|
|
|
|
"fstat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"fstat64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"fstatat64\0"
|
|
|
|
"fstatfs\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"fstatfs64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"ftruncate\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"ftruncate64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"futimesat\0"
|
|
|
|
"getcwd\0"
|
|
|
|
"getdents\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"getdents64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"getxattr\0"
|
|
|
|
"inotify_add_watch\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"inotify_init\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"inotify_init1\0"
|
|
|
|
"inotify_rm_watch\0"
|
|
|
|
"lgetxattr\0"
|
|
|
|
"link\0"
|
|
|
|
"linkat\0"
|
|
|
|
"listxattr\0"
|
|
|
|
"llistxattr\0"
|
|
|
|
"lremovexattr\0"
|
|
|
|
"lsetxattr\0"
|
|
|
|
"lstat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"lstat64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"mkdir\0"
|
|
|
|
"mkdirat\0"
|
|
|
|
"mknod\0"
|
|
|
|
"mknodat\0"
|
|
|
|
"mmap\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"mmap2\0"
|
2017-02-10 03:29:33 +01:00
|
|
|
"munmap\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"newfstatat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"oldfstat\0"
|
|
|
|
"oldlstat\0"
|
|
|
|
"oldstat\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"open\0"
|
|
|
|
"openat\0"
|
2020-05-10 09:19:29 +02:00
|
|
|
"openat2\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"readlink\0"
|
|
|
|
"readlinkat\0"
|
|
|
|
"removexattr\0"
|
|
|
|
"rename\0"
|
|
|
|
"renameat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"renameat2\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"rmdir\0"
|
|
|
|
"setxattr\0"
|
|
|
|
"stat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"stat64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"statfs\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"statfs64\0"
|
2017-09-04 15:35:35 +02:00
|
|
|
"statx\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"symlink\0"
|
|
|
|
"symlinkat\0"
|
|
|
|
"truncate\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"truncate64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"unlink\0"
|
|
|
|
"unlinkat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"utime\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"utimensat\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"utimensat_time64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"utimes\0"
|
|
|
|
},
|
2016-10-21 21:50:05 +02:00
|
|
|
[SYSCALL_FILTER_SET_IO_EVENT] = {
|
|
|
|
.name = "@io-event",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Event loop system calls",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"_newselect\0"
|
|
|
|
"epoll_create\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"epoll_create1\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"epoll_ctl\0"
|
|
|
|
"epoll_ctl_old\0"
|
|
|
|
"epoll_pwait\0"
|
|
|
|
"epoll_wait\0"
|
|
|
|
"epoll_wait_old\0"
|
|
|
|
"eventfd\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"eventfd2\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"poll\0"
|
|
|
|
"ppoll\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"ppoll_time64\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"pselect6\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"pselect6_time64\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"select\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_IPC] = {
|
|
|
|
.name = "@ipc",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "SysV IPC, POSIX Message Queues or other IPC",
|
|
|
|
.value =
|
|
|
|
"ipc\0"
|
2016-10-25 15:43:31 +02:00
|
|
|
"memfd_create\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"mq_getsetattr\0"
|
|
|
|
"mq_notify\0"
|
|
|
|
"mq_open\0"
|
|
|
|
"mq_timedreceive\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"mq_timedreceive_time64\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"mq_timedsend\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"mq_timedsend_time64\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"mq_unlink\0"
|
|
|
|
"msgctl\0"
|
|
|
|
"msgget\0"
|
|
|
|
"msgrcv\0"
|
|
|
|
"msgsnd\0"
|
2016-10-25 15:43:31 +02:00
|
|
|
"pipe\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"pipe2\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"process_vm_readv\0"
|
|
|
|
"process_vm_writev\0"
|
|
|
|
"semctl\0"
|
|
|
|
"semget\0"
|
|
|
|
"semop\0"
|
|
|
|
"semtimedop\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"semtimedop_time64\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"shmat\0"
|
|
|
|
"shmctl\0"
|
|
|
|
"shmdt\0"
|
|
|
|
"shmget\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_KEYRING] = {
|
|
|
|
.name = "@keyring",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Kernel keyring access",
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
.value =
|
|
|
|
"add_key\0"
|
|
|
|
"keyctl\0"
|
|
|
|
"request_key\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
2017-09-13 19:55:16 +02:00
|
|
|
[SYSCALL_FILTER_SET_MEMLOCK] = {
|
|
|
|
.name = "@memlock",
|
|
|
|
.help = "Memory locking control",
|
|
|
|
.value =
|
|
|
|
"mlock\0"
|
|
|
|
"mlock2\0"
|
|
|
|
"mlockall\0"
|
|
|
|
"munlock\0"
|
|
|
|
"munlockall\0"
|
|
|
|
},
|
2016-10-21 21:50:05 +02:00
|
|
|
[SYSCALL_FILTER_SET_MODULE] = {
|
|
|
|
.name = "@module",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Loading and unloading of kernel modules",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"delete_module\0"
|
|
|
|
"finit_module\0"
|
|
|
|
"init_module\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_MOUNT] = {
|
|
|
|
.name = "@mount",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Mounting and unmounting of file systems",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"chroot\0"
|
2019-10-30 11:11:05 +01:00
|
|
|
"fsconfig\0"
|
|
|
|
"fsmount\0"
|
|
|
|
"fsopen\0"
|
|
|
|
"fspick\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"mount\0"
|
2019-10-30 11:11:05 +01:00
|
|
|
"move_mount\0"
|
|
|
|
"open_tree\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"pivot_root\0"
|
|
|
|
"umount\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"umount2\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_NETWORK_IO] = {
|
|
|
|
.name = "@network-io",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Network or Unix socket IO, should not be needed if not network facing",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"accept\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"accept4\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"bind\0"
|
|
|
|
"connect\0"
|
|
|
|
"getpeername\0"
|
|
|
|
"getsockname\0"
|
|
|
|
"getsockopt\0"
|
|
|
|
"listen\0"
|
|
|
|
"recv\0"
|
|
|
|
"recvfrom\0"
|
|
|
|
"recvmmsg\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"recvmmsg_time64\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"recvmsg\0"
|
|
|
|
"send\0"
|
|
|
|
"sendmmsg\0"
|
|
|
|
"sendmsg\0"
|
|
|
|
"sendto\0"
|
|
|
|
"setsockopt\0"
|
|
|
|
"shutdown\0"
|
|
|
|
"socket\0"
|
|
|
|
"socketcall\0"
|
|
|
|
"socketpair\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_OBSOLETE] = {
|
2016-11-02 17:24:34 +01:00
|
|
|
/* some unknown even to libseccomp */
|
2016-10-21 21:50:05 +02:00
|
|
|
.name = "@obsolete",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Unusual, obsolete or unimplemented system calls",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"_sysctl\0"
|
|
|
|
"afs_syscall\0"
|
2016-12-27 14:28:19 +01:00
|
|
|
"bdflush\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"break\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"create_module\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"ftime\0"
|
|
|
|
"get_kernel_syms\0"
|
|
|
|
"getpmsg\0"
|
|
|
|
"gtty\0"
|
2017-09-13 19:39:54 +02:00
|
|
|
"idle\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"lock\0"
|
|
|
|
"mpx\0"
|
|
|
|
"prof\0"
|
|
|
|
"profil\0"
|
|
|
|
"putpmsg\0"
|
|
|
|
"query_module\0"
|
|
|
|
"security\0"
|
|
|
|
"sgetmask\0"
|
|
|
|
"ssetmask\0"
|
|
|
|
"stty\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"sysfs\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"tuxcall\0"
|
|
|
|
"ulimit\0"
|
|
|
|
"uselib\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"ustat\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"vserver\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
2019-11-08 12:56:56 +01:00
|
|
|
[SYSCALL_FILTER_SET_PKEY] = {
|
|
|
|
.name = "@pkey",
|
|
|
|
.help = "System calls used for memory protection keys",
|
|
|
|
.value =
|
|
|
|
"pkey_alloc\0"
|
|
|
|
"pkey_free\0"
|
|
|
|
"pkey_mprotect\0"
|
|
|
|
},
|
2016-10-21 21:50:05 +02:00
|
|
|
[SYSCALL_FILTER_SET_PRIVILEGED] = {
|
|
|
|
.name = "@privileged",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "All system calls which need super-user capabilities",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
2017-09-30 14:34:50 +02:00
|
|
|
"@chown\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"@clock\0"
|
|
|
|
"@module\0"
|
|
|
|
"@raw-io\0"
|
2017-10-02 09:16:50 +02:00
|
|
|
"@reboot\0"
|
|
|
|
"@swap\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"_sysctl\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"acct\0"
|
|
|
|
"bpf\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"capset\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"chroot\0"
|
2018-11-14 19:53:49 +01:00
|
|
|
"fanotify_init\0"
|
2019-10-30 11:11:05 +01:00
|
|
|
"fanotify_mark\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"nfsservctl\0"
|
2018-11-14 19:53:49 +01:00
|
|
|
"open_by_handle_at\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"pivot_root\0"
|
|
|
|
"quotactl\0"
|
|
|
|
"setdomainname\0"
|
|
|
|
"setfsuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setfsuid32\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"setgroups\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setgroups32\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"sethostname\0"
|
|
|
|
"setresuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setresuid32\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"setreuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setreuid32\0"
|
2018-04-18 21:45:44 +02:00
|
|
|
"setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
|
2017-09-13 19:39:02 +02:00
|
|
|
"setuid32\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"vhangup\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_PROCESS] = {
|
|
|
|
.name = "@process",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Process control, execution, namespaceing operations",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"arch_prctl\0"
|
2017-10-03 07:20:05 +02:00
|
|
|
"capget\0" /* Able to query arbitrary processes */
|
2016-06-01 11:56:01 +02:00
|
|
|
"clone\0"
|
2019-10-30 11:11:05 +01:00
|
|
|
"clone3\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"execveat\0"
|
|
|
|
"fork\0"
|
2017-09-13 19:40:23 +02:00
|
|
|
"getrusage\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"kill\0"
|
2019-10-30 11:11:05 +01:00
|
|
|
"pidfd_open\0"
|
2019-05-28 15:06:49 +02:00
|
|
|
"pidfd_send_signal\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"prctl\0"
|
2017-09-13 19:40:23 +02:00
|
|
|
"rt_sigqueueinfo\0"
|
|
|
|
"rt_tgsigqueueinfo\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"setns\0"
|
2018-07-02 22:58:01 +02:00
|
|
|
"swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
|
2016-06-01 11:56:01 +02:00
|
|
|
"tgkill\0"
|
2017-09-13 19:40:23 +02:00
|
|
|
"times\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"tkill\0"
|
|
|
|
"unshare\0"
|
|
|
|
"vfork\0"
|
2017-09-13 19:40:23 +02:00
|
|
|
"wait4\0"
|
|
|
|
"waitid\0"
|
|
|
|
"waitpid\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_RAW_IO] = {
|
|
|
|
.name = "@raw-io",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Raw I/O port access",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"ioperm\0"
|
|
|
|
"iopl\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"pciconfig_iobase\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"pciconfig_read\0"
|
|
|
|
"pciconfig_write\0"
|
2020-08-18 16:10:47 +02:00
|
|
|
#if defined __s390__ || defined __s390x__
|
2016-06-01 11:56:01 +02:00
|
|
|
"s390_pci_mmio_read\0"
|
|
|
|
"s390_pci_mmio_write\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#endif
|
|
|
|
},
|
2016-12-27 14:26:55 +01:00
|
|
|
[SYSCALL_FILTER_SET_REBOOT] = {
|
|
|
|
.name = "@reboot",
|
|
|
|
.help = "Reboot and reboot preparation/kexec",
|
|
|
|
.value =
|
|
|
|
"kexec_file_load\0"
|
2017-10-04 21:09:52 +02:00
|
|
|
"kexec_load\0"
|
2016-12-27 14:26:55 +01:00
|
|
|
"reboot\0"
|
|
|
|
},
|
2016-11-02 15:46:18 +01:00
|
|
|
[SYSCALL_FILTER_SET_RESOURCES] = {
|
|
|
|
.name = "@resources",
|
2016-12-27 14:27:39 +01:00
|
|
|
.help = "Alter resource settings",
|
2016-11-02 15:46:18 +01:00
|
|
|
.value =
|
2017-09-13 19:44:11 +02:00
|
|
|
"ioprio_set\0"
|
|
|
|
"mbind\0"
|
|
|
|
"migrate_pages\0"
|
|
|
|
"move_pages\0"
|
|
|
|
"nice\0"
|
|
|
|
"sched_setaffinity\0"
|
|
|
|
"sched_setattr\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"sched_setparam\0"
|
|
|
|
"sched_setscheduler\0"
|
2017-09-13 19:44:11 +02:00
|
|
|
"set_mempolicy\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"setpriority\0"
|
|
|
|
"setrlimit\0"
|
|
|
|
},
|
2017-08-09 15:04:05 +02:00
|
|
|
[SYSCALL_FILTER_SET_SETUID] = {
|
|
|
|
.name = "@setuid",
|
|
|
|
.help = "Operations for changing user/group credentials",
|
|
|
|
.value =
|
|
|
|
"setgid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setgid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setgroups\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setgroups32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setregid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setregid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setresgid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setresgid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setresuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setresuid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setreuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setreuid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setuid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
},
|
2017-09-13 19:55:16 +02:00
|
|
|
[SYSCALL_FILTER_SET_SIGNAL] = {
|
|
|
|
.name = "@signal",
|
|
|
|
.help = "Process signal handling",
|
|
|
|
.value =
|
|
|
|
"rt_sigaction\0"
|
|
|
|
"rt_sigpending\0"
|
|
|
|
"rt_sigprocmask\0"
|
|
|
|
"rt_sigsuspend\0"
|
|
|
|
"rt_sigtimedwait\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"rt_sigtimedwait_time64\0"
|
2017-09-13 19:55:16 +02:00
|
|
|
"sigaction\0"
|
|
|
|
"sigaltstack\0"
|
|
|
|
"signal\0"
|
|
|
|
"signalfd\0"
|
|
|
|
"signalfd4\0"
|
|
|
|
"sigpending\0"
|
|
|
|
"sigprocmask\0"
|
|
|
|
"sigsuspend\0"
|
|
|
|
},
|
2016-12-27 14:26:55 +01:00
|
|
|
[SYSCALL_FILTER_SET_SWAP] = {
|
|
|
|
.name = "@swap",
|
|
|
|
.help = "Enable/disable swap devices",
|
|
|
|
.value =
|
|
|
|
"swapoff\0"
|
|
|
|
"swapon\0"
|
|
|
|
},
|
2017-09-30 14:34:50 +02:00
|
|
|
[SYSCALL_FILTER_SET_SYNC] = {
|
|
|
|
.name = "@sync",
|
|
|
|
.help = "Synchronize files and memory to storage",
|
|
|
|
.value =
|
|
|
|
"fdatasync\0"
|
|
|
|
"fsync\0"
|
|
|
|
"msync\0"
|
|
|
|
"sync\0"
|
|
|
|
"sync_file_range\0"
|
2019-08-19 08:51:39 +02:00
|
|
|
"sync_file_range2\0"
|
2017-09-30 14:34:50 +02:00
|
|
|
"syncfs\0"
|
|
|
|
},
|
2018-04-18 21:19:54 +02:00
|
|
|
[SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
|
|
|
|
.name = "@system-service",
|
|
|
|
.help = "General system service operations",
|
|
|
|
.value =
|
|
|
|
"@aio\0"
|
|
|
|
"@basic-io\0"
|
|
|
|
"@chown\0"
|
|
|
|
"@default\0"
|
|
|
|
"@file-system\0"
|
|
|
|
"@io-event\0"
|
|
|
|
"@ipc\0"
|
|
|
|
"@keyring\0"
|
|
|
|
"@memlock\0"
|
|
|
|
"@network-io\0"
|
|
|
|
"@process\0"
|
|
|
|
"@resources\0"
|
|
|
|
"@setuid\0"
|
|
|
|
"@signal\0"
|
|
|
|
"@sync\0"
|
|
|
|
"@timer\0"
|
|
|
|
"brk\0"
|
|
|
|
"capget\0"
|
|
|
|
"capset\0"
|
|
|
|
"copy_file_range\0"
|
|
|
|
"fadvise64\0"
|
|
|
|
"fadvise64_64\0"
|
|
|
|
"flock\0"
|
|
|
|
"get_mempolicy\0"
|
|
|
|
"getcpu\0"
|
|
|
|
"getpriority\0"
|
|
|
|
"getrandom\0"
|
|
|
|
"ioctl\0"
|
|
|
|
"ioprio_get\0"
|
|
|
|
"kcmp\0"
|
|
|
|
"madvise\0"
|
|
|
|
"mprotect\0"
|
|
|
|
"mremap\0"
|
|
|
|
"name_to_handle_at\0"
|
|
|
|
"oldolduname\0"
|
|
|
|
"olduname\0"
|
|
|
|
"personality\0"
|
|
|
|
"readahead\0"
|
|
|
|
"readdir\0"
|
|
|
|
"remap_file_pages\0"
|
|
|
|
"sched_get_priority_max\0"
|
|
|
|
"sched_get_priority_min\0"
|
|
|
|
"sched_getaffinity\0"
|
|
|
|
"sched_getattr\0"
|
|
|
|
"sched_getparam\0"
|
|
|
|
"sched_getscheduler\0"
|
|
|
|
"sched_rr_get_interval\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"sched_rr_get_interval_time64\0"
|
2018-04-18 21:19:54 +02:00
|
|
|
"sched_yield\0"
|
|
|
|
"sendfile\0"
|
|
|
|
"sendfile64\0"
|
|
|
|
"setfsgid\0"
|
|
|
|
"setfsgid32\0"
|
|
|
|
"setfsuid\0"
|
|
|
|
"setfsuid32\0"
|
|
|
|
"setpgid\0"
|
|
|
|
"setsid\0"
|
|
|
|
"splice\0"
|
|
|
|
"sysinfo\0"
|
|
|
|
"tee\0"
|
|
|
|
"umask\0"
|
|
|
|
"uname\0"
|
|
|
|
"userfaultfd\0"
|
|
|
|
"vmsplice\0"
|
|
|
|
},
|
2017-09-13 19:55:16 +02:00
|
|
|
[SYSCALL_FILTER_SET_TIMER] = {
|
|
|
|
.name = "@timer",
|
|
|
|
.help = "Schedule operations by time",
|
|
|
|
.value =
|
|
|
|
"alarm\0"
|
|
|
|
"getitimer\0"
|
|
|
|
"setitimer\0"
|
|
|
|
"timer_create\0"
|
|
|
|
"timer_delete\0"
|
|
|
|
"timer_getoverrun\0"
|
|
|
|
"timer_gettime\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"timer_gettime64\0"
|
2017-09-13 19:55:16 +02:00
|
|
|
"timer_settime\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"timer_settime64\0"
|
2017-09-13 19:55:16 +02:00
|
|
|
"timerfd_create\0"
|
|
|
|
"timerfd_gettime\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"timerfd_gettime64\0"
|
2017-09-13 19:55:16 +02:00
|
|
|
"timerfd_settime\0"
|
2019-11-08 14:00:10 +01:00
|
|
|
"timerfd_settime64\0"
|
2017-09-13 19:55:16 +02:00
|
|
|
"times\0"
|
|
|
|
},
|
2020-08-19 17:43:23 +02:00
|
|
|
[SYSCALL_FILTER_SET_KNOWN] = {
|
|
|
|
.name = "@known",
|
|
|
|
.help = "All known syscalls declared in the kernel",
|
|
|
|
.value =
|
|
|
|
#include "syscall-list.h"
|
|
|
|
},
|
2016-06-01 11:56:01 +02:00
|
|
|
};
|
2016-10-21 21:50:05 +02:00
|
|
|
|
|
|
|
const SyscallFilterSet *syscall_filter_set_find(const char *name) {
|
|
|
|
if (isempty(name) || name[0] != '@')
|
|
|
|
return NULL;
|
|
|
|
|
2020-08-19 17:43:40 +02:00
|
|
|
for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
|
2016-10-21 21:50:05 +02:00
|
|
|
if (streq(syscall_filter_sets[i].name, name))
|
|
|
|
return syscall_filter_sets + i;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2020-08-21 17:21:04 +02:00
|
|
|
static int add_syscall_filter_set(
|
|
|
|
scmp_filter_ctx seccomp,
|
|
|
|
const SyscallFilterSet *set,
|
|
|
|
uint32_t action,
|
|
|
|
char **exclude,
|
|
|
|
bool log_missing,
|
|
|
|
char ***added);
|
|
|
|
|
|
|
|
int seccomp_add_syscall_filter_item(
|
|
|
|
scmp_filter_ctx *seccomp,
|
|
|
|
const char *name,
|
|
|
|
uint32_t action,
|
|
|
|
char **exclude,
|
|
|
|
bool log_missing,
|
|
|
|
char ***added) {
|
2017-09-10 19:10:29 +02:00
|
|
|
|
|
|
|
assert(seccomp);
|
|
|
|
assert(name);
|
|
|
|
|
2017-09-11 17:45:21 +02:00
|
|
|
if (strv_contains(exclude, name))
|
|
|
|
return 0;
|
|
|
|
|
2020-08-21 17:21:04 +02:00
|
|
|
/* Any syscalls that are handled are added to the *added strv. The pointer
|
|
|
|
* must be either NULL or point to a valid pre-initialized possibly-empty strv. */
|
|
|
|
|
2017-09-10 19:10:29 +02:00
|
|
|
if (name[0] == '@') {
|
|
|
|
const SyscallFilterSet *other;
|
|
|
|
|
|
|
|
other = syscall_filter_set_find(name);
|
2018-11-20 23:40:44 +01:00
|
|
|
if (!other)
|
|
|
|
return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
|
|
|
|
"Filter set %s is not known!",
|
|
|
|
name);
|
2017-09-10 19:10:29 +02:00
|
|
|
|
2020-08-21 17:21:04 +02:00
|
|
|
return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
|
2018-09-24 16:59:12 +02:00
|
|
|
|
2017-09-10 19:10:29 +02:00
|
|
|
} else {
|
2018-09-24 16:59:12 +02:00
|
|
|
int id, r;
|
2017-09-10 19:10:29 +02:00
|
|
|
|
|
|
|
id = seccomp_syscall_resolve_name(name);
|
2017-09-13 19:57:32 +02:00
|
|
|
if (id == __NR_SCMP_ERROR) {
|
2018-09-24 16:59:12 +02:00
|
|
|
if (log_missing)
|
|
|
|
log_debug("System call %s is not known, ignoring.", name);
|
2017-10-05 11:23:07 +02:00
|
|
|
return 0;
|
2017-09-13 19:57:32 +02:00
|
|
|
}
|
2017-09-10 19:10:29 +02:00
|
|
|
|
|
|
|
r = seccomp_rule_add_exact(seccomp, action, id, 0);
|
2018-09-24 16:59:12 +02:00
|
|
|
if (r < 0) {
|
2017-09-10 19:10:29 +02:00
|
|
|
/* If the system call is not known on this architecture, then that's fine, let's ignore it */
|
seccomp: tighten checking of seccomp filter creation
In seccomp code, the code is changed to propagate errors which are about
anything other than unknown/unimplemented syscalls. I *think* such errors
should not happen in normal usage, but so far we would summarilly ignore all
errors, so that part is uncertain. If it turns out that other errors occur and
should be ignored, this should be added later.
In nspawn, we would count the number of added filters, but didn't use this for
anything. Drop that part.
The comments suggested that seccomp_add_syscall_filter_item() returned negative
if the syscall is unknown, but this wasn't true: it returns 0.
The error at this point can only be if the syscall was known but couldn't be
added. If the error comes from our internal whitelist in nspawn, treat this as
error, because it means that our internal table is wrong. If the error comes
from user arguments, warn and ignore. (If some syscall is not known at current
architecture, it is still silently ignored.)
2018-09-20 14:19:41 +02:00
|
|
|
bool ignore = r == -EDOM;
|
|
|
|
|
|
|
|
if (!ignore || log_missing)
|
|
|
|
log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
|
|
|
|
name, id, ignore ? ", ignoring" : "");
|
|
|
|
if (!ignore)
|
|
|
|
return r;
|
2018-09-24 16:59:12 +02:00
|
|
|
}
|
2017-09-10 19:10:29 +02:00
|
|
|
|
2020-08-21 17:21:04 +02:00
|
|
|
if (added) {
|
|
|
|
r = strv_extend(added, name);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2018-09-24 16:59:12 +02:00
|
|
|
return 0;
|
|
|
|
}
|
2017-09-10 19:10:29 +02:00
|
|
|
}
|
|
|
|
|
2020-08-21 17:21:04 +02:00
|
|
|
static int add_syscall_filter_set(
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
scmp_filter_ctx seccomp,
|
|
|
|
const SyscallFilterSet *set,
|
2017-09-11 17:45:21 +02:00
|
|
|
uint32_t action,
|
2018-09-24 16:59:12 +02:00
|
|
|
char **exclude,
|
2020-08-21 17:21:04 +02:00
|
|
|
bool log_missing,
|
|
|
|
char ***added) {
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
|
2016-10-21 21:50:05 +02:00
|
|
|
const char *sys;
|
|
|
|
int r;
|
|
|
|
|
2020-08-21 17:21:04 +02:00
|
|
|
/* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
|
|
|
|
|
2016-10-21 21:50:05 +02:00
|
|
|
assert(seccomp);
|
|
|
|
assert(set);
|
|
|
|
|
|
|
|
NULSTR_FOREACH(sys, set->value) {
|
2020-08-21 17:21:04 +02:00
|
|
|
r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
|
2017-09-10 19:10:29 +02:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-09-24 16:59:12 +02:00
|
|
|
int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
uint32_t arch;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(set);
|
|
|
|
|
|
|
|
/* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
|
2018-11-10 07:43:57 +01:00
|
|
|
* each local arch. */
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
|
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
|
|
|
|
|
|
|
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
|
|
|
|
|
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, default_action);
|
2016-10-21 21:50:05 +02:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
|
2020-08-21 17:21:04 +02:00
|
|
|
r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
|
seccomp: tighten checking of seccomp filter creation
In seccomp code, the code is changed to propagate errors which are about
anything other than unknown/unimplemented syscalls. I *think* such errors
should not happen in normal usage, but so far we would summarily ignore all
errors, so that part is uncertain. If it turns out that other errors occur and
should be ignored, this should be added later.
In nspawn, we would count the number of added filters, but didn't use this for
anything. Drop that part.
The comments suggested that seccomp_add_syscall_filter_item() returned negative
if the syscall is unknown, but this wasn't true: it returns 0.
The error at this point can only be if the syscall was known but couldn't be
added. If the error comes from our internal whitelist in nspawn, treat this as
error, because it means that our internal table is wrong. If the error comes
from user arguments, warn and ignore. (If some syscall is not known at current
architecture, it is still silently ignored.)
2018-09-20 14:19:41 +02:00
|
|
|
if (r < 0)
|
|
|
|
return log_debug_errno(r, "Failed to add filter set: %m");
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
2019-04-11 01:08:41 +02:00
|
|
|
if (ERRNO_IS_SECCOMP_FATAL(r))
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
2016-10-21 21:50:05 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2016-10-21 21:18:46 +02:00
|
|
|
|
2018-09-24 16:59:12 +02:00
|
|
|
int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
        uint32_t arch;
        int r;

        /* Counterpart of seccomp_load_syscall_filter_set() that is driven by a raw Hashmap* of system
         * calls rather than by a SyscallFilterSet* table. One filter is built and loaded per local
         * architecture.
         *
         * Each key of the map decodes (via PTR_TO_INT()) to the syscall number plus one, each value to an
         * integer: if that integer is >= 0 and 'action' is not SCMP_ACT_ALLOW, the rule for this syscall
         * uses SCMP_ACT_ERRNO(value) instead of 'action'.
         *
         * Returns 0 on success (or if there is nothing to do), and a negative errno-style value if
         * initializing a filter fails, if adding a rule fails with anything but -EDOM, or if loading
         * fails with an error ERRNO_IS_SECCOMP_FATAL() matches. Other load failures are only logged and
         * the architecture is skipped. */

        /* An empty list on top of an allow-everything default changes nothing, don't bother. */
        if (default_action == SCMP_ACT_ALLOW && hashmap_isempty(set))
                return 0;

        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
                void *key, *value;
                Iterator it;

                log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));

                r = seccomp_init_for_arch(&seccomp, arch, default_action);
                if (r < 0)
                        return r;

                HASHMAP_FOREACH_KEY(value, key, set, it) {
                        _cleanup_free_ char *syscall_name = NULL;
                        int nr = PTR_TO_INT(key) - 1;
                        int errno_value = PTR_TO_INT(value);
                        uint32_t rule_action;
                        bool unknown;

                        /* A per-syscall error number only applies when we are not allow-listing */
                        rule_action = (action == SCMP_ACT_ALLOW || errno_value < 0) ? action : SCMP_ACT_ERRNO(errno_value);

                        r = seccomp_rule_add_exact(seccomp, rule_action, nr, 0);
                        if (r >= 0)
                                continue;

                        /* -EDOM is taken to mean the system call is not known on this architecture; that's
                         * fine, such entries are skipped. Everything else is propagated to the caller. */
                        syscall_name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, nr);
                        unknown = r == -EDOM;

                        if (log_missing || !unknown)
                                log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
                                                strna(syscall_name), nr, unknown ? ", ignoring" : "");

                        if (!unknown)
                                return r;
                }

                r = seccomp_load(seccomp);
                if (ERRNO_IS_SECCOMP_FATAL(r))
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
        }

        return 0;
}
|
|
|
|
|
2019-04-03 09:17:42 +02:00
|
|
|
int seccomp_parse_syscall_filter(
|
2017-12-23 10:45:32 +01:00
|
|
|
const char *name,
|
|
|
|
int errno_num,
|
|
|
|
Hashmap *filter,
|
2018-02-26 12:51:35 +01:00
|
|
|
SeccompParseFlags flags,
|
2017-12-23 10:45:32 +01:00
|
|
|
const char *unit,
|
|
|
|
const char *filename,
|
|
|
|
unsigned line) {
|
|
|
|
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(name);
|
|
|
|
assert(filter);
|
|
|
|
|
|
|
|
if (name[0] == '@') {
|
|
|
|
const SyscallFilterSet *set;
|
|
|
|
const char *i;
|
|
|
|
|
|
|
|
set = syscall_filter_set_find(name);
|
|
|
|
if (!set) {
|
2018-02-26 12:51:35 +01:00
|
|
|
if (!(flags & SECCOMP_PARSE_PERMISSIVE))
|
2017-12-23 10:45:32 +01:00
|
|
|
return -EINVAL;
|
2018-02-26 12:51:35 +01:00
|
|
|
|
|
|
|
log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
|
|
|
|
"Unknown system call group, ignoring: %s", name);
|
|
|
|
return 0;
|
2017-12-23 10:45:32 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
NULSTR_FOREACH(i, set->value) {
|
2018-02-26 12:51:35 +01:00
|
|
|
/* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
|
|
|
|
* away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
|
|
|
|
* not a problem in user configuration data and we shouldn't pretend otherwise by complaining
|
|
|
|
* about them. */
|
2019-04-03 09:17:42 +02:00
|
|
|
r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
|
2017-12-23 10:45:32 +01:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
int id;
|
|
|
|
|
|
|
|
id = seccomp_syscall_resolve_name(name);
|
|
|
|
if (id == __NR_SCMP_ERROR) {
|
2018-02-26 12:51:35 +01:00
|
|
|
if (!(flags & SECCOMP_PARSE_PERMISSIVE))
|
2017-12-23 10:45:32 +01:00
|
|
|
return -EINVAL;
|
2018-02-26 12:51:35 +01:00
|
|
|
|
|
|
|
log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
|
|
|
|
"Failed to parse system call, ignoring: %s", name);
|
|
|
|
return 0;
|
2017-12-23 10:45:32 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/* If we previously wanted to forbid a syscall and now
|
|
|
|
* we want to allow it, then remove it from the list. */
|
2020-06-23 08:31:16 +02:00
|
|
|
if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
|
2017-12-23 10:45:32 +01:00
|
|
|
r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
|
|
|
|
if (r < 0)
|
2018-08-26 00:27:29 +02:00
|
|
|
switch (r) {
|
|
|
|
case -ENOMEM:
|
|
|
|
return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
|
|
|
|
case -EEXIST:
|
2018-08-29 21:35:38 +02:00
|
|
|
assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
|
|
|
|
break;
|
2018-08-26 00:27:29 +02:00
|
|
|
default:
|
|
|
|
return r;
|
|
|
|
}
|
2017-12-23 10:45:32 +01:00
|
|
|
} else
|
|
|
|
(void) hashmap_remove(filter, INT_TO_PTR(id + 1));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-11-02 03:25:19 +01:00
|
|
|
int seccomp_restrict_namespaces(unsigned long retain) {
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
uint32_t arch;
|
2016-11-02 03:25:19 +01:00
|
|
|
int r;
|
|
|
|
|
2017-12-15 11:09:00 +01:00
|
|
|
if (DEBUG_LOGGING) {
|
2016-11-02 03:25:19 +01:00
|
|
|
_cleanup_free_ char *s = NULL;
|
|
|
|
|
2018-05-01 03:48:21 +02:00
|
|
|
(void) namespace_flags_to_string(retain, &s);
|
2016-11-02 03:25:19 +01:00
|
|
|
log_debug("Restricting namespace to: %s.", strna(s));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* NOOP? */
|
|
|
|
if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
|
|
|
|
return 0;
|
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
2016-11-02 03:25:19 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
|
|
|
|
|
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
if ((retain & NAMESPACE_FLAGS_ALL) == 0)
|
|
|
|
/* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
|
|
|
|
* altogether. */
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(setns),
|
|
|
|
0);
|
|
|
|
else
|
|
|
|
/* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
|
|
|
|
* special invocation with a zero flags argument, right here. */
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(setns),
|
|
|
|
1,
|
|
|
|
SCMP_A1(SCMP_CMP_EQ, 0));
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-08-19 17:43:40 +02:00
|
|
|
for (unsigned i = 0; namespace_flag_map[i].name; i++) {
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
unsigned long f;
|
|
|
|
|
|
|
|
f = namespace_flag_map[i].flag;
|
|
|
|
if ((retain & f) == f) {
|
|
|
|
log_debug("Permitting %s.", namespace_flag_map[i].name);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
log_debug("Blocking %s.", namespace_flag_map[i].name);
|
|
|
|
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(unshare),
|
|
|
|
1,
|
|
|
|
SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2017-05-05 05:10:30 +02:00
|
|
|
/* On s390/s390x the first two parameters to clone are switched */
|
|
|
|
if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
|
2017-02-08 16:21:11 +01:00
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(clone),
|
|
|
|
1,
|
|
|
|
SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
|
|
|
|
else
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(clone),
|
|
|
|
1,
|
|
|
|
SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(setns),
|
|
|
|
1,
|
|
|
|
SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (r < 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
2019-04-11 01:08:41 +02:00
|
|
|
if (ERRNO_IS_SECCOMP_FATAL(r))
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int seccomp_protect_sysctl(void) {
|
|
|
|
uint32_t arch;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
|
|
|
|
|
|
|
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
|
|
|
|
|
2020-08-19 22:44:15 +02:00
|
|
|
if (IN_SET(arch,
|
|
|
|
SCMP_ARCH_AARCH64,
|
|
|
|
#ifdef SCMP_ARCH_RISCV64
|
|
|
|
SCMP_ARCH_RISCV64,
|
|
|
|
#endif
|
|
|
|
SCMP_ARCH_X32
|
|
|
|
))
|
2017-07-15 21:28:02 +02:00
|
|
|
/* No _sysctl syscall */
|
|
|
|
continue;
|
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
r = seccomp_rule_add_exact(
|
2016-11-02 03:25:19 +01:00
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_SYS(_sysctl),
|
2016-11-02 03:25:19 +01:00
|
|
|
0);
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
2019-04-11 01:08:41 +02:00
|
|
|
if (ERRNO_IS_SECCOMP_FATAL(r))
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-11-05 02:17:01 +01:00
|
|
|
int seccomp_protect_syslog(void) {
|
|
|
|
uint32_t arch;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
|
|
|
|
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(syslog),
|
|
|
|
0);
|
|
|
|
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
|
|
|
if (ERRNO_IS_SECCOMP_FATAL(r))
|
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-06-23 08:31:16 +02:00
|
|
|
/* Installs one seccomp filter per local architecture that restricts the first argument (the address family)
 * of the socket() system call. Rejected calls fail with EAFNOSUPPORT.
 *
 * address_families: set whose entries are address family numbers stored as pointers (read back with
 *                   PTR_TO_INT()).
 * allow_list:       if true, every address family that is NOT in the set is blocked; if false, exactly the
 *                   address families in the set are blocked.
 *
 * Only the architectures enumerated in the switch statement below get a filter; all others are skipped.
 * If adding a rule fails, or loading the filter fails in a way ERRNO_IS_SECCOMP_FATAL() does not consider
 * fatal, the problem is logged at debug level and the next architecture is tried.
 *
 * Returns 0 once all architectures have been processed, the error returned by seccomp_init_for_arch() if the
 * filter context cannot be set up, or the seccomp_load() result if it is considered fatal. */
int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
        uint32_t arch;
        int r;

        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
                bool supported = false;
                Iterator iter;

                log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));

                switch (arch) {

                case SCMP_ARCH_X86_64:
                case SCMP_ARCH_X32:
                case SCMP_ARCH_ARM:
                case SCMP_ARCH_AARCH64:
                case SCMP_ARCH_PPC:
                case SCMP_ARCH_PPC64:
                case SCMP_ARCH_PPC64LE:
                case SCMP_ARCH_MIPSEL64N32:
                case SCMP_ARCH_MIPS64N32:
                case SCMP_ARCH_MIPSEL64:
                case SCMP_ARCH_MIPS64:
#ifdef SCMP_ARCH_RISCV64
                case SCMP_ARCH_RISCV64:
#endif
                        /* Known to work, i.e. these do not go through socketcall() */
                        supported = true;
                        break;

                default:
                        /* Either known not to work because socketcall() is used (s390, s390x, x86, mipsel,
                         * mips), or an architecture we know nothing about. */
                        break;
                }

                if (!supported)
                        continue;

                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
                if (r < 0)
                        return r;

                /* From here on 'r' is threaded through all rule additions: as soon as one of them fails no
                 * further rule is added, and the failure is reported once, after the if/else below. */

                if (allow_list) {
                        int lowest = 0, highest = 0;
                        void *entry;

                        /* Allow list: block whatever lies outside of [lowest, highest] with two range
                         * rules, then block the individual families that are not in the set. Start by
                         * determining the smallest and largest valid address family in the set. */
                        SET_FOREACH(entry, address_families, iter) {
                                int af = PTR_TO_INT(entry);

                                /* Ignore entries outside of the valid range */
                                if (af <= 0 || af >= af_max())
                                        continue;

                                if (lowest == 0 || af < lowest)
                                        lowest = af;

                                if (highest == 0 || af > highest)
                                        highest = af;
                        }

                        /* Either both bounds were found, or neither */
                        assert((lowest == 0) == (highest == 0));

                        if (lowest == 0) {
                                /* Nothing usable in the set: refuse socket() altogether */
                                r = seccomp_rule_add_exact(
                                                seccomp,
                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                SCMP_SYS(socket),
                                                0);
                        } else {
                                /* Refuse everything below the smallest permitted family... */
                                r = seccomp_rule_add_exact(
                                                seccomp,
                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                SCMP_SYS(socket),
                                                1,
                                                SCMP_A0(SCMP_CMP_LT, lowest));

                                /* ...and everything above the largest permitted one... */
                                if (r >= 0)
                                        r = seccomp_rule_add_exact(
                                                        seccomp,
                                                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                        SCMP_SYS(socket),
                                                        1,
                                                        SCMP_A0(SCMP_CMP_GT, highest));

                                /* ...and finally each individual family that is not part of the set */
                                for (int af = 1; r >= 0 && af < af_max(); af++) {

                                        if (set_contains(address_families, INT_TO_PTR(af)))
                                                continue;

                                        r = seccomp_rule_add_exact(
                                                        seccomp,
                                                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                        SCMP_SYS(socket),
                                                        1,
                                                        SCMP_A0(SCMP_CMP_EQ, af));
                                }
                        }

                } else {
                        void *entry;

                        /* Deny list: one equality rule per listed address family; a socket() call is
                         * refused if any of them matches. */
                        SET_FOREACH(entry, address_families, iter) {
                                r = seccomp_rule_add_exact(
                                                seccomp,
                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                SCMP_SYS(socket),
                                                1,
                                                SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(entry)));
                                if (r < 0)
                                        break;
                        }
                }

                if (r < 0) {
                        log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
                        continue;
                }

                r = seccomp_load(seccomp);
                if (ERRNO_IS_SECCOMP_FATAL(r))
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
        }

        return 0;
}
|
|
|
|
|
|
|
|
int seccomp_restrict_realtime(void) {
|
|
|
|
static const int permitted_policies[] = {
|
|
|
|
SCHED_OTHER,
|
|
|
|
SCHED_BATCH,
|
|
|
|
SCHED_IDLE,
|
|
|
|
};
|
|
|
|
|
|
|
|
int r, max_policy = 0;
|
|
|
|
uint32_t arch;
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
/* Determine the highest policy constant we want to allow */
|
|
|
|
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
|
|
|
|
if (permitted_policies[i] > max_policy)
|
|
|
|
max_policy = permitted_policies[i];
|
|
|
|
|
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
|
|
|
int p;
|
|
|
|
|
|
|
|
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
|
|
|
|
|
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
/* Go through all policies with lower values than that, and block them -- unless they appear in the
|
2020-06-23 08:31:16 +02:00
|
|
|
* allow list. */
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
for (p = 0; p < max_policy; p++) {
|
|
|
|
bool good = false;
|
|
|
|
|
2020-06-23 08:31:16 +02:00
|
|
|
/* Check if this is in the allow list. */
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
|
|
|
|
if (permitted_policies[i] == p) {
|
|
|
|
good = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (good)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Deny this policy */
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(sched_setscheduler),
|
|
|
|
1,
|
|
|
|
SCMP_A1(SCMP_CMP_EQ, p));
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-23 08:31:16 +02:00
|
|
|
/* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
|
|
|
|
* are unsigned here, hence no need no check for < 0 values. */
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
r = seccomp_rule_add_exact(
|
2016-11-02 03:25:19 +01:00
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_SYS(sched_setscheduler),
|
2016-11-02 03:25:19 +01:00
|
|
|
1,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_A1(SCMP_CMP_GT, max_policy));
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
continue;
|
|
|
|
}
|
2016-11-02 03:25:19 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
r = seccomp_load(seccomp);
|
2019-04-11 01:08:41 +02:00
|
|
|
if (ERRNO_IS_SECCOMP_FATAL(r))
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-05-05 05:10:30 +02:00
|
|
|
/* Adds a rule to 'seccomp' that makes system call 'nr' fail with EPERM, using 'arg_cnt' argument comparisons
 * taken from 'arg'. If the rule cannot be added, a debug message naming the system call and architecture is
 * logged. Returns whatever seccomp_rule_add_exact() returned. */
static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
                                      uint32_t arch,
                                      int nr,
                                      unsigned arg_cnt,
                                      const struct scmp_arg_cmp arg) {
        _cleanup_free_ char *syscall_name = NULL;
        int r;

        r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
        if (r >= 0)
                return r;

        /* Resolve the number back into a name purely for the log message; strna() presumably substitutes a
         * placeholder if no name could be determined. */
        syscall_name = seccomp_syscall_resolve_num_arch(arch, nr);
        log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
                        strna(syscall_name),
                        seccomp_arch_to_string(arch));

        return r;
}
|
|
|
|
|
2017-05-05 05:10:30 +02:00
|
|
|
/* For known architectures, verify at compile time that these system calls are actually defined, i.e. resolve to a positive number. */
|
2020-08-19 22:44:15 +02:00
|
|
|
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
|
2017-05-05 05:10:30 +02:00
|
|
|
assert_cc(SCMP_SYS(shmget) > 0);
|
|
|
|
assert_cc(SCMP_SYS(shmat) > 0);
|
|
|
|
assert_cc(SCMP_SYS(shmdt) > 0);
|
|
|
|
#endif
|
2017-05-05 05:10:30 +02:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
int seccomp_memory_deny_write_execute(void) {
|
|
|
|
uint32_t arch;
|
2019-12-06 15:04:51 +01:00
|
|
|
unsigned loaded = 0;
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
|
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
2019-12-06 15:04:51 +01:00
|
|
|
int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
|
2016-11-02 03:25:19 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
|
|
|
|
|
2017-02-08 15:14:02 +01:00
|
|
|
switch (arch) {
|
|
|
|
|
2019-11-27 09:52:07 +01:00
|
|
|
/* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
|
|
|
|
* We ignore that here, which means there's still a way to get writable/executable
|
|
|
|
* memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
|
|
|
|
|
2017-02-08 15:14:02 +01:00
|
|
|
case SCMP_ARCH_X86:
|
2019-07-25 02:48:49 +02:00
|
|
|
case SCMP_ARCH_S390:
|
2017-02-08 15:14:02 +01:00
|
|
|
filter_syscall = SCMP_SYS(mmap2);
|
|
|
|
block_syscall = SCMP_SYS(mmap);
|
2019-11-27 09:52:07 +01:00
|
|
|
/* shmat multiplexed, see above */
|
2017-05-05 05:10:30 +02:00
|
|
|
break;
|
|
|
|
|
2017-12-22 10:06:29 +01:00
|
|
|
case SCMP_ARCH_PPC:
|
2017-05-05 05:10:30 +02:00
|
|
|
case SCMP_ARCH_PPC64:
|
|
|
|
case SCMP_ARCH_PPC64LE:
|
2019-11-27 09:52:07 +01:00
|
|
|
case SCMP_ARCH_S390X:
|
2017-05-05 05:10:30 +02:00
|
|
|
filter_syscall = SCMP_SYS(mmap);
|
2019-11-27 09:52:07 +01:00
|
|
|
/* shmat multiplexed, see above */
|
2017-02-08 15:14:02 +01:00
|
|
|
break;
|
|
|
|
|
2017-05-05 05:10:31 +02:00
|
|
|
case SCMP_ARCH_ARM:
|
|
|
|
filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
|
|
|
|
shmat_syscall = SCMP_SYS(shmat);
|
|
|
|
break;
|
|
|
|
|
2017-02-08 15:14:02 +01:00
|
|
|
case SCMP_ARCH_X86_64:
|
|
|
|
case SCMP_ARCH_X32:
|
2017-07-15 21:30:01 +02:00
|
|
|
case SCMP_ARCH_AARCH64:
|
2020-08-19 22:44:15 +02:00
|
|
|
#ifdef SCMP_ARCH_RISCV64
|
|
|
|
case SCMP_ARCH_RISCV64:
|
|
|
|
#endif
|
|
|
|
filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
|
2017-02-08 15:14:02 +01:00
|
|
|
shmat_syscall = SCMP_SYS(shmat);
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* Please add more definitions here, if you port systemd to other architectures! */
|
|
|
|
|
2020-08-19 22:44:15 +02:00
|
|
|
#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
|
2017-02-08 15:14:02 +01:00
|
|
|
#warning "Consider adding the right mmap() syscall definitions here!"
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Can't filter mmap() on this arch, then skip it */
|
|
|
|
if (filter_syscall == 0)
|
|
|
|
continue;
|
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
2017-05-05 05:10:30 +02:00
|
|
|
r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
|
|
|
|
1,
|
|
|
|
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
|
|
|
|
if (r < 0)
|
|
|
|
continue;
|
2017-02-08 15:14:02 +01:00
|
|
|
|
|
|
|
if (block_syscall != 0) {
|
2017-05-05 05:10:30 +02:00
|
|
|
r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
|
|
|
|
if (r < 0)
|
2017-02-08 15:14:02 +01:00
|
|
|
continue;
|
2016-11-02 03:25:19 +01:00
|
|
|
}
|
2016-10-21 21:18:46 +02:00
|
|
|
|
2017-05-05 05:10:30 +02:00
|
|
|
r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
|
|
|
|
1,
|
2017-11-12 17:28:48 +01:00
|
|
|
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
|
|
|
|
if (r < 0)
|
|
|
|
continue;
|
|
|
|
|
2017-11-13 09:35:49 +01:00
|
|
|
#ifdef __NR_pkey_mprotect
|
2017-11-12 17:28:48 +01:00
|
|
|
r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
|
|
|
|
1,
|
2017-05-05 05:10:30 +02:00
|
|
|
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
|
|
|
|
if (r < 0)
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
continue;
|
2017-11-13 09:35:49 +01:00
|
|
|
#endif
|
2016-11-02 03:25:19 +01:00
|
|
|
|
2019-03-15 12:46:56 +01:00
|
|
|
if (shmat_syscall > 0) {
|
2019-11-27 10:53:50 +01:00
|
|
|
r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
|
2017-05-05 05:10:30 +02:00
|
|
|
1,
|
|
|
|
SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
|
|
|
|
if (r < 0)
|
2017-02-08 15:14:02 +01:00
|
|
|
continue;
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
2019-04-11 01:08:41 +02:00
|
|
|
if (ERRNO_IS_SECCOMP_FATAL(r))
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
return r;
|
2016-11-02 03:25:19 +01:00
|
|
|
if (r < 0)
|
2019-12-06 15:04:51 +01:00
|
|
|
log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
|
|
|
|
seccomp_arch_to_string(arch));
|
2019-11-27 09:57:55 +01:00
|
|
|
loaded++;
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
}
|
2016-11-02 03:25:19 +01:00
|
|
|
|
2019-11-27 09:57:55 +01:00
|
|
|
if (loaded == 0)
|
2019-12-06 15:04:51 +01:00
|
|
|
log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
|
2019-11-27 09:57:55 +01:00
|
|
|
|
|
|
|
return loaded;
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
int seccomp_restrict_archs(Set *archs) {
        _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
        Iterator i;
        void *id;
        int r;

        /* Installs a filter that carries no rules at all, but limits the system call architectures to the
         * ones listed in 'archs'. Set entries are stored as architecture id + 1.
         *
         * There are some qualifications. However, the most important use is to stop processes from
         * bypassing system call restrictions by means of a broader (multiplexing) syscall that is only
         * available in a non-native architecture. There are no holes in this use case, at least so far.
         *
         * Note that libseccomp includes our "native" (current) architecture in the filter by default. We do
         * not remove it. For example, our callers expect to be able to call execve() afterwards to run a
         * program with the restrictions applied.
         *
         * Returns 0 on success and also when loading failed in a non-fatal way (which is only logged at
         * debug level), and a negative errno-style value otherwise. */

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return -ENOMEM;

        SET_FOREACH(id, archs, i) {
                r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
                if (r == -EEXIST) /* Already part of the filter, nothing to do */
                        continue;
                if (r < 0)
                        return r;
        }

        /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32 syscalls
         * should basically match x86-64 for everything except the pointer type. The important thing is
         * that you can block the old 32-bit x86 syscalls.
         * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
        if (seccomp_arch_native() == SCMP_ARCH_X32 ||
            set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {

                r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
                if (r < 0 && r != -EEXIST)
                        return r;
        }

        /* Clear libseccomp's SCMP_FLTATR_CTL_NNP attribute before loading the filter */
        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0)
                return r;

        r = seccomp_load(seccomp);
        if (ERRNO_IS_SECCOMP_FATAL(r))
                return r;
        if (r < 0)
                log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");

        return 0;
}
|
2017-08-02 06:46:45 +02:00
|
|
|
|
2020-06-05 15:12:29 +02:00
|
|
|
int parse_syscall_archs(char **l, Set **ret_archs) {
        _cleanup_set_free_ Set *collected = NULL;
        char **name;

        /* Converts the string list 'l' of architecture names into a set of architecture ids, each stored
         * as id + 1. Returns -EINVAL if a name cannot be parsed, -ENOMEM if it cannot be added to the
         * set, and 0 on success, in which case the set is handed over to the caller via 'ret_archs'. */

        assert(l);
        assert(ret_archs);

        STRV_FOREACH(name, l) {
                uint32_t arch;

                if (seccomp_arch_from_string(*name, &arch) < 0)
                        return -EINVAL;

                if (set_ensure_put(&collected, NULL, UINT32_TO_PTR(arch + 1)) < 0)
                        return -ENOMEM;
        }

        *ret_archs = TAKE_PTR(collected);
        return 0;
}
|
2017-08-09 16:09:04 +02:00
|
|
|
|
2017-11-11 13:35:49 +01:00
|
|
|
int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
        const char *name;
        int r;

        /* Walks all entries of 'set' and either adds them to (add == true) or removes them from
         * (add == false) the hashmap 'filter'. Entries starting with '@' name other sets and are
         * processed recursively. System calls are keyed by their number + 1 and are stored with the
         * value -1. Names that cannot be resolved to a system call number are skipped after a debug
         * message. Returns -ENXIO if a referenced set is unknown. */

        assert(set);

        NULSTR_FOREACH(name, set->value) {
                int nr;

                if (name[0] == '@') {
                        const SyscallFilterSet *nested;

                        nested = syscall_filter_set_find(name);
                        if (!nested)
                                return -ENXIO;

                        r = seccomp_filter_set_add(filter, add, nested);
                        if (r < 0)
                                return r;

                        continue;
                }

                nr = seccomp_syscall_resolve_name(name);
                if (nr == __NR_SCMP_ERROR) {
                        log_debug("Couldn't resolve system call, ignoring: %s", name);
                        continue;
                }

                if (!add) {
                        (void) hashmap_remove(filter, INT_TO_PTR(nr + 1));
                        continue;
                }

                r = hashmap_put(filter, INT_TO_PTR(nr + 1), INT_TO_PTR(-1));
                if (r < 0)
                        return r;
        }

        return 0;
}
|
2017-07-04 14:48:18 +02:00
|
|
|
|
|
|
|
int seccomp_lock_personality(unsigned long personality) {
|
2017-08-09 20:43:35 +02:00
|
|
|
uint32_t arch;
|
2017-07-04 14:48:18 +02:00
|
|
|
int r;
|
|
|
|
|
2017-08-09 20:43:35 +02:00
|
|
|
if (personality >= PERSONALITY_INVALID)
|
|
|
|
return -EINVAL;
|
2017-07-04 14:48:18 +02:00
|
|
|
|
2017-08-09 20:43:35 +02:00
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
2017-07-04 14:48:18 +02:00
|
|
|
|
2017-08-09 20:43:35 +02:00
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(personality),
|
|
|
|
1,
|
|
|
|
SCMP_A0(SCMP_CMP_NE, personality));
|
2017-10-05 11:26:09 +02:00
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
continue;
|
|
|
|
}
|
2017-08-09 20:43:35 +02:00
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
2019-04-11 01:08:41 +02:00
|
|
|
if (ERRNO_IS_SECCOMP_FATAL(r))
|
2017-08-09 20:43:35 +02:00
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
2017-07-04 14:48:18 +02:00
|
|
|
}
|
2019-02-08 18:25:00 +01:00
|
|
|
|
|
|
|
int seccomp_protect_hostname(void) {
|
|
|
|
uint32_t arch;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
|
|
|
|
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(sethostname),
|
|
|
|
0);
|
2019-03-20 18:59:59 +01:00
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
2019-02-08 18:25:00 +01:00
|
|
|
continue;
|
2019-03-20 18:59:59 +01:00
|
|
|
}
|
2019-02-08 18:25:00 +01:00
|
|
|
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(setdomainname),
|
|
|
|
0);
|
2019-03-20 18:59:59 +01:00
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
2019-02-08 18:25:00 +01:00
|
|
|
continue;
|
2019-03-20 18:59:59 +01:00
|
|
|
}
|
2019-02-08 18:25:00 +01:00
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
2019-04-11 01:08:41 +02:00
|
|
|
if (ERRNO_IS_SECCOMP_FATAL(r))
|
2019-02-08 18:25:00 +01:00
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2019-03-20 19:00:28 +01:00
|
|
|
|
2019-04-03 13:11:00 +02:00
|
|
|
static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
        /* Adds rules to 'seccomp' that make the following system calls fail with EPERM if all bits of
         * 'm' are set in their mode_t parameter:
         *
         *   → chmod() + fchmod() + fchmodat()
         *   → open() + creat() + openat()
         *   → mkdir() + mkdirat()
         *   → mknod() + mknodat()
         *
         * For open() and openat() the rule additionally requires O_CREAT in the flags argument.
         *
         * Every rule is attempted, failures are only logged at debug level. Returns 0 if at least one
         * rule could be added, otherwise the result of the last attempt.
         */
        bool installed = false;
        int r;

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(chmod),
                        1,
                        SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for chmod: %m");

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(fchmod),
                        1,
                        SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for fchmod: %m");

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(fchmodat),
                        1,
                        SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for fchmodat: %m");

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(mkdir),
                        1,
                        SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for mkdir: %m");

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(mkdirat),
                        1,
                        SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for mkdirat: %m");

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(mknod),
                        1,
                        SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for mknod: %m");

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(mknodat),
                        1,
                        SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for mknodat: %m");

#if SCMP_SYS(open) > 0
        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(open),
                        2,
                        SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
                        SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for open: %m");
#endif

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(openat),
                        2,
                        SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
                        SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for openat: %m");

#if defined(__SNR_openat2)
        /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
         * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
         * for now, since openat2() is very new and code generally needs fallback logic anyway to be
         * compatible with kernels that are not absolutely recent. */
        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(openat2),
                        0);
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for openat2: %m");
#endif

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(creat),
                        1,
                        SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for creat: %m");

        if (installed)
                return 0;

        return r;
}
|
|
|
|
|
2019-03-20 19:00:28 +01:00
|
|
|
int seccomp_restrict_suid_sgid(void) {
|
|
|
|
uint32_t arch;
|
2019-04-03 13:11:00 +02:00
|
|
|
int r, k;
|
2019-03-20 19:00:28 +01:00
|
|
|
|
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
|
|
|
|
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
2019-04-03 13:11:00 +02:00
|
|
|
r = seccomp_restrict_sxid(seccomp, S_ISUID);
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
|
2019-03-20 19:00:28 +01:00
|
|
|
|
2019-04-03 13:11:00 +02:00
|
|
|
k = seccomp_restrict_sxid(seccomp, S_ISGID);
|
|
|
|
if (k < 0)
|
|
|
|
log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
|
2019-03-20 19:00:28 +01:00
|
|
|
|
2019-04-03 13:11:00 +02:00
|
|
|
if (r < 0 && k < 0)
|
2019-03-20 19:00:28 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
2019-04-11 01:08:41 +02:00
|
|
|
if (ERRNO_IS_SECCOMP_FATAL(r))
|
2019-03-20 19:00:28 +01:00
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2019-04-29 11:54:00 +02:00
|
|
|
|
|
|
|
uint32_t scmp_act_kill_process(void) {
|
|
|
|
|
|
|
|
/* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
|
|
|
|
* actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
|
|
|
|
* a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
|
|
|
|
* for single-threaded apps does the right thing. */
|
|
|
|
|
|
|
|
#ifdef SCMP_ACT_KILL_PROCESS
|
|
|
|
if (seccomp_api_get() >= 3)
|
|
|
|
return SCMP_ACT_KILL_PROCESS;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
|
|
|
|
}
|