2017-11-18 17:09:20 +01:00
|
|
|
/* SPDX-License-Identifier: LGPL-2.1+ */
|
2014-02-13 00:24:00 +01:00
|
|
|
/***
|
|
|
|
This file is part of systemd.
|
|
|
|
|
|
|
|
Copyright 2014 Lennart Poettering
|
|
|
|
|
|
|
|
systemd is free software; you can redistribute it and/or modify it
|
|
|
|
under the terms of the GNU Lesser General Public License as published by
|
|
|
|
the Free Software Foundation; either version 2.1 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
systemd is distributed in the hope that it will be useful, but
|
|
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public License
|
|
|
|
along with systemd; If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
***/
|
|
|
|
|
2015-12-03 21:13:37 +01:00
|
|
|
#include <errno.h>
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
#include <linux/seccomp.h>
|
2014-02-13 00:24:00 +01:00
|
|
|
#include <seccomp.h>
|
2015-12-03 21:13:37 +01:00
|
|
|
#include <stddef.h>
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
#include <sys/mman.h>
|
2016-08-31 15:00:35 +02:00
|
|
|
#include <sys/prctl.h>
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
#include <sys/shm.h>
|
2014-02-13 00:24:00 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
#include "af-list.h"
|
2016-11-02 03:25:19 +01:00
|
|
|
#include "alloc-util.h"
|
2015-12-03 21:13:37 +01:00
|
|
|
#include "macro.h"
|
2016-11-02 03:25:19 +01:00
|
|
|
#include "nsflags.h"
|
2017-07-04 14:48:18 +02:00
|
|
|
#include "process-util.h"
|
2015-11-16 22:09:36 +01:00
|
|
|
#include "seccomp-util.h"
|
2017-08-02 06:46:45 +02:00
|
|
|
#include "set.h"
|
2015-10-24 22:58:24 +02:00
|
|
|
#include "string-util.h"
|
2017-08-02 06:46:45 +02:00
|
|
|
#include "strv.h"
|
2016-10-21 21:50:05 +02:00
|
|
|
#include "util.h"
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
#include "errno-list.h"
|
|
|
|
|
|
|
|
const uint32_t seccomp_local_archs[] = {
|
|
|
|
|
2017-02-10 23:47:50 +01:00
|
|
|
/* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
|
|
|
|
|
|
|
|
#if defined(__x86_64__) && defined(__ILP32__)
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_X86,
|
|
|
|
SCMP_ARCH_X86_64,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_X32, /* native */
|
|
|
|
#elif defined(__x86_64__) && !defined(__ILP32__)
|
|
|
|
SCMP_ARCH_X86,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_X32,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_X86_64, /* native */
|
|
|
|
#elif defined(__i386__)
|
|
|
|
SCMP_ARCH_X86,
|
|
|
|
#elif defined(__aarch64__)
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_ARM,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_AARCH64, /* native */
|
|
|
|
#elif defined(__arm__)
|
|
|
|
SCMP_ARCH_ARM,
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
|
|
|
|
SCMP_ARCH_MIPSEL,
|
|
|
|
SCMP_ARCH_MIPS, /* native */
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_MIPS,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_MIPSEL, /* native */
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
|
|
|
|
SCMP_ARCH_MIPSEL,
|
|
|
|
SCMP_ARCH_MIPS,
|
|
|
|
SCMP_ARCH_MIPSEL64N32,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_MIPS64N32,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_MIPSEL64,
|
|
|
|
SCMP_ARCH_MIPS64, /* native */
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
|
|
|
|
SCMP_ARCH_MIPS,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_MIPSEL,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_MIPS64N32,
|
|
|
|
SCMP_ARCH_MIPSEL64N32,
|
|
|
|
SCMP_ARCH_MIPS64,
|
|
|
|
SCMP_ARCH_MIPSEL64, /* native */
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
|
|
|
|
SCMP_ARCH_MIPSEL,
|
|
|
|
SCMP_ARCH_MIPS,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_MIPSEL64,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_MIPS64,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_MIPSEL64N32,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_MIPS64N32, /* native */
|
|
|
|
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
|
|
|
|
SCMP_ARCH_MIPS,
|
|
|
|
SCMP_ARCH_MIPSEL,
|
|
|
|
SCMP_ARCH_MIPS64,
|
|
|
|
SCMP_ARCH_MIPSEL64,
|
|
|
|
SCMP_ARCH_MIPS64N32,
|
|
|
|
SCMP_ARCH_MIPSEL64N32, /* native */
|
|
|
|
#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_PPC,
|
|
|
|
SCMP_ARCH_PPC64LE,
|
2017-02-10 23:47:50 +01:00
|
|
|
SCMP_ARCH_PPC64, /* native */
|
|
|
|
#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
|
|
|
|
SCMP_ARCH_PPC,
|
|
|
|
SCMP_ARCH_PPC64,
|
|
|
|
SCMP_ARCH_PPC64LE, /* native */
|
|
|
|
#elif defined(__powerpc__)
|
|
|
|
SCMP_ARCH_PPC,
|
|
|
|
#elif defined(__s390x__)
|
|
|
|
SCMP_ARCH_S390,
|
|
|
|
SCMP_ARCH_S390X, /* native */
|
|
|
|
#elif defined(__s390__)
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_ARCH_S390,
|
|
|
|
#endif
|
|
|
|
(uint32_t) -1
|
|
|
|
};
|
2014-02-13 00:24:00 +01:00
|
|
|
|
|
|
|
const char* seccomp_arch_to_string(uint32_t c) {
|
2016-11-01 16:33:18 +01:00
|
|
|
/* Maintain order used in <seccomp.h>.
|
|
|
|
*
|
|
|
|
* Names used here should be the same as those used for ConditionArchitecture=,
|
|
|
|
* except for "subarchitectures" like x32. */
|
2014-02-13 00:24:00 +01:00
|
|
|
|
2016-11-01 16:33:18 +01:00
|
|
|
switch(c) {
|
|
|
|
case SCMP_ARCH_NATIVE:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "native";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_X86:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "x86";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_X86_64:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "x86-64";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_X32:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "x32";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_ARM:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "arm";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_AARCH64:
|
|
|
|
return "arm64";
|
|
|
|
case SCMP_ARCH_MIPS:
|
|
|
|
return "mips";
|
|
|
|
case SCMP_ARCH_MIPS64:
|
|
|
|
return "mips64";
|
|
|
|
case SCMP_ARCH_MIPS64N32:
|
|
|
|
return "mips64-n32";
|
|
|
|
case SCMP_ARCH_MIPSEL:
|
|
|
|
return "mips-le";
|
|
|
|
case SCMP_ARCH_MIPSEL64:
|
|
|
|
return "mips64-le";
|
|
|
|
case SCMP_ARCH_MIPSEL64N32:
|
|
|
|
return "mips64-le-n32";
|
|
|
|
case SCMP_ARCH_PPC:
|
|
|
|
return "ppc";
|
|
|
|
case SCMP_ARCH_PPC64:
|
|
|
|
return "ppc64";
|
|
|
|
case SCMP_ARCH_PPC64LE:
|
|
|
|
return "ppc64-le";
|
|
|
|
case SCMP_ARCH_S390:
|
2016-10-05 13:58:55 +02:00
|
|
|
return "s390";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_S390X:
|
2016-10-05 13:58:55 +02:00
|
|
|
return "s390x";
|
2016-11-01 16:33:18 +01:00
|
|
|
default:
|
|
|
|
return NULL;
|
|
|
|
}
|
2014-02-13 00:24:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
int seccomp_arch_from_string(const char *n, uint32_t *ret) {
|
|
|
|
if (!n)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
assert(ret);
|
|
|
|
|
|
|
|
if (streq(n, "native"))
|
|
|
|
*ret = SCMP_ARCH_NATIVE;
|
|
|
|
else if (streq(n, "x86"))
|
|
|
|
*ret = SCMP_ARCH_X86;
|
|
|
|
else if (streq(n, "x86-64"))
|
|
|
|
*ret = SCMP_ARCH_X86_64;
|
|
|
|
else if (streq(n, "x32"))
|
|
|
|
*ret = SCMP_ARCH_X32;
|
|
|
|
else if (streq(n, "arm"))
|
|
|
|
*ret = SCMP_ARCH_ARM;
|
2016-11-01 16:33:18 +01:00
|
|
|
else if (streq(n, "arm64"))
|
|
|
|
*ret = SCMP_ARCH_AARCH64;
|
|
|
|
else if (streq(n, "mips"))
|
|
|
|
*ret = SCMP_ARCH_MIPS;
|
|
|
|
else if (streq(n, "mips64"))
|
|
|
|
*ret = SCMP_ARCH_MIPS64;
|
|
|
|
else if (streq(n, "mips64-n32"))
|
|
|
|
*ret = SCMP_ARCH_MIPS64N32;
|
|
|
|
else if (streq(n, "mips-le"))
|
|
|
|
*ret = SCMP_ARCH_MIPSEL;
|
|
|
|
else if (streq(n, "mips64-le"))
|
|
|
|
*ret = SCMP_ARCH_MIPSEL64;
|
|
|
|
else if (streq(n, "mips64-le-n32"))
|
|
|
|
*ret = SCMP_ARCH_MIPSEL64N32;
|
|
|
|
else if (streq(n, "ppc"))
|
|
|
|
*ret = SCMP_ARCH_PPC;
|
|
|
|
else if (streq(n, "ppc64"))
|
|
|
|
*ret = SCMP_ARCH_PPC64;
|
|
|
|
else if (streq(n, "ppc64-le"))
|
|
|
|
*ret = SCMP_ARCH_PPC64LE;
|
2016-10-05 13:58:55 +02:00
|
|
|
else if (streq(n, "s390"))
|
|
|
|
*ret = SCMP_ARCH_S390;
|
|
|
|
else if (streq(n, "s390x"))
|
|
|
|
*ret = SCMP_ARCH_S390X;
|
2014-02-13 00:24:00 +01:00
|
|
|
else
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2014-02-18 22:14:00 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
        scmp_filter_ctx ctx;
        int r;

        /* Counterpart to seccomp_init() that produces a filter context covering exactly one architecture
         * and leaves all others alone. libseccomp's NNP handling is switched off on the context as well.
         *
         * On success the context is handed to the caller through *ret and 0 is returned. On failure the
         * context is released again, *ret is not written, and a negative value is returned: -ENOMEM if
         * seccomp_init() returned NULL, otherwise the result of the libseccomp call that failed. */

        ctx = seccomp_init(default_action);
        if (!ctx)
                return -ENOMEM;

        if (arch == SCMP_ARCH_NATIVE ||
            arch == seccomp_arch_native()) {

                /* Native architecture requested: the fresh context is expected to contain it already. */
                assert(seccomp_arch_exist(ctx, SCMP_ARCH_NATIVE) >= 0);
                assert(seccomp_arch_exist(ctx, seccomp_arch_native()) >= 0);

        } else {

                /* Foreign architecture requested: drop the native one first, then add the requested one,
                 * so that the context ends up with the requested architecture only. */
                r = seccomp_arch_remove(ctx, seccomp_arch_native());
                if (r < 0)
                        goto fail;

                r = seccomp_arch_add(ctx, arch);
                if (r < 0)
                        goto fail;

                assert(seccomp_arch_exist(ctx, arch) >= 0);
                assert(seccomp_arch_exist(ctx, SCMP_ARCH_NATIVE) == -EEXIST);
                assert(seccomp_arch_exist(ctx, seccomp_arch_native()) == -EEXIST);
        }

        /* First set the "bad architecture" action to ALLOW, then turn off the NNP control attribute.
         * The second call is only attempted if the first one succeeded. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
        if (r >= 0)
                r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0)
                goto fail;

        *ret = ctx;
        return 0;

fail:
        seccomp_release(ctx);
        return r;
}
|
|
|
|
|
2016-08-31 15:00:35 +02:00
|
|
|
static bool is_basic_seccomp_available(void) {
        int mode;

        /* Ask the kernel for the seccomp mode via PR_GET_SECCOMP. Any non-negative answer counts as
         * "basic seccomp is available"; a negative return value counts as "not available". */
        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
|
|
|
|
|
|
|
|
static bool is_seccomp_filter_available(void) {

        /* Probe for filter mode by trying to install a NULL filter program. The call is not supposed to
         * succeed; if it does anyway, that is reported as "not available". */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        /* Only a failure with EFAULT is taken as proof of filter support. NOTE(review): presumably a
         * kernel without filter support rejects the request with a different error before it ever looks
         * at the NULL pointer; confirm against prctl(2). */
        return errno == EFAULT;
}
|
|
|
|
|
2016-08-22 21:40:58 +02:00
|
|
|
bool is_seccomp_available(void) {
        /* Result of the probe: negative while it has not run yet, afterwards 0 or 1. */
        static int probe_result = -1;

        /* The probe runs at most once; later calls return the stored answer. */
        if (probe_result >= 0)
                return probe_result;

        /* Both checks have to pass. The filter check is skipped when the basic check already fails. */
        probe_result =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return probe_result;
}
|
|
|
|
|
2016-10-21 21:50:05 +02:00
|
|
|
const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
|
2016-11-02 17:01:04 +01:00
|
|
|
[SYSCALL_FILTER_SET_DEFAULT] = {
|
|
|
|
.name = "@default",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "System calls that are always permitted",
|
2016-11-02 17:01:04 +01:00
|
|
|
.value =
|
|
|
|
"clock_getres\0"
|
|
|
|
"clock_gettime\0"
|
|
|
|
"clock_nanosleep\0"
|
|
|
|
"execve\0"
|
|
|
|
"exit\0"
|
|
|
|
"exit_group\0"
|
seccomp: update "@default" seccomp group a bit
Let's add more of the most basic operations to "@default" as absolute
baseline needed by glibc and such to operate. Specifically:
futex, get_robust_list, get_thread_area, membarrier, set_robust_list,
set_thread_area, set_tid_address are all required to properly implement
mutexes and other thread synchronization logic. Given that a ton of
datastructures are protected by mutexes (such as stdio and such), let's
just whitelist this by default, so that things can just work.
restart_syscall is used to implement EAGAIN SA_RESTART stuff in some
archs, and synthesized by the kernel without any explicit user logic,
hence let's make this work out of the box.
2017-09-13 19:27:51 +02:00
|
|
|
"futex\0"
|
|
|
|
"get_robust_list\0"
|
|
|
|
"get_thread_area\0"
|
2017-10-03 07:20:05 +02:00
|
|
|
"getegid\0"
|
|
|
|
"getegid32\0"
|
|
|
|
"geteuid\0"
|
|
|
|
"geteuid32\0"
|
|
|
|
"getgid\0"
|
|
|
|
"getgid32\0"
|
|
|
|
"getgroups\0"
|
|
|
|
"getgroups32\0"
|
|
|
|
"getpgid\0"
|
|
|
|
"getpgrp\0"
|
|
|
|
"getpid\0"
|
|
|
|
"getppid\0"
|
|
|
|
"getresgid\0"
|
|
|
|
"getresgid32\0"
|
|
|
|
"getresuid\0"
|
|
|
|
"getresuid32\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"getrlimit\0" /* make sure processes can query stack size and such */
|
2017-10-03 07:20:05 +02:00
|
|
|
"getsid\0"
|
|
|
|
"gettid\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"gettimeofday\0"
|
2017-10-03 07:20:05 +02:00
|
|
|
"getuid\0"
|
|
|
|
"getuid32\0"
|
seccomp: update "@default" seccomp group a bit
Let's add more of the most basic operations to "@default" as absolute
baseline needed by glibc and such to operate. Specifically:
futex, get_robust_list, get_thread_area, membarrier, set_robust_list,
set_thread_area, set_tid_address are all required to properly implement
mutexes and other thread synchronization logic. Given that a ton of
datastructures are protected by mutexes (such as stdio and such), let's
just whitelist this by default, so that things can just work.
restart_syscall is used to implement EAGAIN SA_RESTART stuff in some
archs, and synthesized by the kernel without any explicit user logic,
hence let's make this work out of the box.
2017-09-13 19:27:51 +02:00
|
|
|
"membarrier\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"nanosleep\0"
|
|
|
|
"pause\0"
|
2017-09-30 14:08:26 +02:00
|
|
|
"prlimit64\0"
|
seccomp: update "@default" seccomp group a bit
Let's add more of the most basic operations to "@default" as absolute
baseline needed by glibc and such to operate. Specifically:
futex, get_robust_list, get_thread_area, membarrier, set_robust_list,
set_thread_area, set_tid_address are all required to properly implement
mutexes and other thread synchronization logic. Given that a ton of
datastructures are protected by mutexes (such as stdio and such), let's
just whitelist this by default, so that things can just work.
restart_syscall is used to implement EAGAIN SA_RESTART stuff in some
archs, and synthesized by the kernel without any explicit user logic,
hence let's make this work out of the box.
2017-09-13 19:27:51 +02:00
|
|
|
"restart_syscall\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"rt_sigreturn\0"
|
2017-10-04 11:41:42 +02:00
|
|
|
"sched_yield\0"
|
seccomp: update "@default" seccomp group a bit
Let's add more of the most basic operations to "@default" as absolute
baseline needed by glibc and such to operate. Specifically:
futex, get_robust_list, get_thread_area, membarrier, set_robust_list,
set_thread_area, set_tid_address are all required to properly implement
mutexes and other thread synchronization logic. Given that a ton of
datastructures are protected by mutexes (such as stdio and such), let's
just whitelist this by default, so that things can just work.
restart_syscall is used to implement EAGAIN SA_RESTART stuff in some
archs, and synthesized by the kernel without any explicit user logic,
hence let's make this work out of the box.
2017-09-13 19:27:51 +02:00
|
|
|
"set_robust_list\0"
|
|
|
|
"set_thread_area\0"
|
|
|
|
"set_tid_address\0"
|
2017-11-12 16:34:43 +01:00
|
|
|
"set_tls\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
"sigreturn\0"
|
|
|
|
"time\0"
|
2017-09-30 14:08:26 +02:00
|
|
|
"ugetrlimit\0"
|
2016-11-02 17:01:04 +01:00
|
|
|
},
|
2017-09-30 14:34:50 +02:00
|
|
|
[SYSCALL_FILTER_SET_AIO] = {
|
|
|
|
.name = "@aio",
|
|
|
|
.help = "Asynchronous IO",
|
|
|
|
.value =
|
|
|
|
"io_cancel\0"
|
|
|
|
"io_destroy\0"
|
|
|
|
"io_getevents\0"
|
|
|
|
"io_setup\0"
|
|
|
|
"io_submit\0"
|
|
|
|
},
|
2016-11-02 15:46:18 +01:00
|
|
|
[SYSCALL_FILTER_SET_BASIC_IO] = {
|
|
|
|
.name = "@basic-io",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Basic IO",
|
2016-11-02 15:46:18 +01:00
|
|
|
.value =
|
2017-09-13 19:31:43 +02:00
|
|
|
"_llseek\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"close\0"
|
2017-09-13 19:31:43 +02:00
|
|
|
"dup\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"dup2\0"
|
|
|
|
"dup3\0"
|
|
|
|
"lseek\0"
|
|
|
|
"pread64\0"
|
|
|
|
"preadv\0"
|
2017-09-30 14:34:50 +02:00
|
|
|
"preadv2\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"pwrite64\0"
|
|
|
|
"pwritev\0"
|
2017-09-30 14:34:50 +02:00
|
|
|
"pwritev2\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"read\0"
|
|
|
|
"readv\0"
|
|
|
|
"write\0"
|
|
|
|
"writev\0"
|
|
|
|
},
|
2017-09-30 14:34:50 +02:00
|
|
|
[SYSCALL_FILTER_SET_CHOWN] = {
|
|
|
|
.name = "@chown",
|
|
|
|
.help = "Change ownership of files and directories",
|
|
|
|
.value =
|
|
|
|
"chown\0"
|
|
|
|
"chown32\0"
|
|
|
|
"fchown\0"
|
|
|
|
"fchown32\0"
|
|
|
|
"fchownat\0"
|
|
|
|
"lchown\0"
|
|
|
|
"lchown32\0"
|
|
|
|
},
|
2016-10-21 21:50:05 +02:00
|
|
|
[SYSCALL_FILTER_SET_CLOCK] = {
|
|
|
|
.name = "@clock",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Change the system time",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"adjtimex\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"clock_adjtime\0"
|
|
|
|
"clock_settime\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"settimeofday\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"stime\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_CPU_EMULATION] = {
|
|
|
|
.name = "@cpu-emulation",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "System calls for CPU emulation functionality",
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
.value =
|
|
|
|
"modify_ldt\0"
|
|
|
|
"subpage_prot\0"
|
|
|
|
"switch_endian\0"
|
|
|
|
"vm86\0"
|
|
|
|
"vm86old\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_DEBUG] = {
|
|
|
|
.name = "@debug",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Debugging, performance monitoring and tracing functionality",
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
.value =
|
|
|
|
"lookup_dcookie\0"
|
|
|
|
"perf_event_open\0"
|
|
|
|
"process_vm_readv\0"
|
|
|
|
"process_vm_writev\0"
|
|
|
|
"ptrace\0"
|
|
|
|
"rtas\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#ifdef __NR_s390_runtime_instr
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"s390_runtime_instr\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#endif
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"sys_debug_setcontext\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
2016-11-22 01:29:12 +01:00
|
|
|
[SYSCALL_FILTER_SET_FILE_SYSTEM] = {
|
|
|
|
.name = "@file-system",
|
|
|
|
.help = "File system operations",
|
|
|
|
.value =
|
|
|
|
"access\0"
|
|
|
|
"chdir\0"
|
|
|
|
"chmod\0"
|
|
|
|
"close\0"
|
|
|
|
"creat\0"
|
|
|
|
"faccessat\0"
|
|
|
|
"fallocate\0"
|
|
|
|
"fchdir\0"
|
|
|
|
"fchmod\0"
|
|
|
|
"fchmodat\0"
|
|
|
|
"fcntl\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"fcntl64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"fgetxattr\0"
|
|
|
|
"flistxattr\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"fremovexattr\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"fsetxattr\0"
|
|
|
|
"fstat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"fstat64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"fstatat64\0"
|
|
|
|
"fstatfs\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"fstatfs64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"ftruncate\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"ftruncate64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"futimesat\0"
|
|
|
|
"getcwd\0"
|
|
|
|
"getdents\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"getdents64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"getxattr\0"
|
|
|
|
"inotify_add_watch\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"inotify_init\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"inotify_init1\0"
|
|
|
|
"inotify_rm_watch\0"
|
|
|
|
"lgetxattr\0"
|
|
|
|
"link\0"
|
|
|
|
"linkat\0"
|
|
|
|
"listxattr\0"
|
|
|
|
"llistxattr\0"
|
|
|
|
"lremovexattr\0"
|
|
|
|
"lsetxattr\0"
|
|
|
|
"lstat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"lstat64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"mkdir\0"
|
|
|
|
"mkdirat\0"
|
|
|
|
"mknod\0"
|
|
|
|
"mknodat\0"
|
|
|
|
"mmap\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"mmap2\0"
|
2017-02-10 03:29:33 +01:00
|
|
|
"munmap\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"newfstatat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"oldfstat\0"
|
|
|
|
"oldlstat\0"
|
|
|
|
"oldstat\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"open\0"
|
|
|
|
"openat\0"
|
|
|
|
"readlink\0"
|
|
|
|
"readlinkat\0"
|
|
|
|
"removexattr\0"
|
|
|
|
"rename\0"
|
|
|
|
"renameat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"renameat2\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"rmdir\0"
|
|
|
|
"setxattr\0"
|
|
|
|
"stat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"stat64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"statfs\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"statfs64\0"
|
2017-11-10 11:07:36 +01:00
|
|
|
#ifdef __NR_statx
|
2017-09-04 15:35:35 +02:00
|
|
|
"statx\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
#endif
|
2016-11-22 01:29:12 +01:00
|
|
|
"symlink\0"
|
|
|
|
"symlinkat\0"
|
|
|
|
"truncate\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"truncate64\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"unlink\0"
|
|
|
|
"unlinkat\0"
|
2017-09-13 19:33:54 +02:00
|
|
|
"utime\0"
|
2016-11-22 01:29:12 +01:00
|
|
|
"utimensat\0"
|
|
|
|
"utimes\0"
|
|
|
|
},
|
2016-10-21 21:50:05 +02:00
|
|
|
[SYSCALL_FILTER_SET_IO_EVENT] = {
|
|
|
|
.name = "@io-event",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Event loop system calls",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"_newselect\0"
|
|
|
|
"epoll_create\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"epoll_create1\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"epoll_ctl\0"
|
|
|
|
"epoll_ctl_old\0"
|
|
|
|
"epoll_pwait\0"
|
|
|
|
"epoll_wait\0"
|
|
|
|
"epoll_wait_old\0"
|
|
|
|
"eventfd\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"eventfd2\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"poll\0"
|
|
|
|
"ppoll\0"
|
|
|
|
"pselect6\0"
|
|
|
|
"select\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_IPC] = {
|
|
|
|
.name = "@ipc",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "SysV IPC, POSIX Message Queues or other IPC",
|
|
|
|
.value =
|
|
|
|
"ipc\0"
|
2016-10-25 15:43:31 +02:00
|
|
|
"memfd_create\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"mq_getsetattr\0"
|
|
|
|
"mq_notify\0"
|
|
|
|
"mq_open\0"
|
|
|
|
"mq_timedreceive\0"
|
|
|
|
"mq_timedsend\0"
|
|
|
|
"mq_unlink\0"
|
|
|
|
"msgctl\0"
|
|
|
|
"msgget\0"
|
|
|
|
"msgrcv\0"
|
|
|
|
"msgsnd\0"
|
2016-10-25 15:43:31 +02:00
|
|
|
"pipe\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"pipe2\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"process_vm_readv\0"
|
|
|
|
"process_vm_writev\0"
|
|
|
|
"semctl\0"
|
|
|
|
"semget\0"
|
|
|
|
"semop\0"
|
|
|
|
"semtimedop\0"
|
|
|
|
"shmat\0"
|
|
|
|
"shmctl\0"
|
|
|
|
"shmdt\0"
|
|
|
|
"shmget\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_KEYRING] = {
|
|
|
|
.name = "@keyring",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Kernel keyring access",
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
.value =
|
|
|
|
"add_key\0"
|
|
|
|
"keyctl\0"
|
|
|
|
"request_key\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
2017-09-13 19:55:16 +02:00
|
|
|
[SYSCALL_FILTER_SET_MEMLOCK] = {
|
|
|
|
.name = "@memlock",
|
|
|
|
.help = "Memory locking control",
|
|
|
|
.value =
|
|
|
|
"mlock\0"
|
|
|
|
"mlock2\0"
|
|
|
|
"mlockall\0"
|
|
|
|
"munlock\0"
|
|
|
|
"munlockall\0"
|
|
|
|
},
|
2016-10-21 21:50:05 +02:00
|
|
|
[SYSCALL_FILTER_SET_MODULE] = {
|
|
|
|
.name = "@module",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Loading and unloading of kernel modules",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"delete_module\0"
|
|
|
|
"finit_module\0"
|
|
|
|
"init_module\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_MOUNT] = {
|
|
|
|
.name = "@mount",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Mounting and unmounting of file systems",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"chroot\0"
|
|
|
|
"mount\0"
|
|
|
|
"pivot_root\0"
|
|
|
|
"umount\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"umount2\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_NETWORK_IO] = {
|
|
|
|
.name = "@network-io",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Network or Unix socket IO, should not be needed if not network facing",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"accept\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"accept4\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"bind\0"
|
|
|
|
"connect\0"
|
|
|
|
"getpeername\0"
|
|
|
|
"getsockname\0"
|
|
|
|
"getsockopt\0"
|
|
|
|
"listen\0"
|
|
|
|
"recv\0"
|
|
|
|
"recvfrom\0"
|
|
|
|
"recvmmsg\0"
|
|
|
|
"recvmsg\0"
|
|
|
|
"send\0"
|
|
|
|
"sendmmsg\0"
|
|
|
|
"sendmsg\0"
|
|
|
|
"sendto\0"
|
|
|
|
"setsockopt\0"
|
|
|
|
"shutdown\0"
|
|
|
|
"socket\0"
|
|
|
|
"socketcall\0"
|
|
|
|
"socketpair\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_OBSOLETE] = {
|
2016-11-02 17:24:34 +01:00
|
|
|
/* some unknown even to libseccomp */
|
2016-10-21 21:50:05 +02:00
|
|
|
.name = "@obsolete",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Unusual, obsolete or unimplemented system calls",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"_sysctl\0"
|
|
|
|
"afs_syscall\0"
|
2016-12-27 14:28:19 +01:00
|
|
|
"bdflush\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"break\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"create_module\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"ftime\0"
|
|
|
|
"get_kernel_syms\0"
|
|
|
|
"getpmsg\0"
|
|
|
|
"gtty\0"
|
2017-09-13 19:39:54 +02:00
|
|
|
"idle\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"lock\0"
|
|
|
|
"mpx\0"
|
|
|
|
"prof\0"
|
|
|
|
"profil\0"
|
|
|
|
"putpmsg\0"
|
|
|
|
"query_module\0"
|
|
|
|
"security\0"
|
|
|
|
"sgetmask\0"
|
|
|
|
"ssetmask\0"
|
|
|
|
"stty\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"sysfs\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"tuxcall\0"
|
|
|
|
"ulimit\0"
|
|
|
|
"uselib\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"ustat\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"vserver\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_PRIVILEGED] = {
|
|
|
|
.name = "@privileged",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "All system calls which need super-user capabilities",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
2017-09-30 14:34:50 +02:00
|
|
|
"@chown\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"@clock\0"
|
|
|
|
"@module\0"
|
|
|
|
"@raw-io\0"
|
2017-10-02 09:16:50 +02:00
|
|
|
"@reboot\0"
|
|
|
|
"@swap\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"_sysctl\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"acct\0"
|
|
|
|
"bpf\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"capset\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"chroot\0"
|
|
|
|
"nfsservctl\0"
|
|
|
|
"pivot_root\0"
|
|
|
|
"quotactl\0"
|
|
|
|
"setdomainname\0"
|
|
|
|
"setfsuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setfsuid32\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"setgroups\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setgroups32\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"sethostname\0"
|
|
|
|
"setresuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setresuid32\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"setreuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setreuid32\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"setuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setuid32\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"vhangup\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_PROCESS] = {
|
|
|
|
.name = "@process",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Process control, execution, namespaceing operations",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"arch_prctl\0"
|
2017-10-03 07:20:05 +02:00
|
|
|
"capget\0" /* Able to query arbitrary processes */
|
2016-06-01 11:56:01 +02:00
|
|
|
"clone\0"
|
|
|
|
"execveat\0"
|
|
|
|
"fork\0"
|
2017-09-13 19:40:23 +02:00
|
|
|
"getrusage\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"kill\0"
|
|
|
|
"prctl\0"
|
2017-09-13 19:40:23 +02:00
|
|
|
"rt_sigqueueinfo\0"
|
|
|
|
"rt_tgsigqueueinfo\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"setns\0"
|
|
|
|
"tgkill\0"
|
2017-09-13 19:40:23 +02:00
|
|
|
"times\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"tkill\0"
|
|
|
|
"unshare\0"
|
|
|
|
"vfork\0"
|
2017-09-13 19:40:23 +02:00
|
|
|
"wait4\0"
|
|
|
|
"waitid\0"
|
|
|
|
"waitpid\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_RAW_IO] = {
|
|
|
|
.name = "@raw-io",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Raw I/O port access",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"ioperm\0"
|
|
|
|
"iopl\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for example vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating systems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"pciconfig_iobase\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"pciconfig_read\0"
|
|
|
|
"pciconfig_write\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#ifdef __NR_s390_pci_mmio_read
|
2016-06-01 11:56:01 +02:00
|
|
|
"s390_pci_mmio_read\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#endif
|
|
|
|
#ifdef __NR_s390_pci_mmio_write
|
2016-06-01 11:56:01 +02:00
|
|
|
"s390_pci_mmio_write\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#endif
|
|
|
|
},
|
2016-12-27 14:26:55 +01:00
|
|
|
[SYSCALL_FILTER_SET_REBOOT] = {
|
|
|
|
.name = "@reboot",
|
|
|
|
.help = "Reboot and reboot preparation/kexec",
|
|
|
|
.value =
|
|
|
|
"kexec_file_load\0"
|
2017-10-04 21:09:52 +02:00
|
|
|
"kexec_load\0"
|
2016-12-27 14:26:55 +01:00
|
|
|
"reboot\0"
|
|
|
|
},
|
2016-11-02 15:46:18 +01:00
|
|
|
[SYSCALL_FILTER_SET_RESOURCES] = {
|
|
|
|
.name = "@resources",
|
2016-12-27 14:27:39 +01:00
|
|
|
.help = "Alter resource settings",
|
2016-11-02 15:46:18 +01:00
|
|
|
.value =
|
2017-09-13 19:44:11 +02:00
|
|
|
"ioprio_set\0"
|
|
|
|
"mbind\0"
|
|
|
|
"migrate_pages\0"
|
|
|
|
"move_pages\0"
|
|
|
|
"nice\0"
|
|
|
|
"sched_setaffinity\0"
|
|
|
|
"sched_setattr\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"sched_setparam\0"
|
|
|
|
"sched_setscheduler\0"
|
2017-09-13 19:44:11 +02:00
|
|
|
"set_mempolicy\0"
|
2016-11-02 15:46:18 +01:00
|
|
|
"setpriority\0"
|
|
|
|
"setrlimit\0"
|
|
|
|
},
|
2017-08-09 15:04:05 +02:00
|
|
|
[SYSCALL_FILTER_SET_SETUID] = {
|
|
|
|
.name = "@setuid",
|
|
|
|
.help = "Operations for changing user/group credentials",
|
|
|
|
.value =
|
|
|
|
"setgid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setgid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setgroups\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setgroups32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setregid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setregid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setresgid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setresgid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setresuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setresuid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setreuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setreuid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
"setuid\0"
|
2017-09-13 19:39:02 +02:00
|
|
|
"setuid32\0"
|
2017-08-09 15:04:05 +02:00
|
|
|
},
|
2017-09-13 19:55:16 +02:00
|
|
|
[SYSCALL_FILTER_SET_SIGNAL] = {
|
|
|
|
.name = "@signal",
|
|
|
|
.help = "Process signal handling",
|
|
|
|
.value =
|
|
|
|
"rt_sigaction\0"
|
|
|
|
"rt_sigpending\0"
|
|
|
|
"rt_sigprocmask\0"
|
|
|
|
"rt_sigsuspend\0"
|
|
|
|
"rt_sigtimedwait\0"
|
|
|
|
"sigaction\0"
|
|
|
|
"sigaltstack\0"
|
|
|
|
"signal\0"
|
|
|
|
"signalfd\0"
|
|
|
|
"signalfd4\0"
|
|
|
|
"sigpending\0"
|
|
|
|
"sigprocmask\0"
|
|
|
|
"sigsuspend\0"
|
|
|
|
},
|
2016-12-27 14:26:55 +01:00
|
|
|
[SYSCALL_FILTER_SET_SWAP] = {
|
|
|
|
.name = "@swap",
|
|
|
|
.help = "Enable/disable swap devices",
|
|
|
|
.value =
|
|
|
|
"swapoff\0"
|
|
|
|
"swapon\0"
|
|
|
|
},
|
2017-09-30 14:34:50 +02:00
|
|
|
[SYSCALL_FILTER_SET_SYNC] = {
|
|
|
|
.name = "@sync",
|
|
|
|
.help = "Synchronize files and memory to storage",
|
|
|
|
.value =
|
|
|
|
"fdatasync\0"
|
|
|
|
"fsync\0"
|
|
|
|
"msync\0"
|
|
|
|
"sync\0"
|
|
|
|
"sync_file_range\0"
|
|
|
|
"syncfs\0"
|
|
|
|
},
|
2017-09-13 19:55:16 +02:00
|
|
|
[SYSCALL_FILTER_SET_TIMER] = {
|
|
|
|
.name = "@timer",
|
|
|
|
.help = "Schedule operations by time",
|
|
|
|
.value =
|
|
|
|
"alarm\0"
|
|
|
|
"getitimer\0"
|
|
|
|
"setitimer\0"
|
|
|
|
"timer_create\0"
|
|
|
|
"timer_delete\0"
|
|
|
|
"timer_getoverrun\0"
|
|
|
|
"timer_gettime\0"
|
|
|
|
"timer_settime\0"
|
|
|
|
"timerfd_create\0"
|
|
|
|
"timerfd_gettime\0"
|
|
|
|
"timerfd_settime\0"
|
|
|
|
"times\0"
|
|
|
|
},
|
2016-06-01 11:56:01 +02:00
|
|
|
};
|
2016-10-21 21:50:05 +02:00
|
|
|
|
|
|
|
const SyscallFilterSet *syscall_filter_set_find(const char *name) {
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
if (isempty(name) || name[0] != '@')
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
|
|
|
|
if (streq(syscall_filter_sets[i].name, name))
|
|
|
|
return syscall_filter_sets + i;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2017-09-11 17:45:21 +02:00
|
|
|
static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);

int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
        int nr, r;

        /* Adds one entry to the seccomp filter: either a "@group" reference, which is expanded through
         * seccomp_add_syscall_filter_set(), or a plain system call name. Entries contained in 'exclude'
         * are skipped.
         *
         * Returns 0 if the entry was added, skipped or ignored, -EINVAL if a group name is unknown, and
         * the negative error of the group expansion otherwise.
         *
         * NOTE(review): 'seccomp' is declared as scmp_filter_ctx* but is handed on unchanged to
         * functions taking a scmp_filter_ctx; presumably this relies on scmp_filter_ctx being a plain
         * void pointer — confirm before touching the prototype. */

        assert(seccomp);
        assert(name);

        if (strv_contains(exclude, name))
                return 0;

        if (name[0] == '@') {
                const SyscallFilterSet *group = syscall_filter_set_find(name);

                if (!group) {
                        log_debug("Filter set %s is not known!", name);
                        return -EINVAL;
                }

                r = seccomp_add_syscall_filter_set(seccomp, group, action, exclude);
                return r < 0 ? r : 0;
        }

        nr = seccomp_syscall_resolve_name(name);
        if (nr == __NR_SCMP_ERROR) {
                log_debug("System call %s is not known, ignoring.", name);
                return 0;
        }

        r = seccomp_rule_add_exact(seccomp, action, nr, 0);
        if (r < 0)
                /* If the system call is not known on this architecture, then that's fine, let's ignore it */
                log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, nr);

        return 0;
}
|
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
static int seccomp_add_syscall_filter_set(
|
|
|
|
scmp_filter_ctx seccomp,
|
|
|
|
const SyscallFilterSet *set,
|
2017-09-11 17:45:21 +02:00
|
|
|
uint32_t action,
|
|
|
|
char **exclude) {
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
|
2016-10-21 21:50:05 +02:00
|
|
|
const char *sys;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(seccomp);
|
|
|
|
assert(set);
|
|
|
|
|
|
|
|
NULSTR_FOREACH(sys, set->value) {
|
2017-09-11 17:45:21 +02:00
|
|
|
r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
|
2017-09-10 19:10:29 +02:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
|
|
|
|
uint32_t arch;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(set);
|
|
|
|
|
|
|
|
/* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
|
|
|
|
* earch local arch. */
|
|
|
|
|
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
|
|
|
|
|
|
|
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
|
|
|
|
|
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, default_action);
|
2016-10-21 21:50:05 +02:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
|
2017-09-11 17:45:21 +02:00
|
|
|
r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add filter set, ignoring: %m");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
|
|
|
if (IN_SET(r, -EPERM, -EACCES))
|
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
2016-10-21 21:50:05 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2016-10-21 21:18:46 +02:00
|
|
|
|
2017-11-11 13:35:49 +01:00
|
|
|
int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action) {
        uint32_t arch;

        /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead of a
         * SyscallFilterSet* table. Each key is a system call number offset by one, each value an errno
         * number; a non-negative errno replaces 'action' with SCMP_ACT_ERRNO() unless 'action' is
         * SCMP_ACT_ALLOW. */

        /* An empty map combined with an allow-by-default policy leaves nothing to install */
        if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
                return 0;

        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
                void *key, *value;
                Iterator it;
                int r;

                log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));

                r = seccomp_init_for_arch(&seccomp, arch, default_action);
                if (r < 0)
                        return r;

                HASHMAP_FOREACH_KEY(value, key, set, it) {
                        int nr = PTR_TO_INT(key) - 1; /* undo the +1 offset the keys are stored with */
                        int e = PTR_TO_INT(value);
                        uint32_t a;

                        a = (action != SCMP_ACT_ALLOW && e >= 0) ? SCMP_ACT_ERRNO(e) : action;

                        r = seccomp_rule_add_exact(seccomp, a, nr, 0);
                        if (r < 0) {
                                /* If the system call is not known on this architecture, then that's fine, let's ignore it */
                                _cleanup_free_ char *n = NULL;

                                n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, nr);
                                log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), nr);
                        }
                }

                r = seccomp_load(seccomp);
                if (r == -EPERM || r == -EACCES)
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
        }

        return 0;
}
|
|
|
|
|
2017-12-23 10:45:32 +01:00
|
|
|
int seccomp_parse_syscall_filter_internal(
|
|
|
|
bool invert,
|
|
|
|
const char *name,
|
|
|
|
int errno_num,
|
|
|
|
Hashmap *filter,
|
|
|
|
bool whitelist,
|
|
|
|
bool warn,
|
|
|
|
const char *unit,
|
|
|
|
const char *filename,
|
|
|
|
unsigned line) {
|
|
|
|
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(name);
|
|
|
|
assert(filter);
|
|
|
|
|
|
|
|
if (name[0] == '@') {
|
|
|
|
const SyscallFilterSet *set;
|
|
|
|
const char *i;
|
|
|
|
|
|
|
|
set = syscall_filter_set_find(name);
|
|
|
|
if (!set) {
|
|
|
|
if (warn) {
|
|
|
|
log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown system call group, ignoring: %s", name);
|
|
|
|
return 0;
|
|
|
|
} else
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
NULSTR_FOREACH(i, set->value) {
|
|
|
|
r = seccomp_parse_syscall_filter_internal(invert, i, errno_num, filter, whitelist, warn, unit, filename, line);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
int id;
|
|
|
|
|
|
|
|
id = seccomp_syscall_resolve_name(name);
|
|
|
|
if (id == __NR_SCMP_ERROR) {
|
|
|
|
if (warn) {
|
|
|
|
log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse system call, ignoring: %s", name);
|
|
|
|
return 0;
|
|
|
|
} else
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If we previously wanted to forbid a syscall and now
|
|
|
|
* we want to allow it, then remove it from the list. */
|
|
|
|
if (!invert == whitelist) {
|
|
|
|
r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
|
|
|
|
if (r < 0)
|
|
|
|
return warn ? log_oom() : -ENOMEM;
|
|
|
|
} else
|
|
|
|
(void) hashmap_remove(filter, INT_TO_PTR(id + 1));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-11-02 03:25:19 +01:00
|
|
|
int seccomp_restrict_namespaces(unsigned long retain) {
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
uint32_t arch;
|
2016-11-02 03:25:19 +01:00
|
|
|
int r;
|
|
|
|
|
2017-12-15 11:09:00 +01:00
|
|
|
if (DEBUG_LOGGING) {
|
2016-11-02 03:25:19 +01:00
|
|
|
_cleanup_free_ char *s = NULL;
|
|
|
|
|
|
|
|
(void) namespace_flag_to_string_many(retain, &s);
|
|
|
|
log_debug("Restricting namespace to: %s.", strna(s));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* NOOP? */
|
|
|
|
if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
|
|
|
|
return 0;
|
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
|
|
|
unsigned i;
|
2016-11-02 03:25:19 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
|
|
|
|
|
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
if ((retain & NAMESPACE_FLAGS_ALL) == 0)
|
|
|
|
/* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
|
|
|
|
* altogether. */
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(setns),
|
|
|
|
0);
|
|
|
|
else
|
|
|
|
/* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
|
|
|
|
* special invocation with a zero flags argument, right here. */
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(setns),
|
|
|
|
1,
|
|
|
|
SCMP_A1(SCMP_CMP_EQ, 0));
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; namespace_flag_map[i].name; i++) {
|
|
|
|
unsigned long f;
|
|
|
|
|
|
|
|
f = namespace_flag_map[i].flag;
|
|
|
|
if ((retain & f) == f) {
|
|
|
|
log_debug("Permitting %s.", namespace_flag_map[i].name);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
log_debug("Blocking %s.", namespace_flag_map[i].name);
|
|
|
|
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(unshare),
|
|
|
|
1,
|
|
|
|
SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2017-05-05 05:10:30 +02:00
|
|
|
/* On s390/s390x the first two parameters to clone are switched */
|
|
|
|
if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
|
2017-02-08 16:21:11 +01:00
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(clone),
|
|
|
|
1,
|
|
|
|
SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
|
|
|
|
else
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(clone),
|
|
|
|
1,
|
|
|
|
SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(setns),
|
|
|
|
1,
|
|
|
|
SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (r < 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
|
|
|
if (IN_SET(r, -EPERM, -EACCES))
|
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int seccomp_protect_sysctl(void) {
|
|
|
|
uint32_t arch;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
|
|
|
|
|
|
|
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
|
|
|
|
|
2017-07-15 21:28:02 +02:00
|
|
|
if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
|
|
|
|
/* No _sysctl syscall */
|
|
|
|
continue;
|
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
r = seccomp_rule_add_exact(
|
2016-11-02 03:25:19 +01:00
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_SYS(_sysctl),
|
2016-11-02 03:25:19 +01:00
|
|
|
0);
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
|
|
|
if (IN_SET(r, -EPERM, -EACCES))
|
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Installs, separately for each supported local architecture, a seccomp filter that makes socket() fail
 * with EAFNOSUPPORT for certain address families.
 *
 * If 'whitelist' is true, only the families contained in 'address_families' remain usable; if it is false,
 * exactly the families contained in 'address_families' are refused.
 *
 * Only architectures known not to use socketcall() are handled, all others are silently left alone.
 * Architectures for which a rule cannot be added, or for which loading the filter fails with anything but
 * EPERM/EACCES, are skipped with a debug message.
 *
 * Returns 0 on success, the negative error returned by seccomp_init_for_arch(), or -EPERM/-EACCES if
 * seccomp_load() reported one of those. */
int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
        uint32_t arch;
        int r;

        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
                bool supported = false;
                Iterator i;

                log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));

                switch (arch) {

                case SCMP_ARCH_X86_64:
                case SCMP_ARCH_X32:
                case SCMP_ARCH_ARM:
                case SCMP_ARCH_AARCH64:
                case SCMP_ARCH_PPC64:
                case SCMP_ARCH_PPC64LE:
                        /* These we know we support (i.e. are the ones that do not use socketcall()) */
                        supported = true;
                        break;

                case SCMP_ARCH_S390:
                case SCMP_ARCH_S390X:
                case SCMP_ARCH_PPC:
                case SCMP_ARCH_X86:
                default:
                        /* These we either know we don't support (i.e. are the ones that do use
                         * socketcall()), or we don't know */
                        break;
                }

                if (!supported)
                        continue;

                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
                if (r < 0)
                        return r;

                if (whitelist) {
                        int af, lowest = 0, highest = 0;
                        void *afp;

                        /* For a whitelist we first refuse everything outside of the range spanned by the
                         * listed families, and then every in-range family that is not listed. Start by
                         * determining the lowest and highest valid family contained in the set. */

                        SET_FOREACH(afp, address_families, i) {
                                af = PTR_TO_INT(afp);

                                if (af > 0 && af < af_max()) {
                                        if (lowest == 0 || af < lowest)
                                                lowest = af;

                                        if (highest == 0 || af > highest)
                                                highest = af;
                                }
                        }

                        assert((lowest == 0) == (highest == 0));

                        if (lowest != 0) {

                                /* Refuse everything below the lowest listed family */
                                r = seccomp_rule_add_exact(
                                                seccomp,
                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                SCMP_SYS(socket),
                                                1,
                                                SCMP_A0(SCMP_CMP_LT, lowest));
                                if (r < 0) {
                                        log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
                                        continue;
                                }

                                /* Refuse everything above the highest listed family */
                                r = seccomp_rule_add_exact(
                                                seccomp,
                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                SCMP_SYS(socket),
                                                1,
                                                SCMP_A0(SCMP_CMP_GT, highest));
                                if (r < 0) {
                                        log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
                                        continue;
                                }

                                /* Refuse every family in the valid range that is not part of the set */
                                for (af = 1; af < af_max(); af++) {

                                        if (!set_contains(address_families, INT_TO_PTR(af))) {
                                                r = seccomp_rule_add_exact(
                                                                seccomp,
                                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                                SCMP_SYS(socket),
                                                                1,
                                                                SCMP_A0(SCMP_CMP_EQ, af));
                                                if (r < 0)
                                                        break;
                                        }
                                }
                                if (r < 0) {
                                        log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
                                        continue;
                                }

                        } else {

                                /* The set has no entry in the valid range, hence refuse socket() as a whole */
                                r = seccomp_rule_add_exact(
                                                seccomp,
                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                SCMP_SYS(socket),
                                                0);
                                if (r < 0) {
                                        log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
                                        continue;
                                }
                        }

                } else {
                        void *af;

                        /* For a blacklist one rule is generated per listed address family; the rules are
                         * then combined in OR checks. */

                        SET_FOREACH(af, address_families, i) {

                                r = seccomp_rule_add_exact(
                                                seccomp,
                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                SCMP_SYS(socket),
                                                1,
                                                SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
                                if (r < 0)
                                        break;
                        }
                        if (r < 0) {
                                log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
                                continue;
                        }
                }

                r = seccomp_load(seccomp);
                if (r == -EPERM || r == -EACCES)
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
        }

        return 0;
}
|
|
|
|
|
|
|
|
int seccomp_restrict_realtime(void) {
|
|
|
|
static const int permitted_policies[] = {
|
|
|
|
SCHED_OTHER,
|
|
|
|
SCHED_BATCH,
|
|
|
|
SCHED_IDLE,
|
|
|
|
};
|
|
|
|
|
|
|
|
int r, max_policy = 0;
|
|
|
|
uint32_t arch;
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
/* Determine the highest policy constant we want to allow */
|
|
|
|
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
|
|
|
|
if (permitted_policies[i] > max_policy)
|
|
|
|
max_policy = permitted_policies[i];
|
|
|
|
|
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
|
|
|
int p;
|
|
|
|
|
|
|
|
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
|
|
|
|
|
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
/* Go through all policies with lower values than that, and block them -- unless they appear in the
|
|
|
|
* whitelist. */
|
|
|
|
for (p = 0; p < max_policy; p++) {
|
|
|
|
bool good = false;
|
|
|
|
|
|
|
|
/* Check if this is in the whitelist. */
|
|
|
|
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
|
|
|
|
if (permitted_policies[i] == p) {
|
|
|
|
good = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (good)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Deny this policy */
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(sched_setscheduler),
|
|
|
|
1,
|
|
|
|
SCMP_A1(SCMP_CMP_EQ, p));
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
|
|
|
|
* unsigned here, hence no need no check for < 0 values. */
|
|
|
|
r = seccomp_rule_add_exact(
|
2016-11-02 03:25:19 +01:00
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_SYS(sched_setscheduler),
|
2016-11-02 03:25:19 +01:00
|
|
|
1,
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
SCMP_A1(SCMP_CMP_GT, max_policy));
|
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
continue;
|
|
|
|
}
|
2016-11-02 03:25:19 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
r = seccomp_load(seccomp);
|
|
|
|
if (IN_SET(r, -EPERM, -EACCES))
|
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-05-05 05:10:30 +02:00
|
|
|
/* Adds a rule to 'seccomp' that makes system call 'nr' fail with EPERM, qualified by 'arg_cnt' argument
 * comparisons taken from 'arg'. If the rule cannot be added, a debug message naming the system call (as
 * resolved for 'arch') and the architecture is logged.
 *
 * Returns the result of seccomp_rule_add_exact() unmodified. */
static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
                                      uint32_t arch,
                                      int nr,
                                      unsigned int arg_cnt,
                                      const struct scmp_arg_cmp arg) {
        _cleanup_free_ char *syscall_name = NULL;
        int r;

        r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
        if (r >= 0)
                return r;

        /* The name is only needed for the log message; strna() covers the case where it cannot be resolved. */
        syscall_name = seccomp_syscall_resolve_num_arch(arch, nr);
        log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
                        strna(syscall_name),
                        seccomp_arch_to_string(arch));

        return r;
}
|
|
|
|
|
2017-05-05 05:10:30 +02:00
|
|
|
/* For known architectures, check that syscalls are indeed defined or not. */
|
2017-05-05 05:10:31 +02:00
|
|
|
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
|
2017-05-05 05:10:30 +02:00
|
|
|
assert_cc(SCMP_SYS(shmget) > 0);
|
|
|
|
assert_cc(SCMP_SYS(shmat) > 0);
|
|
|
|
assert_cc(SCMP_SYS(shmdt) > 0);
|
|
|
|
#elif defined(__i386__) || defined(__powerpc64__)
|
|
|
|
assert_cc(SCMP_SYS(shmget) < 0);
|
|
|
|
assert_cc(SCMP_SYS(shmat) < 0);
|
|
|
|
assert_cc(SCMP_SYS(shmdt) < 0);
|
|
|
|
#endif
|
2017-05-05 05:10:30 +02:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
int seccomp_memory_deny_write_execute(void) {
|
2017-02-08 15:14:02 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
uint32_t arch;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
2017-02-08 15:14:02 +01:00
|
|
|
int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
|
2016-11-02 03:25:19 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
|
|
|
|
|
2017-02-08 15:14:02 +01:00
|
|
|
switch (arch) {
|
|
|
|
|
|
|
|
case SCMP_ARCH_X86:
|
|
|
|
filter_syscall = SCMP_SYS(mmap2);
|
|
|
|
block_syscall = SCMP_SYS(mmap);
|
2017-05-05 05:10:30 +02:00
|
|
|
break;
|
|
|
|
|
2017-12-22 10:06:29 +01:00
|
|
|
case SCMP_ARCH_PPC:
|
2017-05-05 05:10:30 +02:00
|
|
|
case SCMP_ARCH_PPC64:
|
|
|
|
case SCMP_ARCH_PPC64LE:
|
|
|
|
filter_syscall = SCMP_SYS(mmap);
|
|
|
|
|
|
|
|
/* Note that shmat() isn't available, and the call is multiplexed through ipc().
|
|
|
|
* We ignore that here, which means there's still a way to get writable/executable
|
|
|
|
* memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
|
2017-02-08 15:14:02 +01:00
|
|
|
|
|
|
|
break;
|
|
|
|
|
2017-05-05 05:10:31 +02:00
|
|
|
case SCMP_ARCH_ARM:
|
|
|
|
filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
|
|
|
|
shmat_syscall = SCMP_SYS(shmat);
|
|
|
|
break;
|
|
|
|
|
2017-02-08 15:14:02 +01:00
|
|
|
case SCMP_ARCH_X86_64:
|
|
|
|
case SCMP_ARCH_X32:
|
2017-07-15 21:30:01 +02:00
|
|
|
case SCMP_ARCH_AARCH64:
|
|
|
|
filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
|
2017-02-08 15:14:02 +01:00
|
|
|
shmat_syscall = SCMP_SYS(shmat);
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* Please add more definitions here, if you port systemd to other architectures! */
|
|
|
|
|
2017-12-22 10:06:29 +01:00
|
|
|
#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
|
2017-02-08 15:14:02 +01:00
|
|
|
#warning "Consider adding the right mmap() syscall definitions here!"
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Can't filter mmap() on this arch, then skip it */
|
|
|
|
if (filter_syscall == 0)
|
|
|
|
continue;
|
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
2017-05-05 05:10:30 +02:00
|
|
|
r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
|
|
|
|
1,
|
|
|
|
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
|
|
|
|
if (r < 0)
|
|
|
|
continue;
|
2017-02-08 15:14:02 +01:00
|
|
|
|
|
|
|
if (block_syscall != 0) {
|
2017-05-05 05:10:30 +02:00
|
|
|
r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
|
|
|
|
if (r < 0)
|
2017-02-08 15:14:02 +01:00
|
|
|
continue;
|
2016-11-02 03:25:19 +01:00
|
|
|
}
|
2016-10-21 21:18:46 +02:00
|
|
|
|
2017-05-05 05:10:30 +02:00
|
|
|
r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
|
|
|
|
1,
|
2017-11-12 17:28:48 +01:00
|
|
|
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
|
|
|
|
if (r < 0)
|
|
|
|
continue;
|
|
|
|
|
2017-11-13 09:35:49 +01:00
|
|
|
#ifdef __NR_pkey_mprotect
|
2017-11-12 17:28:48 +01:00
|
|
|
r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
|
|
|
|
1,
|
2017-05-05 05:10:30 +02:00
|
|
|
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
|
|
|
|
if (r < 0)
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
continue;
|
2017-11-13 09:35:49 +01:00
|
|
|
#endif
|
2016-11-02 03:25:19 +01:00
|
|
|
|
2017-02-08 15:14:02 +01:00
|
|
|
if (shmat_syscall != 0) {
|
2017-05-05 05:10:30 +02:00
|
|
|
r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
|
|
|
|
1,
|
|
|
|
SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
|
|
|
|
if (r < 0)
|
2017-02-08 15:14:02 +01:00
|
|
|
continue;
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
|
|
|
if (IN_SET(r, -EPERM, -EACCES))
|
|
|
|
return r;
|
2016-11-02 03:25:19 +01:00
|
|
|
if (r < 0)
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
2016-11-02 03:25:19 +01:00
|
|
|
|
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better
compatibility with some architectures, including i386.
So far we relied on libseccomp's internal handling of the multiple
syscall ABIs supported on Linux. This is problematic however, as it does
not define clear semantics if an ABI is not able to support specific
seccomp rules we install.
This rework hence changes a couple of things:
- We no longer use seccomp_rule_add(), but only
seccomp_rule_add_exact(), and fail the installation of a filter if the
architecture doesn't support it.
- We no longer rely on adding multiple syscall architectures to a single filter,
but instead install a separate filter for each syscall architecture
supported. This way, we can install a strict filter for x86-64, while
permitting a less strict filter for i386.
- All high-level filter additions are now moved from execute.c to
seccomp-util.c, so that we can test them independently of the service
execution logic.
- Tests have been added for all types of our seccomp filters.
- SystemCallFilters= and SystemCallArchitectures= are now implemented in
independent filters and installation logic, as they semantically are
very much independent of each other.
Fixes: #4575
2016-12-27 15:28:25 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Installs a filter that carries no rules at all (default action: allow), but restricts the system call
 * architectures to those listed in 'archs'.
 *
 * Each entry of 'archs' is decoded as PTR_TO_UINT32(entry) - 1, i.e. the set is expected to hold the
 * architecture identifier plus one.
 *
 * Returns 0 on success, and also when seccomp_load() fails with anything other than EPERM/EACCES (that is
 * only logged at debug level). Returns -ENOMEM if the filter context cannot be allocated, or the negative
 * error of seccomp_arch_add(), seccomp_attr_set(), or seccomp_load() (EPERM/EACCES only). */
int seccomp_restrict_archs(Set *archs) {
        _cleanup_(seccomp_releasep) scmp_filter_ctx ctx = NULL;
        Iterator it;
        void *entry;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return -ENOMEM;

        SET_FOREACH(entry, archs, it) {
                r = seccomp_arch_add(ctx, PTR_TO_UINT32(entry) - 1);

                /* An architecture that is already part of the filter is not an error. */
                if (r < 0 && r != -EEXIST)
                        return r;
        }

        /* NOTE(review): per the libseccomp documentation, clearing SCMP_FLTATR_CTL_NNP keeps
         * seccomp_load() from enabling no_new_privs on its own - confirm against the library version
         * in use. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0)
                return r;

        r = seccomp_load(ctx);
        if (r == -EPERM || r == -EACCES)
                return r;
        if (r < 0)
                log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");

        return 0;
}
|
2017-08-02 06:46:45 +02:00
|
|
|
|
|
|
|
/* Converts the string vector 'l' of system call architecture names into a Set, which is stored in
 * '*archs' on success. Every architecture is stored as UINT32_TO_PTR(a + 1), where 'a' is the identifier
 * produced by seccomp_arch_from_string().
 *
 * Returns 0 on success. Returns -EINVAL if a name is not recognized, -ENOMEM if an entry cannot be
 * inserted, or the error of set_ensure_allocated(). '*archs' is only written on success. */
int parse_syscall_archs(char **l, Set **archs) {
        /* Must start out as NULL: set_ensure_allocated() is handed a pointer to this variable, and the
         * cleanup attribute acts on it on every early return. Leaving it uninitialized means reading
         * (and possibly freeing) an indeterminate pointer. */
        _cleanup_set_free_ Set *_archs = NULL;
        char **s;
        int r;

        assert(l);
        assert(archs);

        r = set_ensure_allocated(&_archs, NULL);
        if (r < 0)
                return r;

        STRV_FOREACH(s, l) {
                uint32_t a;

                r = seccomp_arch_from_string(*s, &a);
                if (r < 0)
                        return -EINVAL;

                r = set_put(_archs, UINT32_TO_PTR(a + 1));
                if (r < 0)
                        return -ENOMEM;
        }

        /* Hand the set over to the caller, and disarm the cleanup handler. */
        *archs = _archs;
        _archs = NULL;

        return 0;
}
|
2017-08-09 16:09:04 +02:00
|
|
|
|
2017-11-11 13:35:49 +01:00
|
|
|
/* Adds all system calls of the filter set 'set' to the hashmap 'filter' (if 'add' is true), or removes
 * them from it (if 'add' is false). Entries of the set that begin with '@' name other filter sets, which
 * are looked up and processed recursively.
 *
 * System calls are keyed by INT_TO_PTR(id + 1) and inserted with the value INT_TO_PTR(-1). Names that
 * seccomp_syscall_resolve_name() cannot resolve are logged at debug level and skipped.
 *
 * Returns 0 on success, -ENXIO if a referenced '@' set is unknown, or the negative error of hashmap_put()
 * or of the recursive invocation. */
int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
        const char *name;
        int r;

        assert(set);

        NULSTR_FOREACH(name, set->value) {
                int id;

                if (name[0] == '@') {
                        const SyscallFilterSet *nested;

                        nested = syscall_filter_set_find(name);
                        if (!nested)
                                return -ENXIO;

                        r = seccomp_filter_set_add(filter, add, nested);
                        if (r < 0)
                                return r;

                        continue;
                }

                id = seccomp_syscall_resolve_name(name);
                if (id == __NR_SCMP_ERROR) {
                        log_debug("Couldn't resolve system call, ignoring: %s", name);
                        continue;
                }

                if (!add) {
                        /* Removing an entry that is not there is fine, hence the result is ignored. */
                        (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
                        continue;
                }

                r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
                if (r < 0)
                        return r;
        }

        return 0;
}
|
2017-07-04 14:48:18 +02:00
|
|
|
|
|
|
|
int seccomp_lock_personality(unsigned long personality) {
|
2017-08-09 20:43:35 +02:00
|
|
|
uint32_t arch;
|
2017-07-04 14:48:18 +02:00
|
|
|
int r;
|
|
|
|
|
2017-08-09 20:43:35 +02:00
|
|
|
if (personality >= PERSONALITY_INVALID)
|
|
|
|
return -EINVAL;
|
2017-07-04 14:48:18 +02:00
|
|
|
|
2017-08-09 20:43:35 +02:00
|
|
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
|
|
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
2017-07-04 14:48:18 +02:00
|
|
|
|
2017-08-09 20:43:35 +02:00
|
|
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
r = seccomp_rule_add_exact(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(personality),
|
|
|
|
1,
|
|
|
|
SCMP_A0(SCMP_CMP_NE, personality));
|
2017-10-05 11:26:09 +02:00
|
|
|
if (r < 0) {
|
|
|
|
log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
continue;
|
|
|
|
}
|
2017-08-09 20:43:35 +02:00
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
|
|
|
if (IN_SET(r, -EPERM, -EACCES))
|
|
|
|
return r;
|
|
|
|
if (r < 0)
|
|
|
|
log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
2017-07-04 14:48:18 +02:00
|
|
|
}
|