2014-02-13 00:24:00 +01:00
|
|
|
/***
  This file is part of systemd.

  Copyright 2014 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
|
|
|
|
|
2015-12-03 21:13:37 +01:00
|
|
|
#include <errno.h>
|
2014-02-13 00:24:00 +01:00
|
|
|
#include <seccomp.h>
|
2015-12-03 21:13:37 +01:00
|
|
|
#include <stddef.h>
|
2016-08-31 15:00:35 +02:00
|
|
|
#include <sys/prctl.h>
|
|
|
|
#include <linux/seccomp.h>
|
2014-02-13 00:24:00 +01:00
|
|
|
|
2016-11-02 03:25:19 +01:00
|
|
|
#include "alloc-util.h"
|
2015-12-03 21:13:37 +01:00
|
|
|
#include "macro.h"
|
2016-11-02 03:25:19 +01:00
|
|
|
#include "nsflags.h"
|
2015-11-16 22:09:36 +01:00
|
|
|
#include "seccomp-util.h"
|
2015-10-24 22:58:24 +02:00
|
|
|
#include "string-util.h"
|
2016-10-21 21:50:05 +02:00
|
|
|
#include "util.h"
|
2014-02-13 00:24:00 +01:00
|
|
|
|
|
|
|
const char* seccomp_arch_to_string(uint32_t c) {
|
2016-11-01 16:33:18 +01:00
|
|
|
/* Maintain order used in <seccomp.h>.
|
|
|
|
*
|
|
|
|
* Names used here should be the same as those used for ConditionArchitecture=,
|
|
|
|
* except for "subarchitectures" like x32. */
|
2014-02-13 00:24:00 +01:00
|
|
|
|
2016-11-01 16:33:18 +01:00
|
|
|
switch(c) {
|
|
|
|
case SCMP_ARCH_NATIVE:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "native";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_X86:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "x86";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_X86_64:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "x86-64";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_X32:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "x32";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_ARM:
|
2014-02-13 00:24:00 +01:00
|
|
|
return "arm";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_AARCH64:
|
|
|
|
return "arm64";
|
|
|
|
case SCMP_ARCH_MIPS:
|
|
|
|
return "mips";
|
|
|
|
case SCMP_ARCH_MIPS64:
|
|
|
|
return "mips64";
|
|
|
|
case SCMP_ARCH_MIPS64N32:
|
|
|
|
return "mips64-n32";
|
|
|
|
case SCMP_ARCH_MIPSEL:
|
|
|
|
return "mips-le";
|
|
|
|
case SCMP_ARCH_MIPSEL64:
|
|
|
|
return "mips64-le";
|
|
|
|
case SCMP_ARCH_MIPSEL64N32:
|
|
|
|
return "mips64-le-n32";
|
|
|
|
case SCMP_ARCH_PPC:
|
|
|
|
return "ppc";
|
|
|
|
case SCMP_ARCH_PPC64:
|
|
|
|
return "ppc64";
|
|
|
|
case SCMP_ARCH_PPC64LE:
|
|
|
|
return "ppc64-le";
|
|
|
|
case SCMP_ARCH_S390:
|
2016-10-05 13:58:55 +02:00
|
|
|
return "s390";
|
2016-11-01 16:33:18 +01:00
|
|
|
case SCMP_ARCH_S390X:
|
2016-10-05 13:58:55 +02:00
|
|
|
return "s390x";
|
2016-11-01 16:33:18 +01:00
|
|
|
default:
|
|
|
|
return NULL;
|
|
|
|
}
|
2014-02-13 00:24:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
int seccomp_arch_from_string(const char *n, uint32_t *ret) {
|
|
|
|
if (!n)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
assert(ret);
|
|
|
|
|
|
|
|
if (streq(n, "native"))
|
|
|
|
*ret = SCMP_ARCH_NATIVE;
|
|
|
|
else if (streq(n, "x86"))
|
|
|
|
*ret = SCMP_ARCH_X86;
|
|
|
|
else if (streq(n, "x86-64"))
|
|
|
|
*ret = SCMP_ARCH_X86_64;
|
|
|
|
else if (streq(n, "x32"))
|
|
|
|
*ret = SCMP_ARCH_X32;
|
|
|
|
else if (streq(n, "arm"))
|
|
|
|
*ret = SCMP_ARCH_ARM;
|
2016-11-01 16:33:18 +01:00
|
|
|
else if (streq(n, "arm64"))
|
|
|
|
*ret = SCMP_ARCH_AARCH64;
|
|
|
|
else if (streq(n, "mips"))
|
|
|
|
*ret = SCMP_ARCH_MIPS;
|
|
|
|
else if (streq(n, "mips64"))
|
|
|
|
*ret = SCMP_ARCH_MIPS64;
|
|
|
|
else if (streq(n, "mips64-n32"))
|
|
|
|
*ret = SCMP_ARCH_MIPS64N32;
|
|
|
|
else if (streq(n, "mips-le"))
|
|
|
|
*ret = SCMP_ARCH_MIPSEL;
|
|
|
|
else if (streq(n, "mips64-le"))
|
|
|
|
*ret = SCMP_ARCH_MIPSEL64;
|
|
|
|
else if (streq(n, "mips64-le-n32"))
|
|
|
|
*ret = SCMP_ARCH_MIPSEL64N32;
|
|
|
|
else if (streq(n, "ppc"))
|
|
|
|
*ret = SCMP_ARCH_PPC;
|
|
|
|
else if (streq(n, "ppc64"))
|
|
|
|
*ret = SCMP_ARCH_PPC64;
|
|
|
|
else if (streq(n, "ppc64-le"))
|
|
|
|
*ret = SCMP_ARCH_PPC64LE;
|
2016-10-05 13:58:55 +02:00
|
|
|
else if (streq(n, "s390"))
|
|
|
|
*ret = SCMP_ARCH_S390;
|
|
|
|
else if (streq(n, "s390x"))
|
|
|
|
*ret = SCMP_ARCH_S390X;
|
2014-02-13 00:24:00 +01:00
|
|
|
else
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2014-02-18 22:14:00 +01:00
|
|
|
|
2016-10-21 20:28:05 +02:00
|
|
|
int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) {
        scmp_filter_ctx ctx;
        int r;

        /* A variant of seccomp_init() with more conservative defaults: every secondary architecture is added
         * to the filter right away, and the NNP attribute is switched off.
         *
         * On success the new context is stored in *ret and 0 is returned. On failure a negative error is
         * returned and the context is released again. */

        ctx = seccomp_init(default_action);
        if (!ctx)
                return -ENOMEM;

        r = seccomp_add_secondary_archs(ctx);
        if (r >= 0)
                r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                seccomp_release(ctx);
                return r;
        }

        *ret = ctx;
        return 0;
}
|
|
|
|
|
2016-11-01 16:33:18 +01:00
|
|
|
int seccomp_add_secondary_archs(scmp_filter_ctx ctx) {

        /* Add in all possible secondary archs we are aware of that
         * this kernel might support.
         *
         * The list is chosen at compile time from the architecture family we are built for. Returns 0 on
         * success, or the first negative error from seccomp_arch_add() other than -EEXIST. */

        static const int seccomp_arches[] = {
#if defined(__i386__) || defined(__x86_64__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X86_64,
                SCMP_ARCH_X32,

#elif defined(__arm__) || defined(__aarch64__)
                SCMP_ARCH_ARM,
                SCMP_ARCH_AARCH64,

#elif defined(__mips__) || defined(__mips64__)
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPSEL64N32,

#elif defined(__powerpc__) || defined(__powerpc64__)
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64,
                SCMP_ARCH_PPC64LE,

#elif defined(__s390__) || defined(__s390x__)
                SCMP_ARCH_S390,
                SCMP_ARCH_S390X,
#endif
        };

        unsigned i;
        int r;

        for (i = 0; i < ELEMENTSOF(seccomp_arches); i++) {
                r = seccomp_arch_add(ctx, seccomp_arches[i]);
                /* -EEXIST is tolerated: the list includes the native architecture, which presumably is
                 * already part of the filter. */
                if (r < 0 && r != -EEXIST)
                        return r;
        }

        return 0;
}
|
2016-06-01 11:56:01 +02:00
|
|
|
|
2016-08-31 15:00:35 +02:00
|
|
|
/* Probes for basic seccomp support: true if querying the current seccomp mode via prctl() succeeds. */
static bool is_basic_seccomp_available(void) {
        return prctl(PR_GET_SECCOMP, 0, 0, 0, 0) >= 0;
}
|
|
|
|
|
|
|
|
/* Probes for seccomp filter mode by trying to install a NULL filter program: the probe counts as successful
 * only if the call fails with EFAULT; any other outcome is reported as "not available". */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
|
|
|
|
|
2016-08-22 21:40:58 +02:00
|
|
|
bool is_seccomp_available(void) {
        /* Negative means "not probed yet"; afterwards this holds the probe result (0 or 1). */
        static int probed = -1;

        if (probed >= 0)
                return probed;

        /* Both the basic seccomp check and the filter mode check have to pass. */
        probed = is_basic_seccomp_available() && is_seccomp_filter_available();
        return probed;
}
|
|
|
|
|
2016-10-21 21:50:05 +02:00
|
|
|
const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
|
2016-11-02 17:01:04 +01:00
|
|
|
[SYSCALL_FILTER_SET_DEFAULT] = {
|
|
|
|
.name = "@default",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "System calls that are always permitted",
|
2016-11-02 17:01:04 +01:00
|
|
|
.value =
|
|
|
|
"clock_getres\0"
|
|
|
|
"clock_gettime\0"
|
|
|
|
"clock_nanosleep\0"
|
|
|
|
"execve\0"
|
|
|
|
"exit\0"
|
|
|
|
"exit_group\0"
|
|
|
|
"getrlimit\0" /* make sure processes can query stack size and such */
|
|
|
|
"gettimeofday\0"
|
|
|
|
"nanosleep\0"
|
|
|
|
"pause\0"
|
|
|
|
"rt_sigreturn\0"
|
|
|
|
"sigreturn\0"
|
|
|
|
"time\0"
|
|
|
|
},
|
2016-11-02 15:46:18 +01:00
|
|
|
[SYSCALL_FILTER_SET_BASIC_IO] = {
|
|
|
|
.name = "@basic-io",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Basic IO",
|
2016-11-02 15:46:18 +01:00
|
|
|
.value =
|
|
|
|
"close\0"
|
|
|
|
"dup2\0"
|
|
|
|
"dup3\0"
|
|
|
|
"dup\0"
|
|
|
|
"lseek\0"
|
|
|
|
"pread64\0"
|
|
|
|
"preadv\0"
|
|
|
|
"pwrite64\0"
|
|
|
|
"pwritev\0"
|
|
|
|
"read\0"
|
|
|
|
"readv\0"
|
|
|
|
"write\0"
|
|
|
|
"writev\0"
|
|
|
|
},
|
2016-10-21 21:50:05 +02:00
|
|
|
[SYSCALL_FILTER_SET_CLOCK] = {
|
|
|
|
.name = "@clock",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Change the system time",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"adjtimex\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"clock_adjtime\0"
|
|
|
|
"clock_settime\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"settimeofday\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"stime\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_CPU_EMULATION] = {
|
|
|
|
.name = "@cpu-emulation",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "System calls for CPU emulation functionality",
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
.value =
|
|
|
|
"modify_ldt\0"
|
|
|
|
"subpage_prot\0"
|
|
|
|
"switch_endian\0"
|
|
|
|
"vm86\0"
|
|
|
|
"vm86old\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_DEBUG] = {
|
|
|
|
.name = "@debug",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Debugging, performance monitoring and tracing functionality",
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
.value =
|
|
|
|
"lookup_dcookie\0"
|
|
|
|
"perf_event_open\0"
|
|
|
|
"process_vm_readv\0"
|
|
|
|
"process_vm_writev\0"
|
|
|
|
"ptrace\0"
|
|
|
|
"rtas\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#ifdef __NR_s390_runtime_instr
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"s390_runtime_instr\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#endif
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"sys_debug_setcontext\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_IO_EVENT] = {
|
|
|
|
.name = "@io-event",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Event loop system calls",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"_newselect\0"
|
|
|
|
"epoll_create1\0"
|
|
|
|
"epoll_create\0"
|
|
|
|
"epoll_ctl\0"
|
|
|
|
"epoll_ctl_old\0"
|
|
|
|
"epoll_pwait\0"
|
|
|
|
"epoll_wait\0"
|
|
|
|
"epoll_wait_old\0"
|
|
|
|
"eventfd2\0"
|
|
|
|
"eventfd\0"
|
|
|
|
"poll\0"
|
|
|
|
"ppoll\0"
|
|
|
|
"pselect6\0"
|
|
|
|
"select\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_IPC] = {
|
|
|
|
.name = "@ipc",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "SysV IPC, POSIX Message Queues or other IPC",
|
|
|
|
.value =
|
|
|
|
"ipc\0"
|
2016-10-25 15:43:31 +02:00
|
|
|
"memfd_create\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"mq_getsetattr\0"
|
|
|
|
"mq_notify\0"
|
|
|
|
"mq_open\0"
|
|
|
|
"mq_timedreceive\0"
|
|
|
|
"mq_timedsend\0"
|
|
|
|
"mq_unlink\0"
|
|
|
|
"msgctl\0"
|
|
|
|
"msgget\0"
|
|
|
|
"msgrcv\0"
|
|
|
|
"msgsnd\0"
|
2016-10-25 15:43:31 +02:00
|
|
|
"pipe2\0"
|
|
|
|
"pipe\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"process_vm_readv\0"
|
|
|
|
"process_vm_writev\0"
|
|
|
|
"semctl\0"
|
|
|
|
"semget\0"
|
|
|
|
"semop\0"
|
|
|
|
"semtimedop\0"
|
|
|
|
"shmat\0"
|
|
|
|
"shmctl\0"
|
|
|
|
"shmdt\0"
|
|
|
|
"shmget\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_KEYRING] = {
|
|
|
|
.name = "@keyring",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Kernel keyring access",
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
.value =
|
|
|
|
"add_key\0"
|
|
|
|
"keyctl\0"
|
|
|
|
"request_key\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_MODULE] = {
|
|
|
|
.name = "@module",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Loading and unloading of kernel modules",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"delete_module\0"
|
|
|
|
"finit_module\0"
|
|
|
|
"init_module\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_MOUNT] = {
|
|
|
|
.name = "@mount",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Mounting and unmounting of file systems",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"chroot\0"
|
|
|
|
"mount\0"
|
|
|
|
"pivot_root\0"
|
|
|
|
"umount2\0"
|
|
|
|
"umount\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_NETWORK_IO] = {
|
|
|
|
.name = "@network-io",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Network or Unix socket IO, should not be needed if not network facing",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"accept4\0"
|
|
|
|
"accept\0"
|
|
|
|
"bind\0"
|
|
|
|
"connect\0"
|
|
|
|
"getpeername\0"
|
|
|
|
"getsockname\0"
|
|
|
|
"getsockopt\0"
|
|
|
|
"listen\0"
|
|
|
|
"recv\0"
|
|
|
|
"recvfrom\0"
|
|
|
|
"recvmmsg\0"
|
|
|
|
"recvmsg\0"
|
|
|
|
"send\0"
|
|
|
|
"sendmmsg\0"
|
|
|
|
"sendmsg\0"
|
|
|
|
"sendto\0"
|
|
|
|
"setsockopt\0"
|
|
|
|
"shutdown\0"
|
|
|
|
"socket\0"
|
|
|
|
"socketcall\0"
|
|
|
|
"socketpair\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_OBSOLETE] = {
|
2016-11-02 17:24:34 +01:00
|
|
|
/* some unknown even to libseccomp */
|
2016-10-21 21:50:05 +02:00
|
|
|
.name = "@obsolete",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Unusual, obsolete or unimplemented system calls",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"_sysctl\0"
|
|
|
|
"afs_syscall\0"
|
|
|
|
"break\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"create_module\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"ftime\0"
|
|
|
|
"get_kernel_syms\0"
|
|
|
|
"getpmsg\0"
|
|
|
|
"gtty\0"
|
|
|
|
"lock\0"
|
|
|
|
"mpx\0"
|
|
|
|
"prof\0"
|
|
|
|
"profil\0"
|
|
|
|
"putpmsg\0"
|
|
|
|
"query_module\0"
|
|
|
|
"security\0"
|
|
|
|
"sgetmask\0"
|
|
|
|
"ssetmask\0"
|
|
|
|
"stty\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"sysfs\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"tuxcall\0"
|
|
|
|
"ulimit\0"
|
|
|
|
"uselib\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"ustat\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"vserver\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_PRIVILEGED] = {
|
|
|
|
.name = "@privileged",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "All system calls which need super-user capabilities",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"@clock\0"
|
|
|
|
"@module\0"
|
|
|
|
"@raw-io\0"
|
|
|
|
"acct\0"
|
|
|
|
"bdflush\0"
|
|
|
|
"bpf\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"capset\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"chown32\0"
|
|
|
|
"chown\0"
|
|
|
|
"chroot\0"
|
|
|
|
"fchown32\0"
|
|
|
|
"fchown\0"
|
|
|
|
"fchownat\0"
|
|
|
|
"kexec_file_load\0"
|
|
|
|
"kexec_load\0"
|
|
|
|
"lchown32\0"
|
|
|
|
"lchown\0"
|
|
|
|
"nfsservctl\0"
|
|
|
|
"pivot_root\0"
|
|
|
|
"quotactl\0"
|
|
|
|
"reboot\0"
|
|
|
|
"setdomainname\0"
|
|
|
|
"setfsuid32\0"
|
|
|
|
"setfsuid\0"
|
|
|
|
"setgroups32\0"
|
|
|
|
"setgroups\0"
|
|
|
|
"sethostname\0"
|
|
|
|
"setresuid32\0"
|
|
|
|
"setresuid\0"
|
|
|
|
"setreuid32\0"
|
|
|
|
"setreuid\0"
|
|
|
|
"setuid32\0"
|
|
|
|
"setuid\0"
|
|
|
|
"swapoff\0"
|
|
|
|
"swapon\0"
|
2016-10-21 21:15:43 +02:00
|
|
|
"_sysctl\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"vhangup\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_PROCESS] = {
|
|
|
|
.name = "@process",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Process control, execution, namespaceing operations",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"arch_prctl\0"
|
|
|
|
"clone\0"
|
|
|
|
"execveat\0"
|
|
|
|
"fork\0"
|
|
|
|
"kill\0"
|
|
|
|
"prctl\0"
|
|
|
|
"setns\0"
|
|
|
|
"tgkill\0"
|
|
|
|
"tkill\0"
|
|
|
|
"unshare\0"
|
|
|
|
"vfork\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
},
|
|
|
|
[SYSCALL_FILTER_SET_RAW_IO] = {
|
|
|
|
.name = "@raw-io",
|
2016-11-02 17:24:34 +01:00
|
|
|
.help = "Raw I/O port access",
|
2016-06-01 11:56:01 +02:00
|
|
|
.value =
|
|
|
|
"ioperm\0"
|
|
|
|
"iopl\0"
|
core: improve seccomp syscall grouping a bit
This adds three new seccomp syscall groups: @keyring for kernel keyring access,
@cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and
suchlike, and @debug for ptrace() and related calls.
Also, the @clock group is updated with more syscalls that alter the system
clock. capset() is added to @privileged, and pciconfig_iobase() is added to
@raw-io.
Finally, @obsolete is a cleaned up. A number of syscalls that never existed on
Linux and have no number assigned on any architecture are removed, as they only
exist in the man pages and other operating sytems, but not in code at all.
create_module() is moved from @module to @obsolete, as it is an obsolete system
call. mem_getpolicy() is removed from the @obsolete list, as it is not
obsolete, but simply a NUMA API.
2016-06-10 17:43:38 +02:00
|
|
|
"pciconfig_iobase\0"
|
2016-06-01 11:56:01 +02:00
|
|
|
"pciconfig_read\0"
|
|
|
|
"pciconfig_write\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#ifdef __NR_s390_pci_mmio_read
|
2016-06-01 11:56:01 +02:00
|
|
|
"s390_pci_mmio_read\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#endif
|
|
|
|
#ifdef __NR_s390_pci_mmio_write
|
2016-06-01 11:56:01 +02:00
|
|
|
"s390_pci_mmio_write\0"
|
2016-10-21 21:50:05 +02:00
|
|
|
#endif
|
|
|
|
},
|
2016-11-02 15:46:18 +01:00
|
|
|
[SYSCALL_FILTER_SET_RESOURCES] = {
|
|
|
|
/* Alter resource settings */
|
|
|
|
.name = "@resources",
|
|
|
|
.value =
|
|
|
|
"sched_setparam\0"
|
|
|
|
"sched_setscheduler\0"
|
|
|
|
"sched_setaffinity\0"
|
|
|
|
"setpriority\0"
|
|
|
|
"setrlimit\0"
|
|
|
|
"set_mempolicy\0"
|
|
|
|
"migrate_pages\0"
|
|
|
|
"move_pages\0"
|
|
|
|
"mbind\0"
|
|
|
|
"sched_setattr\0"
|
|
|
|
"prlimit64\0"
|
|
|
|
},
|
2016-06-01 11:56:01 +02:00
|
|
|
};
|
2016-10-21 21:50:05 +02:00
|
|
|
|
|
|
|
const SyscallFilterSet *syscall_filter_set_find(const char *name) {
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
if (isempty(name) || name[0] != '@')
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
|
|
|
|
if (streq(syscall_filter_sets[i].name, name))
|
|
|
|
return syscall_filter_sets + i;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) {
        const char *entry;

        /* Adds one rule with the given action for every system call listed in the set. Entries starting with
         * "@" name another set, which is added recursively.
         *
         * Returns 0 on success, -EINVAL if a nested set or a system call name cannot be resolved, or the
         * negative error of the first rule that could not be added. Processing stops at the first failure. */

        assert(seccomp);
        assert(set);

        NULSTR_FOREACH(entry, set->value) {
                int r;

                if (entry[0] != '@') {
                        int nr;

                        nr = seccomp_syscall_resolve_name(entry);
                        if (nr == __NR_SCMP_ERROR)
                                return -EINVAL;

                        r = seccomp_rule_add(seccomp, action, nr, 0);
                } else {
                        const SyscallFilterSet *nested;

                        nested = syscall_filter_set_find(entry);
                        if (!nested)
                                return -EINVAL;

                        r = seccomp_add_syscall_filter_set(seccomp, nested, action);
                }

                if (r < 0)
                        return r;
        }

        return 0;
}
|
2016-10-21 21:18:46 +02:00
|
|
|
|
|
|
|
int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
|
|
|
|
scmp_filter_ctx seccomp;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(set);
|
|
|
|
|
|
|
|
/* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */
|
|
|
|
|
|
|
|
r = seccomp_init_conservative(&seccomp, default_action);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
r = seccomp_add_syscall_filter_set(seccomp, set, action);
|
|
|
|
if (r < 0)
|
|
|
|
goto finish;
|
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
|
|
|
|
|
|
|
finish:
|
|
|
|
seccomp_release(seccomp);
|
|
|
|
return r;
|
2016-11-02 03:25:19 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
int seccomp_restrict_namespaces(unsigned long retain) {
|
|
|
|
scmp_filter_ctx seccomp;
|
|
|
|
unsigned i;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
if (log_get_max_level() >= LOG_DEBUG) {
|
|
|
|
_cleanup_free_ char *s = NULL;
|
|
|
|
|
|
|
|
(void) namespace_flag_to_string_many(retain, &s);
|
|
|
|
log_debug("Restricting namespace to: %s.", strna(s));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* NOOP? */
|
|
|
|
if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
if ((retain & NAMESPACE_FLAGS_ALL) == 0)
|
|
|
|
/* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
|
|
|
|
* altogether. */
|
|
|
|
r = seccomp_rule_add(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(setns),
|
|
|
|
0);
|
|
|
|
else
|
|
|
|
/* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
|
|
|
|
* special invocation with a zero flags argument, right here. */
|
|
|
|
r = seccomp_rule_add(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(setns),
|
|
|
|
1,
|
|
|
|
SCMP_A1(SCMP_CMP_EQ, 0));
|
|
|
|
if (r < 0)
|
|
|
|
goto finish;
|
|
|
|
|
|
|
|
for (i = 0; namespace_flag_map[i].name; i++) {
|
|
|
|
unsigned long f;
|
|
|
|
|
|
|
|
f = namespace_flag_map[i].flag;
|
|
|
|
if ((retain & f) == f) {
|
|
|
|
log_debug("Permitting %s.", namespace_flag_map[i].name);
|
|
|
|
continue;
|
|
|
|
}
|
2016-10-21 21:18:46 +02:00
|
|
|
|
2016-11-02 03:25:19 +01:00
|
|
|
log_debug("Blocking %s.", namespace_flag_map[i].name);
|
|
|
|
|
|
|
|
r = seccomp_rule_add(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(unshare),
|
|
|
|
1,
|
|
|
|
SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
|
|
|
|
if (r < 0)
|
|
|
|
goto finish;
|
|
|
|
|
|
|
|
r = seccomp_rule_add(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(clone),
|
|
|
|
1,
|
|
|
|
SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
|
|
|
|
if (r < 0)
|
|
|
|
goto finish;
|
|
|
|
|
|
|
|
if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
|
|
|
|
r = seccomp_rule_add(
|
|
|
|
seccomp,
|
|
|
|
SCMP_ACT_ERRNO(EPERM),
|
|
|
|
SCMP_SYS(setns),
|
|
|
|
1,
|
|
|
|
SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
|
|
|
|
if (r < 0)
|
|
|
|
goto finish;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
r = seccomp_load(seccomp);
|
|
|
|
|
|
|
|
finish:
|
|
|
|
seccomp_release(seccomp);
|
|
|
|
return r;
|
2016-10-21 21:18:46 +02:00
|
|
|
}
|