Merge pull request #4450 from poettering/seccompfixes

Various seccomp fixes and NEWS update.
This commit is contained in:
Zbigniew Jędrzejewski-Szmek 2016-10-24 20:23:21 -04:00 committed by GitHub
commit 8d3eafa161
10 changed files with 438 additions and 267 deletions

View File

@ -1558,6 +1558,11 @@ tests += \
test-acl-util
endif
if HAVE_SECCOMP
tests += \
test-seccomp
endif
EXTRA_DIST += \
test/a.service \
test/basic.target \
@ -2026,6 +2031,12 @@ test_acl_util_SOURCES = \
test_acl_util_LDADD = \
libsystemd-shared.la
test_seccomp_SOURCES = \
src/test/test-seccomp.c
test_seccomp_LDADD = \
libsystemd-shared.la
test_namespace_LDADD = \
libcore.la

97
NEWS
View File

@ -35,14 +35,14 @@ CHANGES WITH 232 in spe
ProtectSystem=strict enabled, so they are not able to make any
permanent modifications to the system.
The nss-systemd module also always resolves root and nobody, making
* The nss-systemd module also always resolves root and nobody, making
it possible to have no /etc/passwd or /etc/group files in minimal
container systems.
container or chroot environments.
* Services may be started with their own user namespace using the new
PrivateUsers= option. Only root, nobody, and the uid/gid under which
the service is running are mapped. All other users are mapped to
nobody.
boolean PrivateUsers= option. Only root, nobody, and the uid/gid
under which the service is running are mapped. All other users are
mapped to nobody.
* Support for the cgroup namespace has been added to systemd-nspawn. If
supported by kernel, the container system started by systemd-nspawn
@ -57,12 +57,22 @@ CHANGES WITH 232 in spe
options. This controller requires out-of-tree patches for the kernel
and the support is provisional.
* .automount units may now be transient.
* Mount and automount units may now be created transiently
(i.e. dynamically at runtime via the bus API, instead of requiring
unit files in the file system).
* systemd-mount is a new tool which wraps mount(8) to pull in
additional dependencies through transient .mount and .automount
units. For example, this automatically runs fsck on the block device
before mounting, and allows the automount logic to be used.
* systemd-mount is a new tool which may mount file systems much like
mount(8), optionally pulling in additional dependencies through
transient .mount and .automount units. For example, this tool
automatically runs fsck on a backing block device before mounting,
and allows the automount logic to be used dynamically from the
command line for establishing mount points. This tool is particularly
useful when dealing with removable media, as it will ensure fsck is
run if necessary before the first access and that the file system
is quickly unmounted after each access by utilizing the automount
logic. This maximizes the chance that the file system on the
removable media stays in a clean state, and if it isn't in a clean
state is fixed automatically.
* LazyUnmount=yes option for mount units has been added to expose the
umount --lazy option. Similarly, ForceUnmount=yes exposes the --force
@ -75,6 +85,12 @@ CHANGES WITH 232 in spe
mount the EFI partition on systems where /boot is used for something
else.
* When operating on GPT disk images for containers, systemd-nspawn will
now mount the ESP to /boot or /efi according to the same rules as PID
1 running on a host. This allows tools like "bootctl" to operate
correctly within such containers, in order to make container images
bootable on physical systems.
* disk/by-id and disk/by-path symlinks are now created for NVMe drives.
* Two new user session targets have been added to support running
@ -95,7 +111,7 @@ CHANGES WITH 232 in spe
the top of the process hierarchy (which is usually the init process
of the container).
* systemd-journal-gatewayd learned the --directory option to serve
* systemd-journal-gatewayd learned the --directory= option to serve
files from the specified location.
* journalctl --root=… can be used to peruse the journal in the
@ -112,23 +128,26 @@ CHANGES WITH 232 in spe
a click rate that is different than the one for the vertical wheel.
* systemd-run gained a new --wait option that makes service execution
synchronous.
synchronous. (Specifically, the command will not return until the
specified service binary exited.)
systemctl gained a new --wait option that causes the start command to
* systemctl gained a new --wait option that causes the start command to
wait until the units being started have terminated again.
* A new journal output mode "short-full" has been added which uses
* A new journal output mode "short-full" has been added which displays
timestamps with abbreviated English day names and adds a timezone
suffix. Those timestamps include more information and can be parsed
by journalctl.
suffix. Those timestamps include more information than the default
"short" output mode, and can be passed directly to journalctl's
--since= and --until= options.
* /etc/resolv.conf will be bind-mounted into containers started by
systemd-nspawn, if possible, so any changes to resolv.conf contents
are automatically propagated to the container.
* The number of instances for socket-activated services originating
from a single IP can be limited with MaxConnectionsPerSource=,
extending the existing setting of MaxConnections.
from a single IP address can be limited with
MaxConnectionsPerSource=, extending the existing setting of
MaxConnections=.
* systemd-networkd gained support for vcan ("Virtual CAN") interface
configuration.
@ -143,21 +162,23 @@ CHANGES WITH 232 in spe
GenericReceiveOffload=, LargeReceiveOffload= options in the
[Link] section of .link files.
Spanning Tree Protocol enablement, Priority, Aging Time, and the
Default Port VLAN ID can be configured for bridge devices using the
new STP=, Priority=, AgeingTimeSec=, and DefaultPVID= settings in the
[Bridge] section of .netdev files.
* The Spanning Tree Protocol, Priority, Aging Time, and the Default
Port VLAN ID can be configured for bridge devices using the new STP=,
Priority=, AgeingTimeSec=, and DefaultPVID= settings in the [Bridge]
section of .netdev files.
The route table to which routes received over DHCP or RA should be
* The route table to which routes received over DHCP or RA should be
added can be configured with the new RouteTable= option in the [DHCP]
and [IPv6AcceptRA] sections of .network files.
Address Resolution Protocol can be disabled on links managed by
* The Address Resolution Protocol can be disabled on links managed by
systemd-networkd using the ARP=no setting in the [Link] section of
.network files.
* $SERVICE_RESULT, $EXIT_CODE, $EXIT_STATUS are set for ExecStop= and
ExecStopPost= commands.
* New environment variables $SERVICE_RESULT, $EXIT_CODE and
$EXIT_STATUS are set for ExecStop= and ExecStopPost= commands, and
encode information about the result and exit codes of the current
service runtime cycle.
* systemd-sysctl will now configure kernel parameters in the order
they occur in the configuration files. This matches what sysctl
@ -184,6 +205,30 @@ CHANGES WITH 232 in spe
$SYSTEMD_NSPAWN_SHARE_NS_UTS may be used to control the unsharing of
individual namespaces.
* "machinectl list" now shows the IP address of running containers in
the output, as well as OS release information.
* "loginctl list" now shows the TTY of each session in the output.
* sd-bus gained new API calls sd_bus_track_set_recursive(),
sd_bus_track_get_recursive(), sd_bus_track_count_name(),
sd_bus_track_count_sender(). They permit usage of sd_bus_track peer
tracking objects in a "recursive" mode, where a single client can be
counted multiple times, if it takes multiple references.
* sd-bus gained new API calls sd_bus_set_exit_on_disconnect() and
sd_bus_get_exit_on_disconnect(). They may be used to make a
process using sd-bus automatically exit if the bus connection is
severed.
* Bus clients of the service manager may now "pin" loaded units into
memory, by taking an explicit reference on them. This is useful to
ensure the client can retrieve runtime data about the service even
after the service completed execution. Taking such a reference is
available only for privileged clients and should be helpful to watch
running services in a race-free manner, and in particular collect
information about exit statuses and results.
CHANGES WITH 231:
* In service units the various ExecXYZ= settings have been extended

View File

@ -1185,18 +1185,19 @@ static void rename_process_from_path(const char *path) {
#ifdef HAVE_SECCOMP
static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
if (!is_seccomp_available()) {
log_open();
log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
log_close();
return true;
}
return false;
if (is_seccomp_available())
return false;
log_open();
log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
log_close();
return true;
}
static int apply_seccomp(const Unit* u, const ExecContext *c) {
uint32_t negative_action, action;
scmp_filter_ctx *seccomp;
scmp_filter_ctx seccomp;
Iterator i;
void *id;
int r;
@ -1247,7 +1248,7 @@ finish:
}
static int apply_address_families(const Unit* u, const ExecContext *c) {
scmp_filter_ctx *seccomp;
scmp_filter_ctx seccomp;
Iterator i;
int r;
@ -1256,13 +1257,9 @@ static int apply_address_families(const Unit* u, const ExecContext *c) {
if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
return 0;
seccomp = seccomp_init(SCMP_ACT_ALLOW);
if (!seccomp)
return -ENOMEM;
r = seccomp_add_secondary_archs(seccomp);
r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
if (r < 0)
goto finish;
return r;
if (c->address_families_whitelist) {
int af, first = 0, last = 0;
@ -1359,10 +1356,6 @@ static int apply_address_families(const Unit* u, const ExecContext *c) {
}
}
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0)
goto finish;
r = seccomp_load(seccomp);
finish:
@ -1371,7 +1364,7 @@ finish:
}
static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
scmp_filter_ctx *seccomp;
scmp_filter_ctx seccomp;
int r;
assert(c);
@ -1379,13 +1372,9 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c)
if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
return 0;
seccomp = seccomp_init(SCMP_ACT_ALLOW);
if (!seccomp)
return -ENOMEM;
r = seccomp_add_secondary_archs(seccomp);
r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
if (r < 0)
goto finish;
return r;
r = seccomp_rule_add(
seccomp,
@ -1405,10 +1394,6 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c)
if (r < 0)
goto finish;
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0)
goto finish;
r = seccomp_load(seccomp);
finish:
@ -1423,7 +1408,7 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
SCHED_IDLE,
};
scmp_filter_ctx *seccomp;
scmp_filter_ctx seccomp;
unsigned i;
int r, p, max_policy = 0;
@ -1432,13 +1417,9 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
if (skip_seccomp_unavailable(u, "RestrictRealtime="))
return 0;
seccomp = seccomp_init(SCMP_ACT_ALLOW);
if (!seccomp)
return -ENOMEM;
r = seccomp_add_secondary_archs(seccomp);
r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
if (r < 0)
goto finish;
return r;
/* Determine the highest policy constant we want to allow */
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
@ -1482,10 +1463,6 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
if (r < 0)
goto finish;
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0)
goto finish;
r = seccomp_load(seccomp);
finish:
@ -1494,7 +1471,7 @@ finish:
}
static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
scmp_filter_ctx *seccomp;
scmp_filter_ctx seccomp;
int r;
assert(c);
@ -1505,13 +1482,9 @@ static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
return 0;
seccomp = seccomp_init(SCMP_ACT_ALLOW);
if (!seccomp)
return -ENOMEM;
r = seccomp_add_secondary_archs(seccomp);
r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
if (r < 0)
goto finish;
return r;
r = seccomp_rule_add(
seccomp,
@ -1521,10 +1494,6 @@ static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
if (r < 0)
goto finish;
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0)
goto finish;
r = seccomp_load(seccomp);
finish:
@ -1533,56 +1502,17 @@ finish:
}
static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) {
static const int module_syscalls[] = {
SCMP_SYS(delete_module),
SCMP_SYS(finit_module),
SCMP_SYS(init_module),
};
scmp_filter_ctx *seccomp;
unsigned i;
int r;
assert(c);
/* Turn of module syscalls on ProtectKernelModules=yes */
/* Turn off module syscalls on ProtectKernelModules=yes */
if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
return 0;
seccomp = seccomp_init(SCMP_ACT_ALLOW);
if (!seccomp)
return -ENOMEM;
r = seccomp_add_secondary_archs(seccomp);
if (r < 0)
goto finish;
for (i = 0; i < ELEMENTSOF(module_syscalls); i++) {
r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM),
module_syscalls[i], 0);
if (r < 0)
goto finish;
}
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0)
goto finish;
r = seccomp_load(seccomp);
finish:
seccomp_release(seccomp);
return r;
return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
}
static int apply_private_devices(Unit *u, const ExecContext *c) {
const SystemCallFilterSet *set;
scmp_filter_ctx *seccomp;
const char *sys;
bool syscalls_found = false;
int r;
assert(c);
/* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
@ -1590,61 +1520,7 @@ static int apply_private_devices(Unit *u, const ExecContext *c) {
if (skip_seccomp_unavailable(u, "PrivateDevices="))
return 0;
seccomp = seccomp_init(SCMP_ACT_ALLOW);
if (!seccomp)
return -ENOMEM;
r = seccomp_add_secondary_archs(seccomp);
if (r < 0)
goto finish;
for (set = syscall_filter_sets; set->set_name; set++)
if (streq(set->set_name, "@raw-io")) {
syscalls_found = true;
break;
}
/* We should never fail here */
if (!syscalls_found) {
r = -EOPNOTSUPP;
goto finish;
}
NULSTR_FOREACH(sys, set->value) {
int id;
bool add = true;
#ifndef __NR_s390_pci_mmio_read
if (streq(sys, "s390_pci_mmio_read"))
add = false;
#endif
#ifndef __NR_s390_pci_mmio_write
if (streq(sys, "s390_pci_mmio_write"))
add = false;
#endif
if (!add)
continue;
id = seccomp_syscall_resolve_name(sys);
r = seccomp_rule_add(
seccomp,
SCMP_ACT_ERRNO(EPERM),
id, 0);
if (r < 0)
goto finish;
}
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0)
goto finish;
r = seccomp_load(seccomp);
finish:
seccomp_release(seccomp);
return r;
return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
}
#endif
@ -1890,9 +1766,9 @@ static int setup_private_users(uid_t uid, gid_t gid) {
asprintf(&uid_map,
"0 0 1\n" /* Map root → root */
UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
uid, uid); /* The case where the above is the same */
uid, uid);
else
uid_map = strdup("0 0 1\n");
uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
if (!uid_map)
return -ENOMEM;

View File

@ -2618,6 +2618,7 @@ int config_parse_documentation(const char *unit,
}
#ifdef HAVE_SECCOMP
static int syscall_filter_parse_one(
const char *unit,
const char *filename,
@ -2628,27 +2629,29 @@ static int syscall_filter_parse_one(
bool warn) {
int r;
if (*t == '@') {
const SystemCallFilterSet *set;
if (t[0] == '@') {
const SyscallFilterSet *set;
const char *i;
for (set = syscall_filter_sets; set->set_name; set++)
if (streq(set->set_name, t)) {
const char *sys;
set = syscall_filter_set_find(t);
if (!set) {
if (warn)
log_syntax(unit, LOG_WARNING, filename, line, 0, "Don't know system call group, ignoring: %s", t);
return 0;
}
NULSTR_FOREACH(sys, set->value) {
r = syscall_filter_parse_one(unit, filename, line, c, invert, sys, false);
if (r < 0)
return r;
}
break;
}
NULSTR_FOREACH(i, set->value) {
r = syscall_filter_parse_one(unit, filename, line, c, invert, i, false);
if (r < 0)
return r;
}
} else {
int id;
id = seccomp_syscall_resolve_name(t);
if (id == __NR_SCMP_ERROR) {
if (warn)
log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse system call, ignoring: %s", t);
log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse system call, ignoring: %s", t);
return 0;
}
@ -2662,8 +2665,9 @@ static int syscall_filter_parse_one(
if (r < 0)
return log_oom();
} else
set_remove(c->syscall_filter, INT_TO_PTR(id + 1));
(void) set_remove(c->syscall_filter, INT_TO_PTR(id + 1));
}
return 0;
}
@ -2682,8 +2686,7 @@ int config_parse_syscall_filter(
ExecContext *c = data;
Unit *u = userdata;
bool invert = false;
const char *word, *state;
size_t l;
const char *p;
int r;
assert(filename);
@ -2722,19 +2725,24 @@ int config_parse_syscall_filter(
}
}
FOREACH_WORD_QUOTED(word, l, rvalue, state) {
_cleanup_free_ char *t = NULL;
p = rvalue;
for (;;) {
_cleanup_free_ char *word = NULL;
t = strndup(word, l);
if (!t)
r = extract_first_word(&p, &word, NULL, 0);
if (r == 0)
break;
if (r == -ENOMEM)
return log_oom();
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
break;
}
r = syscall_filter_parse_one(unit, filename, line, c, invert, t, true);
r = syscall_filter_parse_one(unit, filename, line, c, invert, word, true);
if (r < 0)
return r;
}
if (!isempty(state))
log_syntax(unit, LOG_ERR, filename, line, 0, "Trailing garbage, ignoring.");
/* Turn on NNP, but only if it wasn't configured explicitly
* before, and only if we are in user mode. */

View File

@ -135,15 +135,9 @@ int setup_seccomp(uint64_t cap_list_retain) {
return 0;
}
seccomp = seccomp_init(SCMP_ACT_ALLOW);
if (!seccomp)
return log_oom();
r = seccomp_add_secondary_archs(seccomp);
if (r < 0) {
log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
goto finish;
}
r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
if (r < 0)
return log_error_errno(r, "Failed to allocate seccomp object: %m");
r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain);
if (r < 0)
@ -171,12 +165,6 @@ int setup_seccomp(uint64_t cap_list_retain) {
goto finish;
}
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0) {
log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
goto finish;
}
r = seccomp_load(seccomp);
if (r < 0) {
log_error_errno(r, "Failed to install seccomp audit filter: %m");

View File

@ -329,9 +329,9 @@ static int condition_test_needs_update(Condition *c) {
uint64_t timestamp;
int r;
r = parse_env_file(p, NULL, "TimestampNSec", &timestamp_str, NULL);
r = parse_env_file(p, NULL, "TIMESTAMP_NSEC", &timestamp_str, NULL);
if (r < 0) {
log_error_errno(-r, "Failed to parse timestamp file '%s', using mtime: %m", p);
log_error_errno(r, "Failed to parse timestamp file '%s', using mtime: %m", p);
return true;
} else if (r == 0) {
log_debug("No data in timestamp file '%s', using mtime", p);
@ -340,12 +340,11 @@ static int condition_test_needs_update(Condition *c) {
r = safe_atou64(timestamp_str, &timestamp);
if (r < 0) {
log_error_errno(-r, "Failed to parse timestamp value '%s' in file '%s', using mtime: %m",
timestamp_str, p);
log_error_errno(r, "Failed to parse timestamp value '%s' in file '%s', using mtime: %m", timestamp_str, p);
return true;
}
other.st_mtim.tv_nsec = timestamp % NSEC_PER_SEC;
timespec_store(&other.st_mtim, timestamp);
}
return usr.st_mtim.tv_nsec > other.st_mtim.tv_nsec;

View File

@ -26,6 +26,7 @@
#include "macro.h"
#include "seccomp-util.h"
#include "string-util.h"
#include "util.h"
const char* seccomp_arch_to_string(uint32_t c) {
@ -73,7 +74,34 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret) {
return 0;
}
int seccomp_add_secondary_archs(scmp_filter_ctx *c) {
int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) {
        scmp_filter_ctx ctx;
        int r;

        /* A variant of seccomp_init() with more cautious defaults: the secondary architectures are registered
         * right away, and the SCMP_FLTATR_CTL_NNP attribute is set to 0.
         *
         * On success the new context is stored in *ret and 0 is returned. On failure a negative value is
         * returned, the half-built context is released again and *ret is left untouched. */

        ctx = seccomp_init(default_action);
        if (!ctx)
                return -ENOMEM;

        r = seccomp_add_secondary_archs(ctx);
        if (r >= 0)
                r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                /* Either setup step failed: do not hand out the context */
                seccomp_release(ctx);
                return r;
        }

        *ret = ctx;
        return 0;
}
int seccomp_add_secondary_archs(scmp_filter_ctx c) {
#if defined(__i386__) || defined(__x86_64__)
int r;
@ -110,7 +138,6 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c) {
#endif
return 0;
}
static bool is_basic_seccomp_available(void) {
@ -132,28 +159,30 @@ bool is_seccomp_available(void) {
return cached_enabled;
}
const SystemCallFilterSet syscall_filter_sets[] = {
{
const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
[SYSCALL_FILTER_SET_CLOCK] = {
/* Clock */
.set_name = "@clock",
.name = "@clock",
.value =
"adjtimex\0"
"clock_adjtime\0"
"clock_settime\0"
"settimeofday\0"
"stime\0"
}, {
},
[SYSCALL_FILTER_SET_CPU_EMULATION] = {
/* CPU emulation calls */
.set_name = "@cpu-emulation",
.name = "@cpu-emulation",
.value =
"modify_ldt\0"
"subpage_prot\0"
"switch_endian\0"
"vm86\0"
"vm86old\0"
}, {
},
[SYSCALL_FILTER_SET_DEBUG] = {
/* Debugging/Performance Monitoring/Tracing */
.set_name = "@debug",
.name = "@debug",
.value =
"lookup_dcookie\0"
"perf_event_open\0"
@ -161,11 +190,14 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"process_vm_writev\0"
"ptrace\0"
"rtas\0"
#ifdef __NR_s390_runtime_instr
"s390_runtime_instr\0"
#endif
"sys_debug_setcontext\0"
}, {
},
[SYSCALL_FILTER_SET_DEFAULT] = {
/* Default list */
.set_name = "@default",
.name = "@default",
.value =
"execve\0"
"exit\0"
@ -173,9 +205,10 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"getrlimit\0" /* make sure processes can query stack size and such */
"rt_sigreturn\0"
"sigreturn\0"
}, {
},
[SYSCALL_FILTER_SET_IO_EVENT] = {
/* Event loop use */
.set_name = "@io-event",
.name = "@io-event",
.value =
"_newselect\0"
"epoll_create1\0"
@ -191,9 +224,10 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"ppoll\0"
"pselect6\0"
"select\0"
}, {
},
[SYSCALL_FILTER_SET_IPC] = {
/* Message queues, SYSV IPC or other IPC: unusual */
.set_name = "@ipc",
.name = "@ipc",
.value = "ipc\0"
"mq_getsetattr\0"
"mq_notify\0"
@ -215,33 +249,36 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"shmctl\0"
"shmdt\0"
"shmget\0"
}, {
},
[SYSCALL_FILTER_SET_KEYRING] = {
/* Keyring */
.set_name = "@keyring",
.name = "@keyring",
.value =
"add_key\0"
"keyctl\0"
"request_key\0"
}, {
},
[SYSCALL_FILTER_SET_MODULE] = {
/* Kernel module control */
.set_name = "@module",
.name = "@module",
.value =
"delete_module\0"
"finit_module\0"
"init_module\0"
}, {
},
[SYSCALL_FILTER_SET_MOUNT] = {
/* Mounting */
.set_name = "@mount",
.name = "@mount",
.value =
"chroot\0"
"mount\0"
"oldumount\0"
"pivot_root\0"
"umount2\0"
"umount\0"
}, {
},
[SYSCALL_FILTER_SET_NETWORK_IO] = {
/* Network or Unix socket IO, should not be needed if not network facing */
.set_name = "@network-io",
.name = "@network-io",
.value =
"accept4\0"
"accept\0"
@ -264,9 +301,10 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"socket\0"
"socketcall\0"
"socketpair\0"
}, {
},
[SYSCALL_FILTER_SET_OBSOLETE] = {
/* Unusual, obsolete or unimplemented, some unknown even to libseccomp */
.set_name = "@obsolete",
.name = "@obsolete",
.value =
"_sysctl\0"
"afs_syscall\0"
@ -292,9 +330,10 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"uselib\0"
"ustat\0"
"vserver\0"
}, {
},
[SYSCALL_FILTER_SET_PRIVILEGED] = {
/* Nice grab-bag of all system calls which need superuser capabilities */
.set_name = "@privileged",
.name = "@privileged",
.value =
"@clock\0"
"@module\0"
@ -331,11 +370,12 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"setuid\0"
"swapoff\0"
"swapon\0"
"sysctl\0"
"_sysctl\0"
"vhangup\0"
}, {
},
[SYSCALL_FILTER_SET_PROCESS] = {
/* Process control, execution, namespaces */
.set_name = "@process",
.name = "@process",
.value =
"arch_prctl\0"
"clone\0"
@ -349,19 +389,90 @@ const SystemCallFilterSet syscall_filter_sets[] = {
"tkill\0"
"unshare\0"
"vfork\0"
}, {
},
[SYSCALL_FILTER_SET_RAW_IO] = {
/* Raw I/O ports */
.set_name = "@raw-io",
.name = "@raw-io",
.value =
"ioperm\0"
"iopl\0"
"pciconfig_iobase\0"
"pciconfig_read\0"
"pciconfig_write\0"
#ifdef __NR_s390_pci_mmio_read
"s390_pci_mmio_read\0"
#endif
#ifdef __NR_s390_pci_mmio_write
"s390_pci_mmio_write\0"
}, {
.set_name = NULL,
.value = NULL
}
#endif
},
};
const SyscallFilterSet *syscall_filter_set_find(const char *name) {
unsigned i;
if (isempty(name) || name[0] != '@')
return NULL;
for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
if (streq(syscall_filter_sets[i].name, name))
return syscall_filter_sets + i;
return NULL;
}
int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) {
        const char *entry;
        int r;

        assert(seccomp);
        assert(set);

        /* Adds one rule with the specified action for every entry of the set. Entries starting with '@' name
         * another set, which is added recursively with the same action. Returns 0 on success, -EINVAL if a
         * referenced set or a system call name cannot be resolved, or the negative error of the failing
         * rule/recursive call. Rules added before a failure are not undone here. */

        NULSTR_FOREACH(entry, set->value) {

                if (entry[0] != '@') {
                        /* A plain system call name */
                        int nr;

                        nr = seccomp_syscall_resolve_name(entry);
                        if (nr == __NR_SCMP_ERROR)
                                return -EINVAL;

                        r = seccomp_rule_add(seccomp, action, nr, 0);
                } else {
                        /* A reference to another group */
                        const SyscallFilterSet *nested;

                        nested = syscall_filter_set_find(entry);
                        if (!nested)
                                return -EINVAL;

                        r = seccomp_add_syscall_filter_set(seccomp, nested, action);
                }

                if (r < 0)
                        return r;
        }

        return 0;
}
int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
        scmp_filter_ctx ctx;
        int r;

        assert(set);

        /* Convenience wrapper doing everything in one go: creates a seccomp context with the given default
         * action, adds the rules for the set with the given per-syscall action, and loads the result. The
         * context is always released before returning; the return value is that of the first failing step,
         * or of seccomp_load(). */

        r = seccomp_init_conservative(&ctx, default_action);
        if (r < 0)
                return r;

        r = seccomp_add_syscall_filter_set(ctx, set, action);
        if (r >= 0)
                r = seccomp_load(ctx);

        seccomp_release(ctx);
        return r;
}

View File

@ -20,18 +20,45 @@
***/
#include <seccomp.h>
#include <stdbool.h>
#include <stdint.h>
const char* seccomp_arch_to_string(uint32_t c);
int seccomp_arch_from_string(const char *n, uint32_t *ret);
int seccomp_add_secondary_archs(scmp_filter_ctx *c);
int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action);
int seccomp_add_secondary_archs(scmp_filter_ctx c);
bool is_seccomp_available(void);
typedef struct SystemCallFilterSet {
const char *set_name;
typedef struct SyscallFilterSet {
const char *name;
const char *value;
} SystemCallFilterSet;
} SyscallFilterSet;
extern const SystemCallFilterSet syscall_filter_sets[];
enum {
SYSCALL_FILTER_SET_CLOCK,
SYSCALL_FILTER_SET_CPU_EMULATION,
SYSCALL_FILTER_SET_DEBUG,
SYSCALL_FILTER_SET_DEFAULT,
SYSCALL_FILTER_SET_IO_EVENT,
SYSCALL_FILTER_SET_IPC,
SYSCALL_FILTER_SET_KEYRING,
SYSCALL_FILTER_SET_MODULE,
SYSCALL_FILTER_SET_MOUNT,
SYSCALL_FILTER_SET_NETWORK_IO,
SYSCALL_FILTER_SET_OBSOLETE,
SYSCALL_FILTER_SET_PRIVILEGED,
SYSCALL_FILTER_SET_PROCESS,
SYSCALL_FILTER_SET_RAW_IO,
_SYSCALL_FILTER_SET_MAX
};
extern const SyscallFilterSet syscall_filter_sets[];
const SyscallFilterSet *syscall_filter_set_find(const char *name);
int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action);
int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);

103
src/test/test-seccomp.c Normal file
View File

@ -0,0 +1,103 @@
/***
This file is part of systemd.
Copyright 2016 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <stdlib.h>
#include <sys/eventfd.h>
#include <unistd.h>
#include "fd-util.h"
#include "macro.h"
#include "process-util.h"
#include "seccomp-util.h"
/* Round-trip check: the native architecture is converted to its string name and parsed back, and the result must
 * equal the value we started with. */
static void test_seccomp_arch_to_string(void) {
uint32_t a, b;
const char *name;
/* Start from whatever libseccomp reports as the native architecture */
a = seccomp_arch_native();
assert_se(a > 0);
/* The native architecture must have a name ... */
name = seccomp_arch_to_string(a);
assert_se(name);
/* ... and that name must parse back to the very same architecture */
assert_se(seccomp_arch_from_string(name, &b) >= 0);
assert_se(a == b);
}
/* Checks syscall_filter_set_find() against inputs that must be rejected and against a few known group names. */
static void test_syscall_filter_set_find(void) {
/* NULL, the empty string, names lacking the '@' prefix and unknown groups all yield NULL */
assert_se(!syscall_filter_set_find(NULL));
assert_se(!syscall_filter_set_find(""));
assert_se(!syscall_filter_set_find("quux"));
assert_se(!syscall_filter_set_find("@quux"));
/* Known groups must resolve to the table entry at their enum index */
assert_se(syscall_filter_set_find("@clock") == syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK);
assert_se(syscall_filter_set_find("@default") == syscall_filter_sets + SYSCALL_FILTER_SET_DEFAULT);
assert_se(syscall_filter_set_find("@raw-io") == syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO);
}
static void test_filter_sets(void) {
        unsigned i;
        int r;

        /* Loads each system call group in a forked child and probes the resulting filter with a single
         * eventfd() call. Only runs when seccomp is available and we are running as root; otherwise the test
         * returns without doing anything. */

        if (!is_seccomp_available() || geteuid() != 0)
                return;

        for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) {
                pid_t pid;
                int fd;

                log_info("Testing %s", syscall_filter_sets[i].name);

                pid = fork();
                assert_se(pid >= 0);

                if (pid != 0) {
                        /* Parent: wait for the child and require a clean exit, then go on with the next group */
                        assert_se(wait_for_terminate_and_warn(syscall_filter_sets[i].name, pid, true) == EXIT_SUCCESS);
                        continue;
                }

                /* Child from here on. The default group is applied as a whitelist (everything else fails with
                 * EPERM), every other group as a blacklist (only its members fail with EPERM). */
                r = i == SYSCALL_FILTER_SET_DEFAULT ?
                        seccomp_load_filter_set(SCMP_ACT_ERRNO(EPERM), syscall_filter_sets + i, SCMP_ACT_ALLOW) :
                        seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + i, SCMP_ACT_ERRNO(EPERM));
                if (r < 0)
                        _exit(EXIT_FAILURE);

                /* Probe the syscall filter with one arbitrary system call: eventfd() is expected to be refused
                 * with EPERM for the two groups listed below, and to work for all others. */
                fd = eventfd(0, EFD_NONBLOCK|EFD_CLOEXEC);
                if (!IN_SET(i, SYSCALL_FILTER_SET_IO_EVENT, SYSCALL_FILTER_SET_DEFAULT)) {
                        assert_se(fd >= 0);
                        safe_close(fd);
                } else
                        assert_se(fd < 0 && errno == EPERM);

                _exit(EXIT_SUCCESS);
        }
}
int main(int argc, char *argv[]) {

        /* Run the individual test cases one after the other; each of them aborts via assert_se() on failure,
         * so reaching the end means everything passed. */
        test_seccomp_arch_to_string();
        test_syscall_filter_set_find();
        test_filter_sets();

        return EXIT_SUCCESS;
}

View File

@ -18,6 +18,7 @@
***/
#include "fd-util.h"
#include "fileio.h"
#include "io-util.h"
#include "selinux-util.h"
#include "util.h"
@ -32,8 +33,8 @@ static int apply_timestamp(const char *path, struct timespec *ts) {
*ts,
*ts
};
int fd = -1;
_cleanup_fclose_ FILE *f = NULL;
int fd = -1;
int r;
assert(path);
@ -59,18 +60,20 @@ static int apply_timestamp(const char *path, struct timespec *ts) {
return log_error_errno(errno, "Failed to create/open timestamp file %s: %m", path);
}
f = fdopen(fd, "w");
f = fdopen(fd, "we");
if (!f) {
safe_close(fd);
return log_error_errno(errno, "Failed to fdopen() timestamp file %s: %m", path);
}
(void) fprintf(f,
"%s"
"TimestampNSec=" NSEC_FMT "\n",
MESSAGE, timespec_load_nsec(ts));
MESSAGE
"TIMESTAMP_NSEC=" NSEC_FMT "\n",
timespec_load_nsec(ts));
fflush(f);
r = fflush_and_check(f);
if (r < 0)
return log_error_errno(r, "Failed to write timestamp file: %m");
if (futimens(fd, twice) < 0)
return log_error_errno(errno, "Failed to update timestamp on %s: %m", path);