diff --git a/Makefile.am b/Makefile.am index bad6a2d18a..a707d4a899 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1558,6 +1558,11 @@ tests += \ test-acl-util endif +if HAVE_SECCOMP +tests += \ + test-seccomp +endif + EXTRA_DIST += \ test/a.service \ test/basic.target \ @@ -2026,6 +2031,12 @@ test_acl_util_SOURCES = \ test_acl_util_LDADD = \ libsystemd-shared.la +test_seccomp_SOURCES = \ + src/test/test-seccomp.c + +test_seccomp_LDADD = \ + libsystemd-shared.la + test_namespace_LDADD = \ libcore.la diff --git a/NEWS b/NEWS index 87cc4f48c0..2a1edbe766 100644 --- a/NEWS +++ b/NEWS @@ -35,14 +35,14 @@ CHANGES WITH 232 in spe ProtectSystem=strict enabled, so they are not able to make any permanent modifications to the system. - The nss-systemd module also always resolves root and nobody, making + * The nss-systemd module also always resolves root and nobody, making it possible to have no /etc/passwd or /etc/group files in minimal - container systems. + container or chroot environments. * Services may be started with their own user namespace using the new - PrivateUsers= option. Only root, nobody, and the uid/gid under which - the service is running are mapped. All other users are mapped to - nobody. + boolean PrivateUsers= option. Only root, nobody, and the uid/gid + under which the service is running are mapped. All other users are + mapped to nobody. * Support for the cgroup namespace has been added to systemd-nspawn. If supported by kernel, the container system started by systemd-nspawn @@ -57,12 +57,22 @@ CHANGES WITH 232 in spe options. This controller requires out-of-tree patches for the kernel and the support is provisional. - * .automount units may now be transient. + * Mount and automount units may now be created transiently + (i.e. dynamically at runtime via the bus API, instead of requiring + unit files in the file system). - * systemd-mount is a new tool which wraps mount(8) to pull in - additional dependencies through transient .mount and .automount - units. For example, this automatically runs fsck on the block device - before mounting, and allows the automount logic to be used. + * systemd-mount is a new tool which may mount file systems – much like + mount(8), optionally pulling in additional dependencies through + transient .mount and .automount units. For example, this tool + automatically runs fsck on a backing block device before mounting, + and allows the automount logic to be used dynamically from the + command line for establishing mount points. This tool is particularly + useful when dealing with removable media, as it will ensure fsck is + run – if necessary – before the first access and that the file system + is quickly unmounted after each access by utilizing the automount + logic. This maximizes the chance that the file system on the + removable media stays in a clean state, and if it isn't in a clean + state is fixed automatically. * LazyUnmount=yes option for mount units has been added to expose the umount --lazy option. Similarly, ForceUnmount=yes exposes the --force @@ -75,6 +85,12 @@ CHANGES WITH 232 in spe mount the EFI partition on systems where /boot is used for something else. + * When operating on GPT disk images for containers, systemd-nspawn will + now mount the ESP to /boot or /efi according to the same rules as PID + 1 running on a host. This allows tools like "bootctl" to operate + correctly within such containers, in order to make container images + bootable on physical systems. + * disk/by-id and disk/by-path symlinks are now created for NVMe drives. * Two new user session targets have been added to support running @@ -95,7 +111,7 @@ CHANGES WITH 232 in spe the top of the process hierarchy (which is usually the init process of the container). - * systemd-journal-gatewayd learned the --directory option to serve + * systemd-journal-gatewayd learned the --directory= option to serve files from the specified location. * journalctl --root=… can be used to peruse the journal in the @@ -112,23 +128,26 @@ CHANGES WITH 232 in spe a click rate that is different than the one for the vertical wheel. * systemd-run gained a new --wait option that makes service execution - synchronous. + synchronous. (Specifically, the command will not return until the + specified service binary exited.) - systemctl gained a new --wait option that causes the start command to + * systemctl gained a new --wait option that causes the start command to wait until the units being started have terminated again. - * A new journal output mode "short-full" has been added which uses + * A new journal output mode "short-full" has been added which displays timestamps with abbreviated English day names and adds a timezone - suffix. Those timestamps include more information and can be parsed - by journalctl. + suffix. Those timestamps include more information than the default + "short" output mode, and can be passed directly to journalctl's + --since= and --until= options. * /etc/resolv.conf will be bind-mounted into containers started by systemd-nspawn, if possible, so any changes to resolv.conf contents are automatically propagated to the container. * The number of instances for socket-activated services originating - from a single IP can be limited with MaxConnectionsPerSource=, - extending the existing setting of MaxConnections. + from a single IP address can be limited with + MaxConnectionsPerSource=, extending the existing setting of + MaxConnections=. * systemd-networkd gained support for vcan ("Virtual CAN") interface configuration. @@ -143,21 +162,23 @@ CHANGES WITH 232 in spe GenericReceiveOffload=, LargeReceiveOffload= options in the [Link] section of .link files. - Spanning Tree Protocol enablement, Priority, Aging Time, and the - Default Port VLAN ID can be configured for bridge devices using the - new STP=, Priority=, AgeingTimeSec=, and DefaultPVID= settings in the - [Bridge] section of .netdev files. + * The Spanning Tree Protocol, Priority, Aging Time, and the Default + Port VLAN ID can be configured for bridge devices using the new STP=, + Priority=, AgeingTimeSec=, and DefaultPVID= settings in the [Bridge] + section of .netdev files. - The route table to which routes received over DHCP or RA should be + * The route table to which routes received over DHCP or RA should be added can be configured with the new RouteTable= option in the [DHCP] and [IPv6AcceptRA] sections of .network files. - Address Resolution Protocol can be disabled on links managed by + * The Address Resolution Protocol can be disabled on links managed by systemd-networkd using the ARP=no setting in the [Link] section of .network files. - * $SERVICE_RESULT, $EXIT_CODE, $EXIT_STATUS are set for ExecStop= and - ExecStopPost= commands. + * New environment variables $SERVICE_RESULT, $EXIT_CODE and + $EXIT_STATUS are set for ExecStop= and ExecStopPost= commands, and + encode information about the result and exit codes of the current + service runtime cycle. * systemd-sysctl will now configure kernel parameters in the order they occur in the configuration files. This matches what sysctl @@ -184,6 +205,30 @@ CHANGES WITH 232 in spe $SYSTEMD_NSPAWN_SHARE_NS_UTS may be used to control the unsharing of individual namespaces. + * "machinectl list" now shows the IP address of running containers in + the output, as well as OS release information. + + * "loginctl list" now shows the TTY of each session in the output. + + * sd-bus gained new API calls sd_bus_track_set_recursive(), + sd_bus_track_get_recursive(), sd_bus_track_count_name(), + sd_bus_track_count_sender(). They permit usage of sd_bus_track peer + tracking objects in a "recursive" mode, where a single client can be + counted multiple times, if it takes multiple references. + + * sd-bus gained new API calls sd_bus_set_exit_on_disconnect() and + sd_bus_get_exit_on_disconnect(). They may be used to to make a + process using sd-bus automatically exit if the bus connection is + severed. + + * Bus clients of the service manager may now "pin" loaded units into + memory, by taking an explicit reference on them. This is useful to + ensure the client can retrieve runtime data about the service even + after the service completed execution. Taking such a reference is + available only for privileged clients and should be helpful to watch + running services in a race-free manner, and in particular collect + information about exit statuses and results. + CHANGES WITH 231: * In service units the various ExecXYZ= settings have been extended diff --git a/src/core/execute.c b/src/core/execute.c index 53356c3c06..5e7d7c25d7 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1185,18 +1185,19 @@ static void rename_process_from_path(const char *path) { #ifdef HAVE_SECCOMP static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { - if (!is_seccomp_available()) { - log_open(); - log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg); - log_close(); - return true; - } - return false; + + if (is_seccomp_available()) + return false; + + log_open(); + log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg); + log_close(); + return true; } static int apply_seccomp(const Unit* u, const ExecContext *c) { uint32_t negative_action, action; - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; Iterator i; void *id; int r; @@ -1247,7 +1248,7 @@ finish: } static int apply_address_families(const Unit* u, const ExecContext *c) { - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; Iterator i; int r; @@ -1256,13 +1257,9 @@ static int apply_address_families(const Unit* u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "RestrictAddressFamilies=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; if (c->address_families_whitelist) { int af, first = 0, last = 0; @@ -1359,10 +1356,6 @@ static int apply_address_families(const Unit* u, const ExecContext *c) { } } - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1371,7 +1364,7 @@ finish: } static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) { - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; int r; assert(c); @@ -1379,13 +1372,9 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; r = seccomp_rule_add( seccomp, @@ -1405,10 +1394,6 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1423,7 +1408,7 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { SCHED_IDLE, }; - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; unsigned i; int r, p, max_policy = 0; @@ -1432,13 +1417,9 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "RestrictRealtime=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; /* Determine the highest policy constant we want to allow */ for (i = 0; i < ELEMENTSOF(permitted_policies); i++) @@ -1482,10 +1463,6 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1494,7 +1471,7 @@ finish: } static int apply_protect_sysctl(Unit *u, const ExecContext *c) { - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; int r; assert(c); @@ -1505,13 +1482,9 @@ static int apply_protect_sysctl(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "ProtectKernelTunables=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; r = seccomp_rule_add( seccomp, @@ -1521,10 +1494,6 @@ static int apply_protect_sysctl(Unit *u, const ExecContext *c) { if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1533,56 +1502,17 @@ finish: } static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { - static const int module_syscalls[] = { - SCMP_SYS(delete_module), - SCMP_SYS(finit_module), - SCMP_SYS(init_module), - }; - - scmp_filter_ctx *seccomp; - unsigned i; - int r; - assert(c); - /* Turn of module syscalls on ProtectKernelModules=yes */ + /* Turn off module syscalls on ProtectKernelModules=yes */ if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); - if (r < 0) - goto finish; - - for (i = 0; i < ELEMENTSOF(module_syscalls); i++) { - r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), - module_syscalls[i], 0); - if (r < 0) - goto finish; - } - - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - - r = seccomp_load(seccomp); - -finish: - seccomp_release(seccomp); - return r; + return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); } static int apply_private_devices(Unit *u, const ExecContext *c) { - const SystemCallFilterSet *set; - scmp_filter_ctx *seccomp; - const char *sys; - bool syscalls_found = false; - int r; - assert(c); /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */ @@ -1590,61 +1520,7 @@ static int apply_private_devices(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "PrivateDevices=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); - if (r < 0) - goto finish; - - for (set = syscall_filter_sets; set->set_name; set++) - if (streq(set->set_name, "@raw-io")) { - syscalls_found = true; - break; - } - - /* We should never fail here */ - if (!syscalls_found) { - r = -EOPNOTSUPP; - goto finish; - } - - NULSTR_FOREACH(sys, set->value) { - int id; - bool add = true; - -#ifndef __NR_s390_pci_mmio_read - if (streq(sys, "s390_pci_mmio_read")) - add = false; -#endif -#ifndef __NR_s390_pci_mmio_write - if (streq(sys, "s390_pci_mmio_write")) - add = false; -#endif - - if (!add) - continue; - - id = seccomp_syscall_resolve_name(sys); - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPERM), - id, 0); - if (r < 0) - goto finish; - } - - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - - r = seccomp_load(seccomp); - -finish: - seccomp_release(seccomp); - return r; + return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); } #endif @@ -1890,9 +1766,9 @@ static int setup_private_users(uid_t uid, gid_t gid) { asprintf(&uid_map, "0 0 1\n" /* Map root → root */ UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */ - uid, uid); /* The case where the above is the same */ + uid, uid); else - uid_map = strdup("0 0 1\n"); + uid_map = strdup("0 0 1\n"); /* The case where the above is the same */ if (!uid_map) return -ENOMEM; diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 6f68e23340..118b39c1cf 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -2618,6 +2618,7 @@ int config_parse_documentation(const char *unit, } #ifdef HAVE_SECCOMP + static int syscall_filter_parse_one( const char *unit, const char *filename, @@ -2628,27 +2629,29 @@ static int syscall_filter_parse_one( bool warn) { int r; - if (*t == '@') { - const SystemCallFilterSet *set; + if (t[0] == '@') { + const SyscallFilterSet *set; + const char *i; - for (set = syscall_filter_sets; set->set_name; set++) - if (streq(set->set_name, t)) { - const char *sys; + set = syscall_filter_set_find(t); + if (!set) { + if (warn) + log_syntax(unit, LOG_WARNING, filename, line, 0, "Don't know system call group, ignoring: %s", t); + return 0; + } - NULSTR_FOREACH(sys, set->value) { - r = syscall_filter_parse_one(unit, filename, line, c, invert, sys, false); - if (r < 0) - return r; - } - break; - } + NULSTR_FOREACH(i, set->value) { + r = syscall_filter_parse_one(unit, filename, line, c, invert, i, false); + if (r < 0) + return r; + } } else { int id; id = seccomp_syscall_resolve_name(t); if (id == __NR_SCMP_ERROR) { if (warn) - log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse system call, ignoring: %s", t); + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse system call, ignoring: %s", t); return 0; } @@ -2662,8 +2665,9 @@ static int syscall_filter_parse_one( if (r < 0) return log_oom(); } else - set_remove(c->syscall_filter, INT_TO_PTR(id + 1)); + (void) set_remove(c->syscall_filter, INT_TO_PTR(id + 1)); } + return 0; } @@ -2682,8 +2686,7 @@ int config_parse_syscall_filter( ExecContext *c = data; Unit *u = userdata; bool invert = false; - const char *word, *state; - size_t l; + const char *p; int r; assert(filename); @@ -2722,19 +2725,24 @@ int config_parse_syscall_filter( } } - FOREACH_WORD_QUOTED(word, l, rvalue, state) { - _cleanup_free_ char *t = NULL; + p = rvalue; + for (;;) { + _cleanup_free_ char *word = NULL; - t = strndup(word, l); - if (!t) + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + break; + } - r = syscall_filter_parse_one(unit, filename, line, c, invert, t, true); + r = syscall_filter_parse_one(unit, filename, line, c, invert, word, true); if (r < 0) return r; } - if (!isempty(state)) - log_syntax(unit, LOG_ERR, filename, line, 0, "Trailing garbage, ignoring."); /* Turn on NNP, but only if it wasn't configured explicitly * before, and only if we are in user mode. */ diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c index 44a0b397ab..03a397d30c 100644 --- a/src/nspawn/nspawn-seccomp.c +++ b/src/nspawn/nspawn-seccomp.c @@ -135,15 +135,9 @@ int setup_seccomp(uint64_t cap_list_retain) { return 0; } - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return log_oom(); - - r = seccomp_add_secondary_archs(seccomp); - if (r < 0) { - log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m"); - goto finish; - } + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); + if (r < 0) + return log_error_errno(r, "Failed to allocate seccomp object: %m"); r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain); if (r < 0) @@ -171,12 +165,6 @@ int setup_seccomp(uint64_t cap_list_retain) { goto finish; } - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) { - log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m"); - goto finish; - } - r = seccomp_load(seccomp); if (r < 0) { log_error_errno(r, "Failed to install seccomp audit filter: %m"); diff --git a/src/shared/condition.c b/src/shared/condition.c index f13fa6a9fd..69b4837e1f 100644 --- a/src/shared/condition.c +++ b/src/shared/condition.c @@ -329,9 +329,9 @@ static int condition_test_needs_update(Condition *c) { uint64_t timestamp; int r; - r = parse_env_file(p, NULL, "TimestampNSec", ×tamp_str, NULL); + r = parse_env_file(p, NULL, "TIMESTAMP_NSEC", ×tamp_str, NULL); if (r < 0) { - log_error_errno(-r, "Failed to parse timestamp file '%s', using mtime: %m", p); + log_error_errno(r, "Failed to parse timestamp file '%s', using mtime: %m", p); return true; } else if (r == 0) { log_debug("No data in timestamp file '%s', using mtime", p); @@ -340,12 +340,11 @@ static int condition_test_needs_update(Condition *c) { r = safe_atou64(timestamp_str, ×tamp); if (r < 0) { - log_error_errno(-r, "Failed to parse timestamp value '%s' in file '%s', using mtime: %m", - timestamp_str, p); + log_error_errno(r, "Failed to parse timestamp value '%s' in file '%s', using mtime: %m", timestamp_str, p); return true; } - other.st_mtim.tv_nsec = timestamp % NSEC_PER_SEC; + timespec_store(&other.st_mtim, timestamp); } return usr.st_mtim.tv_nsec > other.st_mtim.tv_nsec; diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 8116c7671f..6252cd16a6 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -26,6 +26,7 @@ #include "macro.h" #include "seccomp-util.h" #include "string-util.h" +#include "util.h" const char* seccomp_arch_to_string(uint32_t c) { @@ -73,7 +74,34 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret) { return 0; } -int seccomp_add_secondary_archs(scmp_filter_ctx *c) { +int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) { + scmp_filter_ctx seccomp; + int r; + + /* Much like seccomp_init(), but tries to be a bit more conservative in its defaults: all secondary archs are + * added by default, and NNP is turned off. */ + + seccomp = seccomp_init(default_action); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + *ret = seccomp; + return 0; + +finish: + seccomp_release(seccomp); + return r; +} + +int seccomp_add_secondary_archs(scmp_filter_ctx c) { #if defined(__i386__) || defined(__x86_64__) int r; @@ -110,7 +138,6 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c) { #endif return 0; - } static bool is_basic_seccomp_available(void) { @@ -132,28 +159,30 @@ bool is_seccomp_available(void) { return cached_enabled; } -const SystemCallFilterSet syscall_filter_sets[] = { - { +const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { + [SYSCALL_FILTER_SET_CLOCK] = { /* Clock */ - .set_name = "@clock", + .name = "@clock", .value = "adjtimex\0" "clock_adjtime\0" "clock_settime\0" "settimeofday\0" "stime\0" - }, { + }, + [SYSCALL_FILTER_SET_CPU_EMULATION] = { /* CPU emulation calls */ - .set_name = "@cpu-emulation", + .name = "@cpu-emulation", .value = "modify_ldt\0" "subpage_prot\0" "switch_endian\0" "vm86\0" "vm86old\0" - }, { + }, + [SYSCALL_FILTER_SET_DEBUG] = { /* Debugging/Performance Monitoring/Tracing */ - .set_name = "@debug", + .name = "@debug", .value = "lookup_dcookie\0" "perf_event_open\0" @@ -161,11 +190,14 @@ const SystemCallFilterSet syscall_filter_sets[] = { "process_vm_writev\0" "ptrace\0" "rtas\0" +#ifdef __NR_s390_runtime_instr "s390_runtime_instr\0" +#endif "sys_debug_setcontext\0" - }, { + }, + [SYSCALL_FILTER_SET_DEFAULT] = { /* Default list */ - .set_name = "@default", + .name = "@default", .value = "execve\0" "exit\0" @@ -173,9 +205,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "getrlimit\0" /* make sure processes can query stack size and such */ "rt_sigreturn\0" "sigreturn\0" - }, { + }, + [SYSCALL_FILTER_SET_IO_EVENT] = { /* Event loop use */ - .set_name = "@io-event", + .name = "@io-event", .value = "_newselect\0" "epoll_create1\0" @@ -191,9 +224,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "ppoll\0" "pselect6\0" "select\0" - }, { + }, + [SYSCALL_FILTER_SET_IPC] = { /* Message queues, SYSV IPC or other IPC: unusual */ - .set_name = "@ipc", + .name = "@ipc", .value = "ipc\0" "mq_getsetattr\0" "mq_notify\0" @@ -215,33 +249,36 @@ const SystemCallFilterSet syscall_filter_sets[] = { "shmctl\0" "shmdt\0" "shmget\0" - }, { + }, + [SYSCALL_FILTER_SET_KEYRING] = { /* Keyring */ - .set_name = "@keyring", + .name = "@keyring", .value = "add_key\0" "keyctl\0" "request_key\0" - }, { + }, + [SYSCALL_FILTER_SET_MODULE] = { /* Kernel module control */ - .set_name = "@module", + .name = "@module", .value = "delete_module\0" "finit_module\0" "init_module\0" - }, { + }, + [SYSCALL_FILTER_SET_MOUNT] = { /* Mounting */ - .set_name = "@mount", + .name = "@mount", .value = "chroot\0" "mount\0" - "oldumount\0" "pivot_root\0" "umount2\0" "umount\0" - }, { + }, + [SYSCALL_FILTER_SET_NETWORK_IO] = { /* Network or Unix socket IO, should not be needed if not network facing */ - .set_name = "@network-io", + .name = "@network-io", .value = "accept4\0" "accept\0" @@ -264,9 +301,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "socket\0" "socketcall\0" "socketpair\0" - }, { + }, + [SYSCALL_FILTER_SET_OBSOLETE] = { /* Unusual, obsolete or unimplemented, some unknown even to libseccomp */ - .set_name = "@obsolete", + .name = "@obsolete", .value = "_sysctl\0" "afs_syscall\0" @@ -292,9 +330,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "uselib\0" "ustat\0" "vserver\0" - }, { + }, + [SYSCALL_FILTER_SET_PRIVILEGED] = { /* Nice grab-bag of all system calls which need superuser capabilities */ - .set_name = "@privileged", + .name = "@privileged", .value = "@clock\0" "@module\0" @@ -331,11 +370,12 @@ const SystemCallFilterSet syscall_filter_sets[] = { "setuid\0" "swapoff\0" "swapon\0" - "sysctl\0" + "_sysctl\0" "vhangup\0" - }, { + }, + [SYSCALL_FILTER_SET_PROCESS] = { /* Process control, execution, namespaces */ - .set_name = "@process", + .name = "@process", .value = "arch_prctl\0" "clone\0" @@ -349,19 +389,90 @@ const SystemCallFilterSet syscall_filter_sets[] = { "tkill\0" "unshare\0" "vfork\0" - }, { + }, + [SYSCALL_FILTER_SET_RAW_IO] = { /* Raw I/O ports */ - .set_name = "@raw-io", + .name = "@raw-io", .value = "ioperm\0" "iopl\0" "pciconfig_iobase\0" "pciconfig_read\0" "pciconfig_write\0" +#ifdef __NR_s390_pci_mmio_read "s390_pci_mmio_read\0" +#endif +#ifdef __NR_s390_pci_mmio_write "s390_pci_mmio_write\0" - }, { - .set_name = NULL, - .value = NULL - } +#endif + }, }; + +const SyscallFilterSet *syscall_filter_set_find(const char *name) { + unsigned i; + + if (isempty(name) || name[0] != '@') + return NULL; + + for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) + if (streq(syscall_filter_sets[i].name, name)) + return syscall_filter_sets + i; + + return NULL; +} + +int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) { + const char *sys; + int r; + + assert(seccomp); + assert(set); + + NULSTR_FOREACH(sys, set->value) { + int id; + + if (sys[0] == '@') { + const SyscallFilterSet *other; + + other = syscall_filter_set_find(sys); + if (!other) + return -EINVAL; + + r = seccomp_add_syscall_filter_set(seccomp, other, action); + } else { + id = seccomp_syscall_resolve_name(sys); + if (id == __NR_SCMP_ERROR) + return -EINVAL; + + r = seccomp_rule_add(seccomp, action, id, 0); + } + if (r < 0) + return r; + } + + return 0; +} + +int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) { + scmp_filter_ctx seccomp; + int r; + + assert(set); + + /* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */ + + r = seccomp_init_conservative(&seccomp, default_action); + if (r < 0) + return r; + + r = seccomp_add_syscall_filter_set(seccomp, set, action); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; + +} diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index cca7c17912..8050fc6fbf 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -20,18 +20,45 @@ ***/ #include +#include #include const char* seccomp_arch_to_string(uint32_t c); int seccomp_arch_from_string(const char *n, uint32_t *ret); -int seccomp_add_secondary_archs(scmp_filter_ctx *c); +int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action); + +int seccomp_add_secondary_archs(scmp_filter_ctx c); bool is_seccomp_available(void); -typedef struct SystemCallFilterSet { - const char *set_name; +typedef struct SyscallFilterSet { + const char *name; const char *value; -} SystemCallFilterSet; +} SyscallFilterSet; -extern const SystemCallFilterSet syscall_filter_sets[]; +enum { + SYSCALL_FILTER_SET_CLOCK, + SYSCALL_FILTER_SET_CPU_EMULATION, + SYSCALL_FILTER_SET_DEBUG, + SYSCALL_FILTER_SET_DEFAULT, + SYSCALL_FILTER_SET_IO_EVENT, + SYSCALL_FILTER_SET_IPC, + SYSCALL_FILTER_SET_KEYRING, + SYSCALL_FILTER_SET_MODULE, + SYSCALL_FILTER_SET_MOUNT, + SYSCALL_FILTER_SET_NETWORK_IO, + SYSCALL_FILTER_SET_OBSOLETE, + SYSCALL_FILTER_SET_PRIVILEGED, + SYSCALL_FILTER_SET_PROCESS, + SYSCALL_FILTER_SET_RAW_IO, + _SYSCALL_FILTER_SET_MAX +}; + +extern const SyscallFilterSet syscall_filter_sets[]; + +const SyscallFilterSet *syscall_filter_set_find(const char *name); + +int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action); + +int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action); diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c new file mode 100644 index 0000000000..0060ecdf02 --- /dev/null +++ b/src/test/test-seccomp.c @@ -0,0 +1,103 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include +#include + +#include "fd-util.h" +#include "macro.h" +#include "process-util.h" +#include "seccomp-util.h" + +static void test_seccomp_arch_to_string(void) { + uint32_t a, b; + const char *name; + + a = seccomp_arch_native(); + assert_se(a > 0); + name = seccomp_arch_to_string(a); + assert_se(name); + assert_se(seccomp_arch_from_string(name, &b) >= 0); + assert_se(a == b); +} + +static void test_syscall_filter_set_find(void) { + assert_se(!syscall_filter_set_find(NULL)); + assert_se(!syscall_filter_set_find("")); + assert_se(!syscall_filter_set_find("quux")); + assert_se(!syscall_filter_set_find("@quux")); + + assert_se(syscall_filter_set_find("@clock") == syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK); + assert_se(syscall_filter_set_find("@default") == syscall_filter_sets + SYSCALL_FILTER_SET_DEFAULT); + assert_se(syscall_filter_set_find("@raw-io") == syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO); +} + +static void test_filter_sets(void) { + unsigned i; + int r; + + if (!is_seccomp_available()) + return; + + if (geteuid() != 0) + return; + + for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) { + pid_t pid; + + log_info("Testing %s", syscall_filter_sets[i].name); + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { /* Child? */ + int fd; + + if (i == SYSCALL_FILTER_SET_DEFAULT) /* if we look at the default set, whitelist instead of blacklist */ + r = seccomp_load_filter_set(SCMP_ACT_ERRNO(EPERM), syscall_filter_sets + i, SCMP_ACT_ALLOW); + else + r = seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + i, SCMP_ACT_ERRNO(EPERM)); + if (r < 0) + _exit(EXIT_FAILURE); + + /* Test the sycall filter with one random system call */ + fd = eventfd(0, EFD_NONBLOCK|EFD_CLOEXEC); + if (IN_SET(i, SYSCALL_FILTER_SET_IO_EVENT, SYSCALL_FILTER_SET_DEFAULT)) + assert_se(fd < 0 && errno == EPERM); + else { + assert_se(fd >= 0); + safe_close(fd); + } + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_warn(syscall_filter_sets[i].name, pid, true) == EXIT_SUCCESS); + } +} + +int main(int argc, char *argv[]) { + + test_seccomp_arch_to_string(); + test_syscall_filter_set_find(); + test_filter_sets(); + + return 0; +} diff --git a/src/update-done/update-done.c b/src/update-done/update-done.c index 5cc5abfddf..48c2a3fff4 100644 --- a/src/update-done/update-done.c +++ b/src/update-done/update-done.c @@ -18,6 +18,7 @@ ***/ #include "fd-util.h" +#include "fileio.h" #include "io-util.h" #include "selinux-util.h" #include "util.h" @@ -32,8 +33,8 @@ static int apply_timestamp(const char *path, struct timespec *ts) { *ts, *ts }; - int fd = -1; _cleanup_fclose_ FILE *f = NULL; + int fd = -1; int r; assert(path); @@ -59,18 +60,20 @@ static int apply_timestamp(const char *path, struct timespec *ts) { return log_error_errno(errno, "Failed to create/open timestamp file %s: %m", path); } - f = fdopen(fd, "w"); + f = fdopen(fd, "we"); if (!f) { safe_close(fd); return log_error_errno(errno, "Failed to fdopen() timestamp file %s: %m", path); } (void) fprintf(f, - "%s" - "TimestampNSec=" NSEC_FMT "\n", - MESSAGE, timespec_load_nsec(ts)); + MESSAGE + "TIMESTAMP_NSEC=" NSEC_FMT "\n", + timespec_load_nsec(ts)); - fflush(f); + r = fflush_and_check(f); + if (r < 0) + return log_error_errno(r, "Failed to write timestamp file: %m"); if (futimens(fd, twice) < 0) return log_error_errno(errno, "Failed to update timestamp on %s: %m", path);