From 915fb3243892b07962dfe3ae0a5b6302ba40bf53 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 29 Apr 2019 11:54:00 +0200 Subject: [PATCH 1/9] seccomp: add scmp_act_kill_process() helper that returns SCMP_ACT_KILL_PROCESS if supported --- src/shared/seccomp-util.c | 15 +++++++++++++++ src/shared/seccomp-util.h | 2 ++ 2 files changed, 17 insertions(+) diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 95e46a6aa4..72920ee7df 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -1964,3 +1964,18 @@ int seccomp_restrict_suid_sgid(void) { return 0; } + +uint32_t scmp_act_kill_process(void) { + + /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never + * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of + * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least + * for single-threaded apps does the right thing. */ + +#ifdef SCMP_ACT_KILL_PROCESS + if (seccomp_api_get() >= 3) + return SCMP_ACT_KILL_PROCESS; +#endif + + return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */ +} diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 2566d2d17f..1729dc1b6e 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -104,3 +104,5 @@ extern const uint32_t seccomp_local_archs[]; DEFINE_TRIVIAL_CLEANUP_FUNC(scmp_filter_ctx, seccomp_release); int parse_syscall_archs(char **l, Set **archs); + +uint32_t scmp_act_kill_process(void); From ccc16c7842a144af5df6accbb2f01281ee0d3129 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 29 Apr 2019 12:03:58 +0200 Subject: [PATCH 2/9] core: prefer SCMP_ACT_KILL_PROCESS for SystemCallFilter= behaviour If we have it, use it. It makes a ton more sense. Fixes: #11967 --- src/core/execute.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/execute.c b/src/core/execute.c index 9975de1ff5..e90c3ac4f3 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1439,7 +1439,7 @@ static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ if (skip_seccomp_unavailable(u, "SystemCallFilter=")) return 0; - negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno); + negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno); if (c->syscall_whitelist) { default_action = negative_action; From 7bbc229cf7539b70d0b3d89a567176f48ec7b583 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 29 Apr 2019 12:04:34 +0200 Subject: [PATCH 3/9] test: use the new action in our tests This way, we know that it works as intended. --- src/test/test-seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c index 9b7307cf39..a906070f9a 100644 --- a/src/test/test-seccomp.c +++ b/src/test/test-seccomp.c @@ -635,7 +635,7 @@ static void test_load_syscall_filter_set_raw(void) { assert_se(access("/", F_OK) >= 0); assert_se(poll(NULL, 0, 0) == 0); - assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, NULL, SCMP_ACT_KILL, true) >= 0); + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, NULL, scmp_act_kill_process(), true) >= 0); assert_se(access("/", F_OK) >= 0); assert_se(poll(NULL, 0, 0) == 0); From f9a3d8e2f3063beb07d72a931c75794786280b3e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 29 Apr 2019 12:04:55 +0200 Subject: [PATCH 4/9] nspawn: expose the new seccomp actions in the OCI logic --- src/nspawn/nspawn-oci.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/nspawn/nspawn-oci.c b/src/nspawn/nspawn-oci.c index 97323f31dd..b00ff289a6 100644 --- a/src/nspawn/nspawn-oci.c +++ b/src/nspawn/nspawn-oci.c @@ -1656,13 +1656,19 @@ static int oci_seccomp_action_from_string(const char *name, uint32_t *ret) { const char *name; uint32_t action; } table[] = { - { "SCMP_ACT_ALLOW", SCMP_ACT_ALLOW }, - { "SCMP_ACT_ERRNO", SCMP_ACT_ERRNO(EPERM) }, /* the OCI spec doesn't document the error, but it appears EPERM is supposed to be used */ - { "SCMP_ACT_KILL", SCMP_ACT_KILL }, -#ifdef SCMP_ACT_LOG - { "SCMP_ACT_LOG", SCMP_ACT_LOG }, + { "SCMP_ACT_ALLOW", SCMP_ACT_ALLOW }, + { "SCMP_ACT_ERRNO", SCMP_ACT_ERRNO(EPERM) }, /* the OCI spec doesn't document the error, but it appears EPERM is supposed to be used */ + { "SCMP_ACT_KILL", SCMP_ACT_KILL }, +#ifdef SCMP_ACT_KILL_PROCESS + { "SCMP_ACT_KILL_PROCESS", SCMP_ACT_KILL_PROCESS }, #endif - { "SCMP_ACT_TRAP", SCMP_ACT_TRAP }, +#ifdef SCMP_ACT_KILL_THREAD + { "SCMP_ACT_KILL_THREAD", SCMP_ACT_KILL_THREAD }, +#endif +#ifdef SCMP_ACT_LOG + { "SCMP_ACT_LOG", SCMP_ACT_LOG }, +#endif + { "SCMP_ACT_TRAP", SCMP_ACT_TRAP }, /* We don't support SCMP_ACT_TRACE because that requires a tracer, and that doesn't really make sense * here */ From 4cd8263166c2ddd0352e0818f18ac8c0dbdf4b0f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 29 Apr 2019 12:05:16 +0200 Subject: [PATCH 5/9] NEWS: document the new SystemCallFilter= behaviour --- NEWS | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/NEWS b/NEWS index 78c44db4a6..0592e697bb 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,22 @@ systemd System and Service Manager CHANGES WITH 243 in spe: + * Previously, filters defined with SystemCallFilter= would have the + effect that an calling an offending system call would terminate the + calling thread. This behaviour never made much sense, since killing + individual threads of unexpecting processes is likely to create more + problems than it solves. With this release the default action changed + from killing the thread to killing the whole process. For this to + work correctly both a kernel version (>= 4.14) and a libseccomp + version (>= 2.4.0) supporting this new seccomp action is required. If + an older kernel or libseccomp is used the old behaviour continues to + be used. This change does not affect any services that have no system + call filters defined, or that use SystemCallErrorNumber= (and thus + see EPERM or another error instead of being killed when calling an + offending system call). Note that systemd documentation always + claimed that the whole process is killed. With this change behaviour + is thus adjusted to match the documentation. + * The "kernel.pid_max" sysctl is now bumped to 4194304 by default, i.e. the full 22bit range the kernel allows, up from the old 16bit range. This should improve security and robustness a bit, as PID From 727a1a06077ed8395f72a142b04f37d6058d381b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 29 Apr 2019 12:05:33 +0200 Subject: [PATCH 6/9] service: tweak capitalization of unit description --- units/system-update-cleanup.service | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/units/system-update-cleanup.service b/units/system-update-cleanup.service index d5eca2546b..41abcd631c 100644 --- a/units/system-update-cleanup.service +++ b/units/system-update-cleanup.service @@ -8,7 +8,7 @@ # (at your option) any later version. [Unit] -Description=Remove the Offline System Updates symlink +Description=Remove the Offline System Updates Symlink Documentation=man:systemd.special(7) man:systemd.offline-updates(7) After=system-update.target DefaultDependencies=no From 18f8c5d4661195ac142d4e9554982ffbbe3b199e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 29 Apr 2019 12:54:26 +0200 Subject: [PATCH 7/9] test-execute: check exit code before exit status The meaning of the status changes depending on the code, hence let's always compare the code first, status second. --- src/test/test-execute.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/test/test-execute.c b/src/test/test-execute.c index 9f1cb0ca38..a27de296a4 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -62,18 +62,20 @@ static void check(const char *func, Manager *m, Unit *unit, int status_expected, } } exec_status_dump(&service->main_exec_status, stdout, "\t"); - if (service->main_exec_status.status != status_expected) { - log_error("%s: %s: exit status %d, expected %d", - func, unit->id, - service->main_exec_status.status, status_expected); - abort(); - } + if (service->main_exec_status.code != code_expected) { log_error("%s: %s: exit code %d, expected %d", func, unit->id, service->main_exec_status.code, code_expected); abort(); } + + if (service->main_exec_status.status != status_expected) { + log_error("%s: %s: exit status %d, expected %d", + func, unit->id, + service->main_exec_status.status, status_expected); + abort(); + } } static bool check_nobody_user_and_group(void) { From c3ab2c389ee60d92fb8d7fe779ae9c4e3c092e4c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 29 Apr 2019 12:58:55 +0200 Subject: [PATCH 8/9] test-execute: let's ignore the difference between CLD_KILLED and CLD_DUMPED Depending on system configuration and whether SCMP_ACT_KILL_PROCESS or SCMP_ACT_KILL_THREAD is available/used processes might coredump on specific coredumps or are just plain killed. For our test case the difference doesn't really matter, hence let's hide it away. --- src/test/test-execute.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/test/test-execute.c b/src/test/test-execute.c index a27de296a4..7f8ea2be98 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -33,6 +33,12 @@ static bool can_unshare; typedef void (*test_function_t)(Manager *m); +static int cld_dumped_to_killed(int code) { + /* Depending on the system, seccomp version, … some signals might result in dumping, others in plain + * killing. Let's ignore the difference here, and map both cases to CLD_KILLED */ + return code == CLD_DUMPED ? CLD_KILLED : code; +} + static void check(const char *func, Manager *m, Unit *unit, int status_expected, int code_expected) { Service *service = NULL; usec_t ts; @@ -63,7 +69,7 @@ static void check(const char *func, Manager *m, Unit *unit, int status_expected, } exec_status_dump(&service->main_exec_status, stdout, "\t"); - if (service->main_exec_status.code != code_expected) { + if (cld_dumped_to_killed(service->main_exec_status.code) != cld_dumped_to_killed(code_expected)) { log_error("%s: %s: exit code %d, expected %d", func, unit->id, service->main_exec_status.code, code_expected); From a429223d1767a9d6cb0a95fd2fa3b0d64ce3d4e7 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 29 Apr 2019 13:02:32 +0200 Subject: [PATCH 9/9] test-execute: turn off coredump generation in test services These services are likely to coredump, and we expect that but aren't interested in the coredump. Hence let's turn off processing by setting RLIMIT_CORE to 0/0. --- test/test-execute/exec-systemcallfilter-failing.service | 1 + test/test-execute/exec-systemcallfilter-failing2.service | 1 + 2 files changed, 2 insertions(+) diff --git a/test/test-execute/exec-systemcallfilter-failing.service b/test/test-execute/exec-systemcallfilter-failing.service index bcebc99507..996f859217 100644 --- a/test/test-execute/exec-systemcallfilter-failing.service +++ b/test/test-execute/exec-systemcallfilter-failing.service @@ -4,6 +4,7 @@ Description=Test for SystemCallFilter [Service] ExecStart=/bin/sh -c 'echo "This should not be seen"' Type=oneshot +LimitCORE=0 SystemCallFilter=ioperm SystemCallFilter=~ioperm SystemCallFilter=ioperm diff --git a/test/test-execute/exec-systemcallfilter-failing2.service b/test/test-execute/exec-systemcallfilter-failing2.service index 2fdc0ed772..c74f42248b 100644 --- a/test/test-execute/exec-systemcallfilter-failing2.service +++ b/test/test-execute/exec-systemcallfilter-failing2.service @@ -4,4 +4,5 @@ Description=Test for SystemCallFilter [Service] ExecStart=/bin/sh -c 'echo "This should not be seen"' Type=oneshot +LimitCORE=0 SystemCallFilter=~write open execve exit_group close mmap munmap fstat DONOTEXIST