diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 7a7006b9a0..a9f1d8d74e 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1653,6 +1653,18 @@ personality of the host system's kernel. + + LockPersonality= + + Locks down the personality(2) system + call so that the kernel execution domain may not be changed from the default or the personality selected with the + Personality= directive. This may be useful to improve security, because odd personality + emulations may be poorly tested and a source of vulnerabilities. If running in user mode, or in system mode, but + without the CAP_SYS_ADMIN capability (e.g. setting User=), + NoNewPrivileges=yes is implied. + + RuntimeDirectory= diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 45497ca024..d28e8aafd6 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -853,6 +853,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("SystemCallArchitectures", "as", property_get_syscall_archs, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SystemCallErrorNumber", "i", property_get_syscall_errno, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Personality", "s", property_get_personality, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("LockPersonality", "b", bus_property_get_bool, offsetof(ExecContext, lock_personality), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RestrictAddressFamilies", "(bas)", property_get_address_families, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RuntimeDirectoryPreserve", "s", property_get_exec_preserve_mode, offsetof(ExecContext, runtime_directory_preserve_mode), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, directories[EXEC_DIRECTORY_RUNTIME].mode), SD_BUS_VTABLE_PROPERTY_CONST), diff --git a/src/core/execute.c b/src/core/execute.c index d192134b1c..4d285ff250 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1296,7 +1296,8 @@ static bool 
context_has_no_new_privileges(const ExecContext *c) { c->protect_kernel_modules || c->private_devices || context_has_syscall_filters(c) || - !set_isempty(c->syscall_archs); + !set_isempty(c->syscall_archs) || + c->lock_personality; } #ifdef HAVE_SECCOMP @@ -1455,6 +1456,25 @@ static int apply_restrict_namespaces(Unit *u, const ExecContext *c) { return seccomp_restrict_namespaces(c->restrict_namespaces); } +static int apply_lock_personality(const Unit* u, const ExecContext *c) { + unsigned long personality = c->personality; + + assert(u); + assert(c); + + if (!c->lock_personality) + return 0; + + if (skip_seccomp_unavailable(u, "LockPersonality=")) + return 0; + + /* If personality is not specified, use the default (Linux) */ + if (personality == PERSONALITY_INVALID) + personality = PER_LINUX; + + return seccomp_lock_personality(personality); +} + #endif static void do_idle_pipe_dance(int idle_pipe[4]) { @@ -2972,6 +2992,13 @@ static int exec_child( return r; } + r = apply_lock_personality(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + *error_message = strdup("Failed to lock personalities"); + return r; + } + /* This really should remain the last step before the execve(), to make sure our own code is unaffected * by the filter as little as possible. 
*/ r = apply_syscall_filter(unit, context, needs_ambient_hack); @@ -3733,6 +3760,10 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { "%sPersonality: %s\n", prefix, strna(personality_to_string(c->personality))); + fprintf(f, + "%sLockPersonality: %s\n", + prefix, yes_no(c->lock_personality)); + if (c->syscall_filter) { #ifdef HAVE_SECCOMP Iterator j; diff --git a/src/core/execute.h b/src/core/execute.h index 9a28269283..8a7ce8449b 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -227,6 +227,7 @@ struct ExecContext { bool same_pgrp; unsigned long personality; + bool lock_personality; unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */ diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 9a87f0acd3..94f3d657f6 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -60,14 +60,16 @@ $1.SystemCallErrorNumber, config_parse_syscall_errno, 0, $1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute) $1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context) $1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime) -$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)', +$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context) +$1.LockPersonality, config_parse_bool, 0, offsetof($1, exec_context.lock_personality)', `$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 $1.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 $1.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 $1.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 $1.RestrictNamespaces, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 $1.RestrictRealtime, config_parse_warn_compat, 
DISABLED_CONFIGURATION, 0 -$1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') +$1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.LockPersonality, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') $1.LimitCPU, config_parse_limit, RLIMIT_CPU, offsetof($1, exec_context.rlimit) $1.LimitFSIZE, config_parse_limit, RLIMIT_FSIZE, offsetof($1, exec_context.rlimit) $1.LimitDATA, config_parse_limit, RLIMIT_DATA, offsetof($1, exec_context.rlimit) diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index dd6d4fbdc7..bf2db28a82 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -29,6 +29,7 @@ #include "alloc-util.h" #include "macro.h" #include "nsflags.h" +#include "process-util.h" #include "seccomp-util.h" #include "set.h" #include "string-util.h" @@ -1402,3 +1403,21 @@ int seccomp_filter_set_add(Set *filter, bool add, const SyscallFilterSet *set) { return 0; } + +int seccomp_lock_personality(unsigned long personality) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int r; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(personality), + 1, + SCMP_A0(SCMP_CMP_NE, personality)); + if (r < 0) + return r; + + return seccomp_load(seccomp); +} diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 0edffa116d..ca43ba8659 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -78,6 +78,7 @@ int seccomp_protect_sysctl(void); int seccomp_restrict_address_families(Set *address_families, bool whitelist); int seccomp_restrict_realtime(void); int seccomp_memory_deny_write_execute(void); +int seccomp_lock_personality(unsigned long personality); extern const uint32_t seccomp_local_archs[]; diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c index 28fe206507..7ffbc4754e 100644 --- a/src/test/test-seccomp.c +++ 
b/src/test/test-seccomp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -565,6 +566,40 @@ static void test_load_syscall_filter_set_raw(void) { assert_se(wait_for_terminate_and_warn("syscallrawseccomp", pid, true) == EXIT_SUCCESS); } +static void test_lock_personality(void) { + pid_t pid; + + if (!is_seccomp_available()) + return; + if (geteuid() != 0) + return; + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + assert_se(seccomp_lock_personality(PER_LINUX) >= 0); + + assert_se(personality(PER_LINUX) == PER_LINUX); + assert_se(personality(PER_LINUX | ADDR_NO_RANDOMIZE) == -1 && errno == EPERM); + assert_se(personality(PER_LINUX | MMAP_PAGE_ZERO) == -1 && errno == EPERM); + assert_se(personality(PER_LINUX | ADDR_COMPAT_LAYOUT) == -1 && errno == EPERM); + assert_se(personality(PER_LINUX | READ_IMPLIES_EXEC) == -1 && errno == EPERM); + assert_se(personality(PER_LINUX_32BIT) == -1 && errno == EPERM); + assert_se(personality(PER_SVR4) == -1 && errno == EPERM); + assert_se(personality(PER_BSD) == -1 && errno == EPERM); + assert_se(personality(PER_LINUX32) == -1 && errno == EPERM); + assert_se(personality(PER_LINUX32_3GB) == -1 && errno == EPERM); + assert_se(personality(PER_UW7) == -1 && errno == EPERM); + assert_se(personality(0x42) == -1 && errno == EPERM); + assert_se(personality(PERSONALITY_INVALID) == -1 && errno == EPERM); /* maybe remove this later */ + assert_se(personality(PER_LINUX) == PER_LINUX); + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_warn("lockpersonalityseccomp", pid, true) == EXIT_SUCCESS); +} + int main(int argc, char *argv[]) { log_set_max_level(LOG_DEBUG); @@ -581,6 +616,7 @@ int main(int argc, char *argv[]) { test_memory_deny_write_execute_shmat(); test_restrict_archs(); test_load_syscall_filter_set_raw(); + test_lock_personality(); return 0; }