diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index dbfc7692f7..ed02666daf 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1413,6 +1413,19 @@ + + RestrictRealtime= + + Takes a boolean argument. If set, any attempts to enable realtime scheduling in a process of + the unit are refused. This restricts access to realtime task scheduling policies such as + SCHED_FIFO, SCHED_RR or SCHED_DEADLINE. See + sched7 for details about + these scheduling policies. Realtime scheduling policies may be used to monopolize CPU time for longer periods + of time, and may hence be used to lock up or otherwise trigger Denial-of-Service situations on the system. It + is hence recommended to restrict access to realtime scheduling to the few programs that actually require + them. Defaults to off. + + diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 4c88c41127..644b9561b5 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -720,6 +720,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, runtime_directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_VTABLE_END }; @@ -1057,7 +1058,7 @@ int bus_exec_context_set_transient_property( } else if (STR_IN_SET(name, "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "PrivateTmp", "PrivateDevices", "PrivateNetwork", - "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute")) { + "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime")) { int b; r = sd_bus_message_read(message, "b", &b); @@ -1083,6 +1084,8 @@ int bus_exec_context_set_transient_property( c->syslog_level_prefix = b; else if (streq(name, "MemoryDenyWriteExecute")) c->memory_deny_write_execute = b; + else if (streq(name, "RestrictRealtime")) + c->restrict_realtime = b; unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b)); } diff --git a/src/core/execute.c b/src/core/execute.c index 3c3369373f..8cb18dbd5b 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1237,7 +1237,7 @@ static int apply_memory_deny_write_execute(const ExecContext *c) { r = seccomp_rule_add( seccomp, - SCMP_ACT_KILL, + SCMP_ACT_ERRNO(EPERM), SCMP_SYS(mmap), 1, SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE)); @@ -1246,7 +1246,7 @@ static int apply_memory_deny_write_execute(const ExecContext *c) { r = seccomp_rule_add( seccomp, - SCMP_ACT_KILL, + SCMP_ACT_ERRNO(EPERM), SCMP_SYS(mprotect), 1, SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC)); @@ -1264,6 +1264,76 @@ finish: return r; } +static int apply_restrict_realtime(const ExecContext *c) { + static const int permitted_policies[] = { + SCHED_OTHER, + SCHED_BATCH, + SCHED_IDLE, + }; + + scmp_filter_ctx *seccomp; + unsigned i; + int r, p, max_policy = 0; + + assert(c); + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + /* Determine the highest policy constant we want to allow */ + for (i = 0; i < ELEMENTSOF(permitted_policies); i++) + if (permitted_policies[i] > max_policy) + max_policy = permitted_policies[i]; + + /* Go through all policies with lower values than that, and block them -- unless they appear in the + * whitelist. */ + for (p = 0; p < max_policy; p++) { + bool good = false; + + /* Check if this is in the whitelist. */ + for (i = 0; i < ELEMENTSOF(permitted_policies); i++) + if (permitted_policies[i] == p) { + good = true; + break; + } + + if (good) + continue; + + /* Deny this policy */ + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(sched_setscheduler), + 1, + SCMP_A1(SCMP_CMP_EQ, p)); + if (r < 0) + goto finish; + } + + /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are unsigned here, + * hence no need no check for < 0 values. */ + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(sched_setscheduler), + 1, + SCMP_A1(SCMP_CMP_GT, max_policy)); + if (r < 0) + goto finish; + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; +} + #endif static void do_idle_pipe_dance(int idle_pipe[4]) { @@ -1951,10 +2021,20 @@ static int exec_child( int secure_bits = context->secure_bits; for (i = 0; i < _RLIMIT_MAX; i++) { + if (!context->rlimit[i]) continue; - if (setrlimit_closest(i, context->rlimit[i]) < 0) { + r = setrlimit_closest(i, context->rlimit[i]); + if (r < 0) { + *exit_status = EXIT_LIMITS; + return r; + } + } + + /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */ + if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) { + if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) { *exit_status = EXIT_LIMITS; return -errno; } @@ -2015,7 +2095,7 @@ static int exec_child( } if (context->no_new_privileges || - (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || use_syscall_filter))) + (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter))) if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) { *exit_status = EXIT_NO_NEW_PRIVILEGES; return -errno; @@ -2037,6 +2117,15 @@ static int exec_child( return r; } } + + if (context->restrict_realtime) { + r = apply_restrict_realtime(context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return r; + } + } + if (use_syscall_filter) { r = apply_seccomp(context); if (r < 0) { @@ -2472,7 +2561,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { "%sProtectHome: %s\n" "%sProtectSystem: %s\n" "%sIgnoreSIGPIPE: %s\n" - "%sMemoryDenyWriteExecute: %s\n", + "%sMemoryDenyWriteExecute: %s\n" + "%sRestrictRealtime: %s\n", prefix, c->umask, prefix, c->working_directory ? c->working_directory : "/", prefix, c->root_directory ? c->root_directory : "/", @@ -2483,7 +2573,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { prefix, protect_home_to_string(c->protect_home), prefix, protect_system_to_string(c->protect_system), prefix, yes_no(c->ignore_sigpipe), - prefix, yes_no(c->memory_deny_write_execute)); + prefix, yes_no(c->memory_deny_write_execute), + prefix, yes_no(c->restrict_realtime)); STRV_FOREACH(e, c->environment) fprintf(f, "%sEnvironment: %s\n", prefix, *e); diff --git a/src/core/execute.h b/src/core/execute.h index cd1f7b36f6..210eea0e82 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -193,12 +193,14 @@ struct ExecContext { char **runtime_directory; mode_t runtime_directory_mode; + bool memory_deny_write_execute; + bool restrict_realtime; + bool oom_score_adjust_set:1; bool nice_set:1; bool ioprio_set:1; bool cpu_sched_set:1; bool no_new_privileges_set:1; - bool memory_deny_write_execute; }; #include "cgroup-util.h" diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index eb58586523..fe1006830b 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -56,11 +56,13 @@ m4_ifdef(`HAVE_SECCOMP', $1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs) $1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context) $1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute) +$1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime) $1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)', `$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 $1.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 $1.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 $1.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 +$1.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 $1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') $1.LimitCPU, config_parse_limit, RLIMIT_CPU, offsetof($1, exec_context.rlimit) $1.LimitFSIZE, config_parse_limit, RLIMIT_FSIZE, offsetof($1, exec_context.rlimit) diff --git a/src/core/unit.c b/src/core/unit.c index 581962eba6..0a1a5321df 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3364,6 +3364,7 @@ int unit_write_drop_in(Unit *u, UnitSetPropertiesMode mode, const char *name, co /* When this is a transient unit file in creation, then let's not create a new drop-in but instead * write to the transient unit file. */ fputs(data, u->transient_file); + fputc('\n', u->transient_file); return 0; }