Merge pull request #3583 from poettering/restrict-realtime

add new RestrictRealtime= option to services (and other fixes)
This commit is contained in:
Ronny Chevalier 2016-06-23 13:11:58 +01:00 committed by GitHub
commit 48a97bb4dc
6 changed files with 120 additions and 8 deletions

View File

@ -1413,6 +1413,19 @@
</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>RestrictRealtime=</varname></term>
<listitem><para>Takes a boolean argument. If set, any attempts to enable realtime scheduling in a process of
the unit are refused. This restricts access to realtime task scheduling policies such as
<constant>SCHED_FIFO</constant>, <constant>SCHED_RR</constant> or <constant>SCHED_DEADLINE</constant>. See
<citerefentry><refentrytitle>sched</refentrytitle><manvolnum>7</manvolnum></citerefentry> for details about
these scheduling policies. Realtime scheduling policies may be used to monopolize CPU time for longer periods
of time, and may hence be used to lock up or otherwise trigger Denial-of-Service situations on the system. It
is hence recommended to restrict access to realtime scheduling to the few programs that actually require
them. Defaults to off.</para></listitem>
</varlistentry>
</variablelist>
</refsect1>

View File

@ -720,6 +720,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, runtime_directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_VTABLE_END
};
@ -1057,7 +1058,7 @@ int bus_exec_context_set_transient_property(
} else if (STR_IN_SET(name,
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
"PrivateTmp", "PrivateDevices", "PrivateNetwork",
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute")) {
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime")) {
int b;
r = sd_bus_message_read(message, "b", &b);
@ -1083,6 +1084,8 @@ int bus_exec_context_set_transient_property(
c->syslog_level_prefix = b;
else if (streq(name, "MemoryDenyWriteExecute"))
c->memory_deny_write_execute = b;
else if (streq(name, "RestrictRealtime"))
c->restrict_realtime = b;
unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
}

View File

@ -1237,7 +1237,7 @@ static int apply_memory_deny_write_execute(const ExecContext *c) {
r = seccomp_rule_add(
seccomp,
SCMP_ACT_KILL,
SCMP_ACT_ERRNO(EPERM),
SCMP_SYS(mmap),
1,
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
@ -1246,7 +1246,7 @@ static int apply_memory_deny_write_execute(const ExecContext *c) {
r = seccomp_rule_add(
seccomp,
SCMP_ACT_KILL,
SCMP_ACT_ERRNO(EPERM),
SCMP_SYS(mprotect),
1,
SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
@ -1264,6 +1264,76 @@ finish:
return r;
}
static int apply_restrict_realtime(const ExecContext *c) {
static const int permitted_policies[] = {
SCHED_OTHER,
SCHED_BATCH,
SCHED_IDLE,
};
scmp_filter_ctx *seccomp;
unsigned i;
int r, p, max_policy = 0;
assert(c);
seccomp = seccomp_init(SCMP_ACT_ALLOW);
if (!seccomp)
return -ENOMEM;
/* Determine the highest policy constant we want to allow */
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
if (permitted_policies[i] > max_policy)
max_policy = permitted_policies[i];
/* Go through all policies with lower values than that, and block them -- unless they appear in the
* whitelist. */
for (p = 0; p < max_policy; p++) {
bool good = false;
/* Check if this is in the whitelist. */
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
if (permitted_policies[i] == p) {
good = true;
break;
}
if (good)
continue;
/* Deny this policy */
r = seccomp_rule_add(
seccomp,
SCMP_ACT_ERRNO(EPERM),
SCMP_SYS(sched_setscheduler),
1,
SCMP_A1(SCMP_CMP_EQ, p));
if (r < 0)
goto finish;
}
/* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are unsigned here,
* hence no need no check for < 0 values. */
r = seccomp_rule_add(
seccomp,
SCMP_ACT_ERRNO(EPERM),
SCMP_SYS(sched_setscheduler),
1,
SCMP_A1(SCMP_CMP_GT, max_policy));
if (r < 0)
goto finish;
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0)
goto finish;
r = seccomp_load(seccomp);
finish:
seccomp_release(seccomp);
return r;
}
#endif
static void do_idle_pipe_dance(int idle_pipe[4]) {
@ -1951,10 +2021,20 @@ static int exec_child(
int secure_bits = context->secure_bits;
for (i = 0; i < _RLIMIT_MAX; i++) {
if (!context->rlimit[i])
continue;
if (setrlimit_closest(i, context->rlimit[i]) < 0) {
r = setrlimit_closest(i, context->rlimit[i]);
if (r < 0) {
*exit_status = EXIT_LIMITS;
return r;
}
}
/* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
*exit_status = EXIT_LIMITS;
return -errno;
}
@ -2015,7 +2095,7 @@ static int exec_child(
}
if (context->no_new_privileges ||
(!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || use_syscall_filter)))
(!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
*exit_status = EXIT_NO_NEW_PRIVILEGES;
return -errno;
@ -2037,6 +2117,15 @@ static int exec_child(
return r;
}
}
if (context->restrict_realtime) {
r = apply_restrict_realtime(context);
if (r < 0) {
*exit_status = EXIT_SECCOMP;
return r;
}
}
if (use_syscall_filter) {
r = apply_seccomp(context);
if (r < 0) {
@ -2472,7 +2561,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
"%sProtectHome: %s\n"
"%sProtectSystem: %s\n"
"%sIgnoreSIGPIPE: %s\n"
"%sMemoryDenyWriteExecute: %s\n",
"%sMemoryDenyWriteExecute: %s\n"
"%sRestrictRealtime: %s\n",
prefix, c->umask,
prefix, c->working_directory ? c->working_directory : "/",
prefix, c->root_directory ? c->root_directory : "/",
@ -2483,7 +2573,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
prefix, protect_home_to_string(c->protect_home),
prefix, protect_system_to_string(c->protect_system),
prefix, yes_no(c->ignore_sigpipe),
prefix, yes_no(c->memory_deny_write_execute));
prefix, yes_no(c->memory_deny_write_execute),
prefix, yes_no(c->restrict_realtime));
STRV_FOREACH(e, c->environment)
fprintf(f, "%sEnvironment: %s\n", prefix, *e);

View File

@ -193,12 +193,14 @@ struct ExecContext {
char **runtime_directory;
mode_t runtime_directory_mode;
bool memory_deny_write_execute;
bool restrict_realtime;
bool oom_score_adjust_set:1;
bool nice_set:1;
bool ioprio_set:1;
bool cpu_sched_set:1;
bool no_new_privileges_set:1;
bool memory_deny_write_execute;
};
#include "cgroup-util.h"

View File

@ -56,11 +56,13 @@ m4_ifdef(`HAVE_SECCOMP',
$1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs)
$1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context)
$1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute)
$1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime)
$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)',
`$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0')
$1.LimitCPU, config_parse_limit, RLIMIT_CPU, offsetof($1, exec_context.rlimit)
$1.LimitFSIZE, config_parse_limit, RLIMIT_FSIZE, offsetof($1, exec_context.rlimit)

View File

@ -3364,6 +3364,7 @@ int unit_write_drop_in(Unit *u, UnitSetPropertiesMode mode, const char *name, co
/* When this is a transient unit file in creation, then let's not create a new drop-in but instead
* write to the transient unit file. */
fputs(data, u->transient_file);
fputc('\n', u->transient_file);
return 0;
}