core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.
This commit is contained in:
Lennart Poettering 2016-08-22 18:43:59 +02:00 committed by Djalal Harouni
parent 72246c2a65
commit 59eeb84ba6
9 changed files with 159 additions and 16 deletions

View File

@ -1059,6 +1059,26 @@
Defaults to off.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>ProtectKernelTunables=</varname></term>
<listitem><para>Takes a boolean argument. If true, kernel variables accessible through
<filename>/proc/sys</filename> and <filename>/sys</filename> will be made read-only to all processes of the
unit. Usually, tunable kernel variables should only be written at boot-time, with the
<citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> mechanism. Almost
no services need to write to these at runtime; it is hence recommended to turn this on for most
services. Defaults to off.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>ProtectControlGroups=</varname></term>
<listitem><para>Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible
through <filename>/sys/fs/cgroup</filename> will be made read-only to all processes of the unit. Except for
container managers, no services should require write access to the control groups hierarchies; it is hence
recommended to turn this on for most services. Defaults to off.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>MountFlags=</varname></term>

View File

@ -707,6 +707,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
@ -1072,7 +1074,8 @@ int bus_exec_context_set_transient_property(
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
"RestrictRealtime", "DynamicUser", "RemoveIPC")) {
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
"ProtectControlGroups")) {
int b;
r = sd_bus_message_read(message, "b", &b);
@ -1106,6 +1109,10 @@ int bus_exec_context_set_transient_property(
c->dynamic_user = b;
else if (streq(name, "RemoveIPC"))
c->remove_ipc = b;
else if (streq(name, "ProtectKernelTunables"))
c->protect_kernel_tunables = b;
else if (streq(name, "ProtectControlGroups"))
c->protect_control_groups = b;
unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
}

View File

@ -1383,6 +1383,45 @@ finish:
return r;
}
/* Loads a seccomp filter that allows everything by default but makes the legacy _sysctl() system call
 * fail with EPERM. Many distributions already compile that call out of the kernel; this covers the
 * systems where it is still available.
 *
 * Returns 0 without doing anything if skip_seccomp_unavailable() reports true, -ENOMEM if the filter
 * context cannot be allocated, and otherwise the result of the first failing libseccomp call, or of
 * seccomp_load() if all preceding steps succeeded. */
static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
        scmp_filter_ctx *filter;
        int ret;

        assert(c);

        if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
                return 0;

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return -ENOMEM;

        /* Each step only runs if every previous one succeeded; the filter context is released on all paths. */
        ret = seccomp_add_secondary_archs(filter);

        if (ret >= 0)
                ret = seccomp_rule_add(filter, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(_sysctl), 0);

        if (ret >= 0)
                /* Clear the NNP attribute on the filter before loading it. */
                ret = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);

        if (ret >= 0)
                ret = seccomp_load(filter);

        seccomp_release(filter);
        return ret;
}
#endif
static void do_idle_pipe_dance(int idle_pipe[4]) {
@ -1589,7 +1628,9 @@ static bool exec_needs_mount_namespace(
if (context->private_devices ||
context->protect_system != PROTECT_SYSTEM_NO ||
context->protect_home != PROTECT_HOME_NO)
context->protect_home != PROTECT_HOME_NO ||
context->protect_kernel_tunables ||
context->protect_control_groups)
return true;
return false;
@ -1804,6 +1845,37 @@ static int close_remaining_fds(
return close_all_fds(dont_close, n_dont_close);
}
/* Tells whether the context carries any address family restriction: either the whitelist flag is set or
 * the set of address families is non-empty. */
static bool context_has_address_families(const ExecContext *c) {
        assert(c);

        if (c->address_families_whitelist)
                return true;

        return !set_isempty(c->address_families);
}
/* Tells whether the context carries any system call filter configuration: the whitelist flag is set, or
 * the syscall filter set or the syscall architecture set is non-empty. */
static bool context_has_syscall_filters(const ExecContext *c) {
        assert(c);

        if (c->syscall_whitelist)
                return true;

        if (!set_isempty(c->syscall_filter))
                return true;

        return !set_isempty(c->syscall_archs);
}
/* Decides whether the no-new-privileges flag should be set for this context. True if it was requested
 * explicitly; otherwise false when we hold CAP_SYS_ADMIN; otherwise true if any of the seccomp-based
 * settings checked below is in use. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* Explicitly requested. */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP. */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We need NNP if we have any form of seccomp and are unprivileged. */
        if (context_has_address_families(c))
                return true;

        if (c->memory_deny_write_execute || c->restrict_realtime || c->protect_kernel_tunables)
                return true;

        return context_has_syscall_filters(c);
}
static int send_user_lookup(
Unit *unit,
int user_lookup_fd,
@ -2255,6 +2327,8 @@ static int exec_child(
tmp,
var,
context->private_devices,
context->protect_kernel_tunables,
context->protect_control_groups,
context->protect_home,
context->protect_system,
context->mount_flags);
@ -2343,11 +2417,6 @@ static int exec_child(
if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
bool use_address_families = context->address_families_whitelist ||
!set_isempty(context->address_families);
bool use_syscall_filter = context->syscall_whitelist ||
!set_isempty(context->syscall_filter) ||
!set_isempty(context->syscall_archs);
int secure_bits = context->secure_bits;
for (i = 0; i < _RLIMIT_MAX; i++) {
@ -2424,15 +2493,14 @@ static int exec_child(
return -errno;
}
if (context->no_new_privileges ||
(!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
if (context_has_no_new_privileges(context))
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
*exit_status = EXIT_NO_NEW_PRIVILEGES;
return -errno;
}
#ifdef HAVE_SECCOMP
if (use_address_families) {
if (context_has_address_families(context)) {
r = apply_address_families(unit, context);
if (r < 0) {
*exit_status = EXIT_ADDRESS_FAMILIES;
@ -2456,7 +2524,15 @@ static int exec_child(
}
}
if (use_syscall_filter) {
if (context->protect_kernel_tunables) {
r = apply_protect_sysctl(unit, context);
if (r < 0) {
*exit_status = EXIT_SECCOMP;
return r;
}
}
if (context_has_syscall_filters(context)) {
r = apply_seccomp(unit, context);
if (r < 0) {
*exit_status = EXIT_SECCOMP;
@ -2888,6 +2964,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
"%sNonBlocking: %s\n"
"%sPrivateTmp: %s\n"
"%sPrivateDevices: %s\n"
"%sProtectKernelTunables: %s\n"
"%sProtectControlGroups: %s\n"
"%sPrivateNetwork: %s\n"
"%sPrivateUsers: %s\n"
"%sProtectHome: %s\n"
@ -2901,6 +2979,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
prefix, yes_no(c->non_blocking),
prefix, yes_no(c->private_tmp),
prefix, yes_no(c->private_devices),
prefix, yes_no(c->protect_kernel_tunables),
prefix, yes_no(c->protect_control_groups),
prefix, yes_no(c->private_network),
prefix, yes_no(c->private_users),
prefix, protect_home_to_string(c->protect_home),

View File

@ -174,6 +174,8 @@ struct ExecContext {
bool private_users;
ProtectSystem protect_system;
ProtectHome protect_home;
bool protect_kernel_tunables;
bool protect_control_groups;
bool no_new_privileges;

View File

@ -89,6 +89,8 @@ $1.ReadOnlyPaths, config_parse_namespace_path_strv, 0,
$1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths)
$1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp)
$1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices)
$1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables)
$1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups)
$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users)
$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context)

View File

@ -53,7 +53,7 @@ typedef enum MountMode {
PRIVATE_TMP,
PRIVATE_VAR_TMP,
PRIVATE_DEV,
READWRITE
READWRITE,
} MountMode;
typedef struct BindMount {
@ -366,6 +366,8 @@ int setup_namespace(
const char* tmp_dir,
const char* var_tmp_dir,
bool private_dev,
bool protect_sysctl,
bool protect_cgroups,
ProtectHome protect_home,
ProtectSystem protect_system,
unsigned long mount_flags) {
@ -385,6 +387,8 @@ int setup_namespace(
strv_length(read_only_paths) +
strv_length(inaccessible_paths) +
private_dev +
(protect_sysctl ? 3 : 0) +
(protect_cgroups != protect_sysctl) +
(protect_home != PROTECT_HOME_NO ? 3 : 0) +
(protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
(protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
@ -421,6 +425,27 @@ int setup_namespace(
m++;
}
if (protect_sysctl) {
m->path = prefix_roota(root_directory, "/proc/sys");
m->mode = READONLY;
m++;
m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
m->mode = READONLY;
m->ignore = true; /* Not always compiled into the kernel */
m++;
m->path = prefix_roota(root_directory, "/sys");
m->mode = READONLY;
m++;
}
if (protect_cgroups != protect_sysctl) {
m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
m->mode = protect_cgroups ? READONLY : READWRITE;
m++;
}
if (protect_home != PROTECT_HOME_NO) {
const char *home_dir, *run_user_dir, *root_dir;
@ -505,9 +530,12 @@ int setup_namespace(
fail:
if (n > 0) {
for (m = mounts; m < mounts + n; ++m)
if (m->done)
(void) umount2(m->path, MNT_DETACH);
for (m = mounts; m < mounts + n; ++m) {
if (!m->done)
continue;
(void) umount2(m->path, MNT_DETACH);
}
}
return r;

View File

@ -46,6 +46,8 @@ int setup_namespace(const char *chroot,
const char *tmp_dir,
const char *var_tmp_dir,
bool private_dev,
bool protect_sysctl,
bool protect_cgroups,
ProtectHome protect_home,
ProtectSystem protect_system,
unsigned long mount_flags);

View File

@ -204,7 +204,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
"SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
"RestrictRealtime", "DynamicUser", "RemoveIPC")) {
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) {
r = parse_boolean(eq);
if (r < 0)

View File

@ -69,6 +69,8 @@ int main(int argc, char *argv[]) {
tmp_dir,
var_tmp_dir,
true,
true,
true,
PROTECT_HOME_NO,
PROTECT_SYSTEM_NO,
0);