Merge pull request #4243 from endocode/djalal/sandbox-first-protection-kernelmodules-v1

core:sandbox: Add ProtectKernelModules= and some fixes
This commit is contained in:
Lennart Poettering 2016-10-13 18:36:29 +02:00 committed by GitHub
commit 8bfdf29b24
17 changed files with 211 additions and 32 deletions

View File

@ -1639,6 +1639,9 @@ EXTRA_DIST += \
test/test-execute/exec-privatedevices-yes.service \
test/test-execute/exec-privatedevices-no-capability-mknod.service \
test/test-execute/exec-privatedevices-yes-capability-mknod.service \
test/test-execute/exec-protectkernelmodules-no-capabilities.service \
test/test-execute/exec-protectkernelmodules-yes-capabilities.service \
test/test-execute/exec-protectkernelmodules-yes-mount-propagation.service \
test/test-execute/exec-privatetmp-no.service \
test/test-execute/exec-privatetmp-yes.service \
test/test-execute/exec-readonlypaths.service \

View File

@ -946,8 +946,8 @@
<filename>/dev/port</filename> and others. This is useful to securely turn off physical device access by the
executed process. Defaults to false. Enabling this option will install a system call filter to block low-level
I/O system calls that are grouped in the <varname>@raw-io</varname> set, will also remove
<constant>CAP_MKNOD</constant> from the capability bounding set for the unit (see above), and set
<varname>DevicePolicy=closed</varname> (see
<constant>CAP_MKNOD</constant> and <constant>CAP_SYS_RAWIO</constant> from the capability bounding set for
the unit (see above), and set <varname>DevicePolicy=closed</varname> (see
<citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>
for details). Note that using this setting will disconnect propagation of mounts from the service to the host
(propagation in the opposite direction continues to work). This means that this setting may not be used for
@ -1046,7 +1046,10 @@
boot-time, with the <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>
mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for
most services. For this setting the same restrictions regarding mount propagation and privileges apply as for
<varname>ReadOnlyPaths=</varname> and related calls, see above. Defaults to off.</para></listitem>
<varname>ReadOnlyPaths=</varname> and related calls, see above. Defaults to off.
Note that this option does not prevent kernel tuning through IPC interfaces and external programs. However,
<varname>InaccessiblePaths=</varname> can be used to make some IPC file system objects
inaccessible.</para></listitem>
</varlistentry>
<varlistentry>
@ -1404,6 +1407,26 @@
logging. This does not affect commands prefixed with <literal>+</literal>.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>ProtectKernelModules=</varname></term>
<listitem><para>Takes a boolean argument. If true, explicit module loading will
be denied. This allows turning off module load and unload operations on modular
kernels. It is recommended to turn this on for most services that do not need special
file systems or extra kernel modules to work. Defaults to off. Enabling this option
removes <constant>CAP_SYS_MODULE</constant> from the capability bounding set for
the unit, installs a system call filter to block module system calls,
and makes <filename>/usr/lib/modules</filename> inaccessible. For this
setting the same restrictions regarding mount propagation and privileges
apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above.
Note that limited automatic module loading due to user configuration or kernel
mapping tables might still happen as a side effect of requested user operations,
both privileged and unprivileged. To disable the module auto-load feature, please see the
<citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>
<constant>kernel.modules_disabled</constant> mechanism and
<filename>/proc/sys/kernel/modules_disabled</filename> documentation.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>Personality=</varname></term>

View File

@ -708,6 +708,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
@ -1075,7 +1076,7 @@ int bus_exec_context_set_transient_property(
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
"ProtectControlGroups")) {
"ProtectKernelModules", "ProtectControlGroups")) {
int b;
r = sd_bus_message_read(message, "b", &b);
@ -1111,6 +1112,8 @@ int bus_exec_context_set_transient_property(
c->remove_ipc = b;
else if (streq(name, "ProtectKernelTunables"))
c->protect_kernel_tunables = b;
else if (streq(name, "ProtectKernelModules"))
c->protect_kernel_modules = b;
else if (streq(name, "ProtectControlGroups"))
c->protect_control_groups = b;

View File

@ -1436,6 +1436,50 @@ finish:
return r;
}
/* Implements the system call side of ProtectKernelModules=yes: loads a seccomp filter that allows everything
 * except the kernel module management calls, which fail with EPERM.
 *
 * Returns 0 without installing anything when skip_seccomp_unavailable() asks for the setting to be skipped,
 * -ENOMEM when no filter context could be allocated, and otherwise the result of the last libseccomp call
 * that was made (negative on failure). */
static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) {
        static const int blocked[] = {
                SCMP_SYS(delete_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(init_module),
        };

        scmp_filter_ctx *filter;
        unsigned k = 0;
        int r;

        assert(c);

        if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
                return 0;

        /* Default action is "allow"; only the calls listed above get a rule. */
        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return -ENOMEM;

        r = seccomp_add_secondary_archs(filter);

        /* Add one EPERM rule per module syscall, stopping at the first failure. */
        while (r >= 0 && k < ELEMENTSOF(blocked)) {
                r = seccomp_rule_add(filter, SCMP_ACT_ERRNO(EPERM), blocked[k], 0);
                k++;
        }

        /* Clear the filter's NNP control attribute before loading it. */
        if (r >= 0)
                r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);

        if (r >= 0)
                r = seccomp_load(filter);

        /* The filter context is released on every path once it has been allocated. */
        seccomp_release(filter);
        return r;
}
static int apply_private_devices(Unit *u, const ExecContext *c) {
const SystemCallFilterSet *set;
scmp_filter_ctx *seccomp;
@ -1722,6 +1766,7 @@ static bool exec_needs_mount_namespace(
context->protect_system != PROTECT_SYSTEM_NO ||
context->protect_home != PROTECT_HOME_NO ||
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_control_groups)
return true;
@ -2070,6 +2115,8 @@ static bool context_has_no_new_privileges(const ExecContext *c) {
c->memory_deny_write_execute ||
c->restrict_realtime ||
c->protect_kernel_tunables ||
c->protect_kernel_modules ||
c->private_devices ||
context_has_syscall_filters(c);
}
@ -2449,6 +2496,12 @@ static int exec_child(
if (needs_mount_namespace) {
_cleanup_free_ char **rw = NULL;
char *tmp = NULL, *var = NULL;
NameSpaceInfo ns_info = {
.private_dev = context->private_devices,
.protect_control_groups = context->protect_control_groups,
.protect_kernel_tunables = context->protect_kernel_tunables,
.protect_kernel_modules = context->protect_kernel_modules,
};
/* The runtime struct only contains the parent
* of the private /tmp, which is
@ -2471,14 +2524,12 @@ static int exec_child(
r = setup_namespace(
(params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL,
&ns_info,
rw,
context->read_only_paths,
context->inaccessible_paths,
tmp,
var,
context->private_devices,
context->protect_kernel_tunables,
context->protect_control_groups,
context->protect_home,
context->protect_system,
context->mount_flags);
@ -2690,6 +2741,14 @@ static int exec_child(
}
}
if (context->protect_kernel_modules) {
r = apply_protect_kernel_modules(unit, context);
if (r < 0) {
*exit_status = EXIT_SECCOMP;
return r;
}
}
if (context->private_devices) {
r = apply_private_devices(unit, context);
if (r < 0) {
@ -3131,6 +3190,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
"%sPrivateTmp: %s\n"
"%sPrivateDevices: %s\n"
"%sProtectKernelTunables: %s\n"
"%sProtectKernelModules: %s\n"
"%sProtectControlGroups: %s\n"
"%sPrivateNetwork: %s\n"
"%sPrivateUsers: %s\n"
@ -3146,6 +3206,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
prefix, yes_no(c->private_tmp),
prefix, yes_no(c->private_devices),
prefix, yes_no(c->protect_kernel_tunables),
prefix, yes_no(c->protect_kernel_modules),
prefix, yes_no(c->protect_control_groups),
prefix, yes_no(c->private_network),
prefix, yes_no(c->private_users),

View File

@ -175,6 +175,7 @@ struct ExecContext {
ProtectSystem protect_system;
ProtectHome protect_home;
bool protect_kernel_tunables;
bool protect_kernel_modules;
bool protect_control_groups;
bool no_new_privileges;

View File

@ -90,6 +90,7 @@ $1.InaccessiblePaths, config_parse_namespace_path_strv, 0,
$1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp)
$1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices)
$1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables)
$1.ProtectKernelModules, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_modules)
$1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups)
$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users)

View File

@ -97,6 +97,14 @@ static const TargetMount protect_kernel_tunables_table[] = {
{ "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
};
/* ProtectKernelModules= option: kernel module directories that are mounted INACCESSIBLE when the option is
 * enabled. /lib/modules only gets its own entry when built with HAVE_SPLIT_USR; /usr/lib/modules is always
 * listed.
 * NOTE(review): the third field is presumably TargetMount's "ignore if the path is missing" flag — confirm
 * against the struct definition, which is not visible in this hunk. */
static const TargetMount protect_kernel_modules_table[] = {
#ifdef HAVE_SPLIT_USR
{ "/lib/modules", INACCESSIBLE, true },
#endif
{ "/usr/lib/modules", INACCESSIBLE, true },
};
/*
* ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
* system should be protected by ProtectSystem=
@ -207,6 +215,13 @@ static int append_protect_kernel_tunables(BindMount **p, const char *root_direct
ELEMENTSOF(protect_kernel_tunables_table));
}
/* Passes every entry of protect_kernel_modules_table to append_target_mounts() for the mount list at *p,
 * using root_directory as given. Returns whatever append_target_mounts() returns. */
static int append_protect_kernel_modules(BindMount **p, const char *root_directory) {
        int r;

        assert(p);

        r = append_target_mounts(p, root_directory,
                                 protect_kernel_modules_table,
                                 ELEMENTSOF(protect_kernel_modules_table));
        return r;
}
static int append_protect_home(BindMount **p, const char *root_directory, ProtectHome protect_home) {
int r = 0;
@ -660,14 +675,12 @@ static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned
}
static unsigned namespace_calculate_mounts(
const NameSpaceInfo *ns_info,
char** read_write_paths,
char** read_only_paths,
char** inaccessible_paths,
const char* tmp_dir,
const char* var_tmp_dir,
bool private_dev,
bool protect_sysctl,
bool protect_cgroups,
ProtectHome protect_home,
ProtectSystem protect_system) {
@ -690,22 +703,21 @@ static unsigned namespace_calculate_mounts(
strv_length(read_write_paths) +
strv_length(read_only_paths) +
strv_length(inaccessible_paths) +
private_dev +
(protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
(protect_cgroups ? 1 : 0) +
ns_info->private_dev +
(ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
(ns_info->protect_control_groups ? 1 : 0) +
(ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
protect_home_cnt + protect_system_cnt;
}
int setup_namespace(
const char* root_directory,
const NameSpaceInfo *ns_info,
char** read_write_paths,
char** read_only_paths,
char** inaccessible_paths,
const char* tmp_dir,
const char* var_tmp_dir,
bool private_dev,
bool protect_sysctl,
bool protect_cgroups,
ProtectHome protect_home,
ProtectSystem protect_system,
unsigned long mount_flags) {
@ -718,13 +730,12 @@ int setup_namespace(
if (mount_flags == 0)
mount_flags = MS_SHARED;
n = namespace_calculate_mounts(read_write_paths,
n = namespace_calculate_mounts(ns_info,
read_write_paths,
read_only_paths,
inaccessible_paths,
tmp_dir, var_tmp_dir,
private_dev, protect_sysctl,
protect_cgroups, protect_home,
protect_system);
protect_home, protect_system);
/* Set mount slave mode */
if (root_directory || n > 0)
@ -756,16 +767,25 @@ int setup_namespace(
m++;
}
if (private_dev) {
if (ns_info->private_dev) {
m->path = prefix_roota(root_directory, "/dev");
m->mode = PRIVATE_DEV;
m++;
}
if (protect_sysctl)
append_protect_kernel_tunables(&m, root_directory);
if (ns_info->protect_kernel_tunables) {
r = append_protect_kernel_tunables(&m, root_directory);
if (r < 0)
return r;
}
if (protect_cgroups) {
if (ns_info->protect_kernel_modules) {
r = append_protect_kernel_modules(&m, root_directory);
if (r < 0)
return r;
}
if (ns_info->protect_control_groups) {
m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
m->mode = READONLY;
m++;

View File

@ -4,6 +4,7 @@
This file is part of systemd.
Copyright 2010 Lennart Poettering
Copyright 2016 Djalal Harouni
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
@ -19,6 +20,8 @@
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
typedef struct NameSpaceInfo NameSpaceInfo;
#include <stdbool.h>
#include "macro.h"
@ -40,15 +43,20 @@ typedef enum ProtectSystem {
_PROTECT_SYSTEM_INVALID = -1
} ProtectSystem;
/* Boolean namespace options, bundled so setup_namespace() takes one pointer instead of a separate bool
 * argument for each. Every member is a one-bit flag. */
struct NameSpaceInfo {
/* Mount a private /dev (filled from ExecContext.private_devices by exec_child()) */
bool private_dev:1;
/* Mount /sys/fs/cgroup read-only (ProtectControlGroups=) */
bool protect_control_groups:1;
/* Apply the protect_kernel_tunables_table mounts (ProtectKernelTunables=) */
bool protect_kernel_tunables:1;
/* Apply the protect_kernel_modules_table mounts (ProtectKernelModules=) */
bool protect_kernel_modules:1;
};
int setup_namespace(const char *chroot,
const NameSpaceInfo *ns_info,
char **read_write_paths,
char **read_only_paths,
char **inaccessible_paths,
const char *tmp_dir,
const char *var_tmp_dir,
bool private_dev,
bool protect_sysctl,
bool protect_cgroups,
ProtectHome protect_home,
ProtectSystem protect_system,
unsigned long mount_flags);

View File

@ -3399,7 +3399,10 @@ int unit_patch_contexts(Unit *u) {
ec->no_new_privileges = true;
if (ec->private_devices)
ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_MKNOD);
ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO));
if (ec->protect_kernel_modules)
ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE);
if (ec->dynamic_user) {
if (!ec->user) {

View File

@ -204,7 +204,8 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
"SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) {
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
"ProtectKernelModules", "ProtectControlGroups")) {
r = parse_boolean(eq);
if (r < 0)

View File

@ -140,6 +140,19 @@ static void test_exec_privatedevices_capabilities(Manager *m) {
}
test(m, "exec-privatedevices-yes-capability-mknod.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-no-capability-mknod.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-yes-capability-sys-rawio.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-no-capability-sys-rawio.service", 0, CLD_EXITED);
}
/* Runs the ProtectKernelModules= test units in a fixed order, each expected to exit with status 0
 * (CLD_EXITED). Nothing is run when detect_container() reports a container. */
static void test_exec_protectkernelmodules(Manager *m) {
        static const char *const units[] = {
                "exec-protectkernelmodules-no-capabilities.service",
                "exec-protectkernelmodules-yes-capabilities.service",
                "exec-protectkernelmodules-yes-mount-propagation.service",
        };
        unsigned i;

        if (detect_container() > 0) {
                log_notice("testing in container, skipping protectkernelmodules tests");
                return;
        }

        for (i = 0; i < sizeof(units) / sizeof(units[0]); i++)
                test(m, units[i], 0, CLD_EXITED);
}
static void test_exec_readonlypaths(Manager *m) {
@ -368,6 +381,7 @@ int main(int argc, char *argv[]) {
test_exec_privatetmp,
test_exec_privatedevices,
test_exec_privatedevices_capabilities,
test_exec_protectkernelmodules,
test_exec_readonlypaths,
test_exec_readwritepaths,
test_exec_inaccessiblepaths,

View File

@ -45,6 +45,14 @@ int main(int argc, char *argv[]) {
"/home/lennart/projects",
NULL
};
static const NameSpaceInfo ns_info = {
.private_dev = true,
.protect_control_groups = true,
.protect_kernel_tunables = true,
.protect_kernel_modules = true,
};
char *root_directory;
char *projects_directory;
int r;
@ -69,14 +77,12 @@ int main(int argc, char *argv[]) {
log_info("Not chrooted");
r = setup_namespace(root_directory,
&ns_info,
(char **) writable,
(char **) readonly,
(char **) inaccessible,
tmp_dir,
var_tmp_dir,
true,
true,
true,
PROTECT_HOME_NO,
PROTECT_SYSTEM_NO,
0);

View File

@ -0,0 +1,7 @@
[Unit]
Description=Test CAP_SYS_RAWIO capability for PrivateDevices=no
[Service]
PrivateDevices=no
ExecStart=/bin/sh -x -c 'capsh --print | grep cap_sys_rawio'
Type=oneshot

View File

@ -0,0 +1,7 @@
[Unit]
Description=Test CAP_SYS_RAWIO capability for PrivateDevices=yes
[Service]
PrivateDevices=yes
ExecStart=/bin/sh -x -c '! capsh --print | grep cap_sys_rawio'
Type=oneshot

View File

@ -0,0 +1,7 @@
[Unit]
Description=Test CAP_SYS_MODULE ProtectKernelModules=no
[Service]
ProtectKernelModules=no
ExecStart=/bin/sh -x -c 'capsh --print | grep cap_sys_module'
Type=oneshot

View File

@ -0,0 +1,7 @@
[Unit]
Description=Test CAP_SYS_MODULE for ProtectKernelModules=yes
[Service]
ProtectKernelModules=yes
ExecStart=/bin/sh -x -c '! capsh --print | grep cap_sys_module'
Type=oneshot

View File

@ -0,0 +1,7 @@
[Unit]
Description=Test to make sure that passing ProtectKernelModules=yes disconnect mount propagation
[Service]
ProtectKernelModules=yes
ExecStart=/bin/sh -x -c 'mkdir -p /TEST; mount -t tmpfs tmpfs /TEST; grep TEST /proc/self/mountinfo && ! grep TEST /proc/$${PPID}/mountinfo && ! grep TEST /proc/1/mountinfo'
Type=oneshot