From 502d704e5ed2d288069471f4e3611115cde107d6 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 12 Oct 2016 13:31:21 +0200 Subject: [PATCH 1/9] core:sandbox: Add ProtectKernelModules= option This is useful to turn off explicit module load and unload operations on modular kernels. This option removes CAP_SYS_MODULE from the capability bounding set for the unit, and installs a system call filter to block module system calls. This option will not prevent the kernel from loading modules using the module auto-load feature which is a system wide operation. --- man/systemd.exec.xml | 17 +++++++++ src/core/dbus-execute.c | 5 ++- src/core/execute.c | 52 +++++++++++++++++++++++++++ src/core/execute.h | 1 + src/core/load-fragment-gperf.gperf.m4 | 1 + src/core/unit.c | 3 ++ src/shared/bus-unit-util.c | 3 +- 7 files changed, 80 insertions(+), 2 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 986985ad35..3bea4976b3 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1404,6 +1404,23 @@ logging. This does not affect commands prefixed with +. + + ProtectKernelModules= + + Takes a boolean argument. If true, explicit module loading will + be denied. This allows to turn off module load and unload operations on modular + kernels. It is recomended to turn this on for most services that do not need special + file systems or extra kernel modules to work. Default to off. Enabling this option + removes CAP_SYS_MODULE from the capability bounding set for + the unit, and installs a system call filter to block module system calls. + Note that limited automatic module loading due to user configuration or kernel + mapping tables might still happen as side effect of requested user operations, + both privileged and unprivileged. To disable module auto-load feature please see + sysctl.d5 + kernel.modules_disabled mechanism and + /proc/sys/kernel/modules_disabled documentation. 
+ + Personality= diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index eec4500c8c..b8720d7d3d 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -708,6 +708,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1075,7 +1076,7 @@ int bus_exec_context_set_transient_property( "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", - "ProtectControlGroups")) { + "ProtectKernelModules", "ProtectControlGroups")) { int b; r = sd_bus_message_read(message, "b", &b); @@ -1111,6 +1112,8 @@ int bus_exec_context_set_transient_property( c->remove_ipc = b; else if (streq(name, "ProtectKernelTunables")) c->protect_kernel_tunables = b; + else if (streq(name, "ProtectKernelModules")) + c->protect_kernel_modules = b; else if (streq(name, "ProtectControlGroups")) c->protect_control_groups = b; diff --git a/src/core/execute.c b/src/core/execute.c index 
0c983f4953..7a278b7d31 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1436,6 +1436,50 @@ finish: return r; } +static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { + static const int module_syscalls[] = { + SCMP_SYS(delete_module), + SCMP_SYS(finit_module), + SCMP_SYS(init_module), + }; + + scmp_filter_ctx *seccomp; + unsigned i; + int r; + + assert(c); + + /* Turn of module syscalls on ProtectKernelModules=yes */ + + if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) + return 0; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + for (i = 0; i < ELEMENTSOF(module_syscalls); i++) { + r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), + module_syscalls[i], 0); + if (r < 0) + goto finish; + } + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; +} + static int apply_private_devices(Unit *u, const ExecContext *c) { const SystemCallFilterSet *set; scmp_filter_ctx *seccomp; @@ -2690,6 +2734,14 @@ static int exec_child( } } + if (context->protect_kernel_modules) { + r = apply_protect_kernel_modules(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return r; + } + } + if (context->private_devices) { r = apply_private_devices(unit, context); if (r < 0) { diff --git a/src/core/execute.h b/src/core/execute.h index 449180c903..1de439c3ad 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -175,6 +175,7 @@ struct ExecContext { ProtectSystem protect_system; ProtectHome protect_home; bool protect_kernel_tunables; + bool protect_kernel_modules; bool protect_control_groups; bool no_new_privileges; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index c49c1d6732..a700d853cc 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ 
b/src/core/load-fragment-gperf.gperf.m4 @@ -90,6 +90,7 @@ $1.InaccessiblePaths, config_parse_namespace_path_strv, 0, $1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp) $1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices) $1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables) +$1.ProtectKernelModules, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_modules) $1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups) $1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) $1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) diff --git a/src/core/unit.c b/src/core/unit.c index 690f7f7dd9..71f95c0b96 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3401,6 +3401,9 @@ int unit_patch_contexts(Unit *u) { if (ec->private_devices) ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_MKNOD); + if (ec->protect_kernel_modules) + ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE); + if (ec->dynamic_user) { if (!ec->user) { r = user_from_unit_name(u, &ec->user); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index a550a370b5..f639e0e832 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -204,7 +204,8 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", - "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) { + "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", + "ProtectKernelModules", "ProtectControlGroups")) { r = parse_boolean(eq); if (r < 0) From 3ae33295f00be5e2836f009bf1991b0caddf80b7 Mon 
Sep 17 00:00:00 2001 From: Djalal Harouni Date: Fri, 7 Oct 2016 19:17:34 +0200 Subject: [PATCH 2/9] test: add capability tests for ProtectKernelModules= This just adds capabilities test. --- Makefile.am | 2 ++ src/test/test-execute.c | 11 +++++++++++ .../exec-protectkernelmodules-no-capabilities.service | 7 +++++++ ...exec-protectkernelmodules-yes-capabilities.service | 7 +++++++ 4 files changed, 27 insertions(+) create mode 100644 test/test-execute/exec-protectkernelmodules-no-capabilities.service create mode 100644 test/test-execute/exec-protectkernelmodules-yes-capabilities.service diff --git a/Makefile.am b/Makefile.am index b09b0cf167..1ea25bb688 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1639,6 +1639,8 @@ EXTRA_DIST += \ test/test-execute/exec-privatedevices-yes.service \ test/test-execute/exec-privatedevices-no-capability-mknod.service \ test/test-execute/exec-privatedevices-yes-capability-mknod.service \ + test/test-execute/exec-protectkernelmodules-no-capabilities.service \ + test/test-execute/exec-protectkernelmodules-yes-capabilities.service \ test/test-execute/exec-privatetmp-no.service \ test/test-execute/exec-privatetmp-yes.service \ test/test-execute/exec-readonlypaths.service \ diff --git a/src/test/test-execute.c b/src/test/test-execute.c index 8b4ff22495..f7d38fb0f3 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -142,6 +142,16 @@ static void test_exec_privatedevices_capabilities(Manager *m) { test(m, "exec-privatedevices-no-capability-mknod.service", 0, CLD_EXITED); } +static void test_exec_protectkernelmodules_capabilities(Manager *m) { + if (detect_container() > 0) { + log_notice("testing in container, skipping protectkernelmodules tests"); + return; + } + + test(m, "exec-protectkernelmodules-no-capabilities.service", 0, CLD_EXITED); + test(m, "exec-protectkernelmodules-yes-capabilities.service", 0, CLD_EXITED); +} + static void test_exec_readonlypaths(Manager *m) { test(m, "exec-readonlypaths.service", 0, CLD_EXITED); 
test(m, "exec-readonlypaths-mount-propagation.service", 0, CLD_EXITED); @@ -368,6 +378,7 @@ int main(int argc, char *argv[]) { test_exec_privatetmp, test_exec_privatedevices, test_exec_privatedevices_capabilities, + test_exec_protectkernelmodules_capabilities, test_exec_readonlypaths, test_exec_readwritepaths, test_exec_inaccessiblepaths, diff --git a/test/test-execute/exec-protectkernelmodules-no-capabilities.service b/test/test-execute/exec-protectkernelmodules-no-capabilities.service new file mode 100644 index 0000000000..b2f2cd6b8a --- /dev/null +++ b/test/test-execute/exec-protectkernelmodules-no-capabilities.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test CAP_SYS_MODULE ProtectKernelModules=no + +[Service] +ProtectKernelModules=no +ExecStart=/bin/sh -x -c 'capsh --print | grep cap_sys_module' +Type=oneshot diff --git a/test/test-execute/exec-protectkernelmodules-yes-capabilities.service b/test/test-execute/exec-protectkernelmodules-yes-capabilities.service new file mode 100644 index 0000000000..84bf39be56 --- /dev/null +++ b/test/test-execute/exec-protectkernelmodules-yes-capabilities.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test CAP_SYS_MODULE for ProtectKernelModules=yes + +[Service] +ProtectKernelModules=yes +ExecStart=/bin/sh -x -c '! capsh --print | grep cap_sys_module' +Type=oneshot From 2cd0a735470894bd2d25147442285744764633a1 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Fri, 7 Oct 2016 20:38:05 +0200 Subject: [PATCH 3/9] core:sandbox: remove CAP_SYS_RAWIO on PrivateDevices=yes The rawio system calls were filtered, but CAP_SYS_RAWIO allows to access raw data through /proc, ioctl and some other exotic system calls... --- man/systemd.exec.xml | 4 ++-- src/core/unit.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 3bea4976b3..c46c0f6dd8 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -946,8 +946,8 @@ /dev/port and others. 
This is useful to securely turn off physical device access by the executed process. Defaults to false. Enabling this option will install a system call filter to block low-level I/O system calls that are grouped in the @raw-io set, will also remove - CAP_MKNOD from the capability bounding set for the unit (see above), and set - DevicePolicy=closed (see + CAP_MKNOD and CAP_SYS_RAWIO from the capability bounding set for + the unit (see above), and set DevicePolicy=closed (see systemd.resource-control5 for details). Note that using this setting will disconnect propagation of mounts from the service to the host (propagation in the opposite direction continues to work). This means that this setting may not be used for diff --git a/src/core/unit.c b/src/core/unit.c index 71f95c0b96..67668bdc48 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3399,7 +3399,7 @@ int unit_patch_contexts(Unit *u) { ec->no_new_privileges = true; if (ec->private_devices) - ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_MKNOD); + ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO)); if (ec->protect_kernel_modules) ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE); From 625d8769fa6394a302b024eaee45043e6eb0c87a Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Fri, 7 Oct 2016 20:41:38 +0200 Subject: [PATCH 4/9] test: add test to make sure that CAP_SYS_RAWIO was removed on PrivateDevices=yes --- src/test/test-execute.c | 2 ++ .../exec-privatedevices-no-capability-sys-rawio.service | 7 +++++++ .../exec-privatedevices-yes-capability-sys-rawio.service | 7 +++++++ 3 files changed, 16 insertions(+) create mode 100644 test/test-execute/exec-privatedevices-no-capability-sys-rawio.service create mode 100644 test/test-execute/exec-privatedevices-yes-capability-sys-rawio.service diff --git a/src/test/test-execute.c b/src/test/test-execute.c index f7d38fb0f3..1eade98ed3 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ 
-140,6 +140,8 @@ static void test_exec_privatedevices_capabilities(Manager *m) { } test(m, "exec-privatedevices-yes-capability-mknod.service", 0, CLD_EXITED); test(m, "exec-privatedevices-no-capability-mknod.service", 0, CLD_EXITED); + test(m, "exec-privatedevices-yes-capability-sys-rawio.service", 0, CLD_EXITED); + test(m, "exec-privatedevices-no-capability-sys-rawio.service", 0, CLD_EXITED); } static void test_exec_protectkernelmodules_capabilities(Manager *m) { diff --git a/test/test-execute/exec-privatedevices-no-capability-sys-rawio.service b/test/test-execute/exec-privatedevices-no-capability-sys-rawio.service new file mode 100644 index 0000000000..e7f529c44c --- /dev/null +++ b/test/test-execute/exec-privatedevices-no-capability-sys-rawio.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test CAP_SYS_RAWIO capability for PrivateDevices=no + +[Service] +PrivateDevices=no +ExecStart=/bin/sh -x -c 'capsh --print | grep cap_sys_rawio' +Type=oneshot diff --git a/test/test-execute/exec-privatedevices-yes-capability-sys-rawio.service b/test/test-execute/exec-privatedevices-yes-capability-sys-rawio.service new file mode 100644 index 0000000000..cebc493a7a --- /dev/null +++ b/test/test-execute/exec-privatedevices-yes-capability-sys-rawio.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test CAP_SYS_RAWIO capability for PrivateDevices=yes + +[Service] +PrivateDevices=yes +ExecStart=/bin/sh -x -c '! capsh --print | grep cap_sys_rawio' +Type=oneshot From ac246d9868bd476297e2702e0a7ef52294f9cfa8 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Sat, 8 Oct 2016 17:48:35 +0200 Subject: [PATCH 5/9] doc: minor hint about InaccessiblePaths= in regard of ProtectKernelTunables= --- man/systemd.exec.xml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index c46c0f6dd8..4a68695348 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1046,7 +1046,10 @@ boot-time, with the sysctl.d5 mechanism. 
Almost no services need to write to these at runtime; it is hence recommended to turn this on for most services. For this setting the same restrictions regarding mount propagation and privileges apply as for - ReadOnlyPaths= and related calls, see above. Defaults to off. + ReadOnlyPaths= and related calls, see above. Defaults to off. + Note that this option does not prevent kernel tuning through IPC interfaces and external programs. However + InaccessiblePaths= can be used to make some IPC file system objects + inaccessible. From c575770b75b6cd15684fbacd249147bf5fd6ead7 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 12 Oct 2016 14:11:16 +0200 Subject: [PATCH 6/9] core:sandbox: lets make /lib/modules/ inaccessible on ProtectKernelModules= Let's go further and make /lib/modules/ inaccessible for services that do not have business with modules, this is a minor improvement but it may help on setups with custom modules and they are limited... in regard of kernel auto-load feature. This change introduces NameSpaceInfo struct which we may embed later inside ExecContext but for now let's just reduce the argument number to setup_namespace() and merge ProtectKernelModules feature. --- man/systemd.exec.xml | 5 +++- src/core/execute.c | 11 ++++++--- src/core/namespace.c | 54 ++++++++++++++++++++++++++++++-------------- src/core/namespace.h | 14 +++++++++--- src/test/test-ns.c | 12 +++++++--- 5 files changed, 69 insertions(+), 27 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 4a68695348..249fcb0363 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1415,7 +1415,10 @@ kernels. It is recomended to turn this on for most services that do not need special file systems or extra kernel modules to work. Default to off. Enabling this option removes CAP_SYS_MODULE from the capability bounding set for - the unit, and installs a system call filter to block module system calls.
+ the unit, and installs a system call filter to block module system calls, + also /usr/lib/modules is made inaccessible. For this + setting the same restrictions regarding mount propagation and privileges + apply as for ReadOnlyPaths= and related calls, see above. Note that limited automatic module loading due to user configuration or kernel mapping tables might still happen as side effect of requested user operations, both privileged and unprivileged. To disable module auto-load feature please see diff --git a/src/core/execute.c b/src/core/execute.c index 7a278b7d31..dc078d96f0 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1766,6 +1766,7 @@ static bool exec_needs_mount_namespace( context->protect_system != PROTECT_SYSTEM_NO || context->protect_home != PROTECT_HOME_NO || context->protect_kernel_tunables || + context->protect_kernel_modules || context->protect_control_groups) return true; @@ -2493,6 +2494,12 @@ static int exec_child( if (needs_mount_namespace) { _cleanup_free_ char **rw = NULL; char *tmp = NULL, *var = NULL; + NameSpaceInfo ns_info = { + .private_dev = context->private_devices, + .protect_control_groups = context->protect_control_groups, + .protect_kernel_tunables = context->protect_kernel_tunables, + .protect_kernel_modules = context->protect_kernel_modules, + }; /* The runtime struct only contains the parent * of the private /tmp, which is @@ -2515,14 +2522,12 @@ static int exec_child( r = setup_namespace( (params->flags & EXEC_APPLY_CHROOT) ? 
context->root_directory : NULL, + &ns_info, rw, context->read_only_paths, context->inaccessible_paths, tmp, var, - context->private_devices, - context->protect_kernel_tunables, - context->protect_control_groups, context->protect_home, context->protect_system, context->mount_flags); diff --git a/src/core/namespace.c b/src/core/namespace.c index 43a2f4ba6e..1195e9a854 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -97,6 +97,14 @@ static const TargetMount protect_kernel_tunables_table[] = { { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */ }; +/* ProtectKernelModules= option */ +static const TargetMount protect_kernel_modules_table[] = { +#ifdef HAVE_SPLIT_USR + { "/lib/modules", INACCESSIBLE, true }, +#endif + { "/usr/lib/modules", INACCESSIBLE, true }, +}; + /* * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of * system should be protected by ProtectSystem= @@ -207,6 +215,13 @@ static int append_protect_kernel_tunables(BindMount **p, const char *root_direct ELEMENTSOF(protect_kernel_tunables_table)); } +static int append_protect_kernel_modules(BindMount **p, const char *root_directory) { + assert(p); + + return append_target_mounts(p, root_directory, protect_kernel_modules_table, + ELEMENTSOF(protect_kernel_modules_table)); +} + static int append_protect_home(BindMount **p, const char *root_directory, ProtectHome protect_home) { int r = 0; @@ -660,14 +675,12 @@ static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned } static unsigned namespace_calculate_mounts( + const NameSpaceInfo *ns_info, char** read_write_paths, char** read_only_paths, char** inaccessible_paths, const char* tmp_dir, const char* var_tmp_dir, - bool private_dev, - bool protect_sysctl, - bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system) { @@ -690,22 +703,21 @@ static unsigned namespace_calculate_mounts( strv_length(read_write_paths) + 
strv_length(read_only_paths) + strv_length(inaccessible_paths) + - private_dev + - (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + - (protect_cgroups ? 1 : 0) + + ns_info->private_dev + + (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + + (ns_info->protect_control_groups ? 1 : 0) + + (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) + protect_home_cnt + protect_system_cnt; } int setup_namespace( const char* root_directory, + const NameSpaceInfo *ns_info, char** read_write_paths, char** read_only_paths, char** inaccessible_paths, const char* tmp_dir, const char* var_tmp_dir, - bool private_dev, - bool protect_sysctl, - bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system, unsigned long mount_flags) { @@ -718,13 +730,12 @@ int setup_namespace( if (mount_flags == 0) mount_flags = MS_SHARED; - n = namespace_calculate_mounts(read_write_paths, + n = namespace_calculate_mounts(ns_info, + read_write_paths, read_only_paths, inaccessible_paths, tmp_dir, var_tmp_dir, - private_dev, protect_sysctl, - protect_cgroups, protect_home, - protect_system); + protect_home, protect_system); /* Set mount slave mode */ if (root_directory || n > 0) @@ -756,16 +767,25 @@ int setup_namespace( m++; } - if (private_dev) { + if (ns_info->private_dev) { m->path = prefix_roota(root_directory, "/dev"); m->mode = PRIVATE_DEV; m++; } - if (protect_sysctl) - append_protect_kernel_tunables(&m, root_directory); + if (ns_info->protect_kernel_tunables) { + r = append_protect_kernel_tunables(&m, root_directory); + if (r < 0) + return r; + } - if (protect_cgroups) { + if (ns_info->protect_kernel_modules) { + r = append_protect_kernel_modules(&m, root_directory); + if (r < 0) + return r; + } + + if (ns_info->protect_control_groups) { m->path = prefix_roota(root_directory, "/sys/fs/cgroup"); m->mode = READONLY; m++; diff --git a/src/core/namespace.h b/src/core/namespace.h index 
6505bcc499..6310638e9a 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -4,6 +4,7 @@ This file is part of systemd. Copyright 2010 Lennart Poettering + Copyright 2016 Djalal Harouni systemd is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by @@ -19,6 +20,8 @@ along with systemd; If not, see . ***/ +typedef struct NameSpaceInfo NameSpaceInfo; + #include #include "macro.h" @@ -40,15 +43,20 @@ typedef enum ProtectSystem { _PROTECT_SYSTEM_INVALID = -1 } ProtectSystem; +struct NameSpaceInfo { + bool private_dev:1; + bool protect_control_groups:1; + bool protect_kernel_tunables:1; + bool protect_kernel_modules:1; +}; + int setup_namespace(const char *chroot, + const NameSpaceInfo *ns_info, char **read_write_paths, char **read_only_paths, char **inaccessible_paths, const char *tmp_dir, const char *var_tmp_dir, - bool private_dev, - bool protect_sysctl, - bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system, unsigned long mount_flags); diff --git a/src/test/test-ns.c b/src/test/test-ns.c index c4d4da6d05..da7a8b0565 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -45,6 +45,14 @@ int main(int argc, char *argv[]) { "/home/lennart/projects", NULL }; + + static const NameSpaceInfo ns_info = { + .private_dev = true, + .protect_control_groups = true, + .protect_kernel_tunables = true, + .protect_kernel_modules = true, + }; + char *root_directory; char *projects_directory; int r; @@ -69,14 +77,12 @@ int main(int argc, char *argv[]) { log_info("Not chrooted"); r = setup_namespace(root_directory, + &ns_info, (char **) writable, (char **) readonly, (char **) inaccessible, tmp_dir, var_tmp_dir, - true, - true, - true, PROTECT_HOME_NO, PROTECT_SYSTEM_NO, 0); From 4084e8fc8947566092fd4ee5a07405570fdbf84d Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Sun, 9 Oct 2016 12:28:25 +0200 Subject: [PATCH 7/9] core: check protect_kernel_modules and 
private_devices in order to setup NNP --- src/core/execute.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/execute.c b/src/core/execute.c index dc078d96f0..71439bc3c2 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2115,6 +2115,8 @@ static bool context_has_no_new_privileges(const ExecContext *c) { c->memory_deny_write_execute || c->restrict_realtime || c->protect_kernel_tunables || + c->protect_kernel_modules || + c->private_devices || context_has_syscall_filters(c); } From e66a2f658b182b1fe8e4bc46b384b9967abd2bf2 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Sun, 9 Oct 2016 12:31:51 +0200 Subject: [PATCH 8/9] core: make sure to dump ProtectKernelModules= value --- src/core/execute.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/execute.c b/src/core/execute.c index 71439bc3c2..869522704a 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -3190,6 +3190,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { "%sPrivateTmp: %s\n" "%sPrivateDevices: %s\n" "%sProtectKernelTunables: %s\n" + "%sProtectKernelModules: %s\n" "%sProtectControlGroups: %s\n" "%sPrivateNetwork: %s\n" "%sPrivateUsers: %s\n" @@ -3205,6 +3206,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { prefix, yes_no(c->private_tmp), prefix, yes_no(c->private_devices), prefix, yes_no(c->protect_kernel_tunables), + prefix, yes_no(c->protect_kernel_modules), prefix, yes_no(c->protect_control_groups), prefix, yes_no(c->private_network), prefix, yes_no(c->private_users), From 4982dbcc300d4599aa6ac143e922d6fbee31a860 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Sun, 9 Oct 2016 12:38:45 +0200 Subject: [PATCH 9/9] test: add test to make sure that ProtectKernelModules=yes disconnect mount propagation --- Makefile.am | 1 + src/test/test-execute.c | 5 +++-- ...exec-protectkernelmodules-yes-mount-propagation.service | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 
test/test-execute/exec-protectkernelmodules-yes-mount-propagation.service diff --git a/Makefile.am b/Makefile.am index 1ea25bb688..4a69236090 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1641,6 +1641,7 @@ EXTRA_DIST += \ test/test-execute/exec-privatedevices-yes-capability-mknod.service \ test/test-execute/exec-protectkernelmodules-no-capabilities.service \ test/test-execute/exec-protectkernelmodules-yes-capabilities.service \ + test/test-execute/exec-protectkernelmodules-yes-mount-propagation.service \ test/test-execute/exec-privatetmp-no.service \ test/test-execute/exec-privatetmp-yes.service \ test/test-execute/exec-readonlypaths.service \ diff --git a/src/test/test-execute.c b/src/test/test-execute.c index 1eade98ed3..e8ff02adaf 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -144,7 +144,7 @@ static void test_exec_privatedevices_capabilities(Manager *m) { test(m, "exec-privatedevices-no-capability-sys-rawio.service", 0, CLD_EXITED); } -static void test_exec_protectkernelmodules_capabilities(Manager *m) { +static void test_exec_protectkernelmodules(Manager *m) { if (detect_container() > 0) { log_notice("testing in container, skipping protectkernelmodules tests"); return; @@ -152,6 +152,7 @@ static void test_exec_protectkernelmodules_capabilities(Manager *m) { test(m, "exec-protectkernelmodules-no-capabilities.service", 0, CLD_EXITED); test(m, "exec-protectkernelmodules-yes-capabilities.service", 0, CLD_EXITED); + test(m, "exec-protectkernelmodules-yes-mount-propagation.service", 0, CLD_EXITED); } static void test_exec_readonlypaths(Manager *m) { @@ -380,7 +381,7 @@ int main(int argc, char *argv[]) { test_exec_privatetmp, test_exec_privatedevices, test_exec_privatedevices_capabilities, - test_exec_protectkernelmodules_capabilities, + test_exec_protectkernelmodules, test_exec_readonlypaths, test_exec_readwritepaths, test_exec_inaccessiblepaths, diff --git a/test/test-execute/exec-protectkernelmodules-yes-mount-propagation.service 
b/test/test-execute/exec-protectkernelmodules-yes-mount-propagation.service new file mode 100644 index 0000000000..e438783df3 --- /dev/null +++ b/test/test-execute/exec-protectkernelmodules-yes-mount-propagation.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test to make sure that passing ProtectKernelModules=yes disconnect mount propagation + +[Service] +ProtectKernelModules=yes +ExecStart=/bin/sh -x -c 'mkdir -p /TEST; mount -t tmpfs tmpfs /TEST; grep TEST /proc/self/mountinfo && ! grep TEST /proc/$${PPID}/mountinfo && ! grep TEST /proc/1/mountinfo' +Type=oneshot