diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 986985ad35..3bea4976b3 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1404,6 +1404,23 @@ logging. This does not affect commands prefixed with +. + + ProtectKernelModules= + + Takes a boolean argument. If true, explicit module loading will + be denied. This allows to turn off module load and unload operations on modular + kernels. It is recomended to turn this on for most services that do not need special + file systems or extra kernel modules to work. Default to off. Enabling this option + removes CAP_SYS_MODULE from the capability bounding set for + the unit, and installs a system call filter to block module system calls. + Note that limited automatic module loading due to user configuration or kernel + mapping tables might still happen as side effect of requested user operations, + both privileged and unprivileged. To disable module auto-load feature please see + sysctl.d5 + kernel.modules_disabled mechanism and + /proc/sys/kernel/modules_disabled documentation. + + Personality= diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index eec4500c8c..b8720d7d3d 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -708,6 +708,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1075,7 +1076,7 @@ int bus_exec_context_set_transient_property( "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", - "ProtectControlGroups")) { + "ProtectKernelModules", "ProtectControlGroups")) { int b; r = sd_bus_message_read(message, "b", &b); @@ -1111,6 +1112,8 @@ int bus_exec_context_set_transient_property( c->remove_ipc = b; else if (streq(name, "ProtectKernelTunables")) c->protect_kernel_tunables = b; + else if (streq(name, "ProtectKernelModules")) + c->protect_kernel_modules = b; else if (streq(name, "ProtectControlGroups")) c->protect_control_groups = b; diff --git a/src/core/execute.c b/src/core/execute.c index 0c983f4953..7a278b7d31 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1436,6 +1436,50 @@ finish: return r; } +static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { + static const int module_syscalls[] = { + SCMP_SYS(delete_module), + SCMP_SYS(finit_module), + SCMP_SYS(init_module), + }; + + scmp_filter_ctx *seccomp; + unsigned i; + int r; + + assert(c); + + /* Turn of module syscalls on ProtectKernelModules=yes */ + + if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) + return 0; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + for (i = 0; i < ELEMENTSOF(module_syscalls); i++) { + r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), + module_syscalls[i], 0); + if (r < 0) + goto finish; + } + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; +} + static int apply_private_devices(Unit *u, const ExecContext *c) { const SystemCallFilterSet *set; scmp_filter_ctx *seccomp; @@ -2690,6 +2734,14 @@ static int exec_child( } } + if (context->protect_kernel_modules) { + r = apply_protect_kernel_modules(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return r; + } + } + if (context->private_devices) { r = apply_private_devices(unit, context); if (r < 0) { diff --git a/src/core/execute.h b/src/core/execute.h index 449180c903..1de439c3ad 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -175,6 +175,7 @@ struct ExecContext { ProtectSystem protect_system; ProtectHome protect_home; bool protect_kernel_tunables; + bool protect_kernel_modules; bool protect_control_groups; bool no_new_privileges; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index c49c1d6732..a700d853cc 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -90,6 +90,7 @@ $1.InaccessiblePaths, config_parse_namespace_path_strv, 0, $1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp) $1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices) $1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables) +$1.ProtectKernelModules, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_modules) $1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups) $1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) $1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) diff --git a/src/core/unit.c b/src/core/unit.c index 690f7f7dd9..71f95c0b96 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3401,6 +3401,9 @@ int unit_patch_contexts(Unit *u) { if (ec->private_devices) ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_MKNOD); + if (ec->protect_kernel_modules) + ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE); + if (ec->dynamic_user) { if (!ec->user) { r = user_from_unit_name(u, &ec->user); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index a550a370b5..f639e0e832 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -204,7 +204,8 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", - "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) { + "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", + "ProtectKernelModules", "ProtectControlGroups")) { r = parse_boolean(eq); if (r < 0)