From 228af36fff15430825dddb64d6dc6eeb47491aae Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 1 Jun 2018 11:10:49 +0200 Subject: [PATCH 1/4] core: add new PrivateMounts= unit setting This new setting is supposed to be useful in most cases where "MountFlags=slave" is currently used, i.e. as an explicit way to run a service in its own mount namespace and decouple propagation from all mounts of the new mount namespace towards the host. The effect of MountFlags=slave and PrivateMounts=yes is mostly the same, as both cause a CLONE_NEWNS namespace to be opened, and both will result in all mounts within it to be mounted MS_SLAVE. The difference is mostly on the conceptual/philosophical level: configuring the propagation mode is nothing people should have to think about, in particular as the matter is not precisely easyto grok. Moreover, MountFlags= allows configuration of "private" and "slave" modes which don't really make much sense to use in real-life and are quite confusing. In particular PrivateMounts=private means mounts made on the host stay pinned for good by the service which is particularly nasty for removable media mount. And PrivateMounts=shared is in most ways a NOP when used a alone... The main technical difference between setting only MountFlags=slave or only PrivateMounts=yes in a unit file is that the former remounts all mounts to MS_SLAVE and leaves them there, while that latter remounts them to MS_SHARED again right after. The latter is generally a nicer approach, since it disables propagation, while MS_SHARED is afterwards in effect, which is really nice as that means further namespacing down the tree will get MS_SHARED logic by default and we unify how applications see our mounts as we always pass them as MS_SHARED regardless whether any mount namespacing is used or not. The effect of PrivateMounts=yes was implied already by all the other mount namespacing options. With this new option we add an explicit knob for it, to request it without any other option used as well. See: #4393 --- src/core/dbus-execute.c | 4 ++++ src/core/execute.c | 24 ++++++++++++++---------- src/core/execute.h | 1 + src/core/load-fragment-gperf.gperf.m4 | 1 + src/core/namespace.c | 5 ++--- src/core/namespace.h | 1 + src/shared/bus-unit-util.c | 2 +- 7 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 8c752ceaa6..747b9d8eeb 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -744,6 +744,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_bool, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1110,6 +1111,9 @@ int bus_exec_context_set_transient_property( if (streq(name, "PrivateDevices")) return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error); + if (streq(name, "PrivateMounts")) + return bus_set_transient_bool(u, name, &c->private_mounts, message, flags, error); + if (streq(name, "PrivateNetwork")) return bus_set_transient_bool(u, name, &c->private_network, message, flags, error); diff --git a/src/core/execute.c b/src/core/execute.c index 2c64e08176..6aa4ec9c78 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1780,6 +1780,7 @@ static bool exec_needs_mount_namespace( return true; if (context->private_devices || + context->private_mounts || context->protect_system != PROTECT_SYSTEM_NO || context->protect_home != PROTECT_HOME_NO || context->protect_kernel_tunables || @@ -2312,7 +2313,7 @@ static int apply_mount_namespace( _cleanup_strv_free_ char **empty_directories = NULL; char *tmp = NULL, *var = NULL; const char *root_dir = NULL, *root_image = NULL; - NamespaceInfo ns_info = {}; + NamespaceInfo ns_info; bool needs_sandboxing; BindMount *bind_mounts = NULL; size_t n_bind_mounts = 0; @@ -2342,16 +2343,7 @@ static int apply_mount_namespace( if (r < 0) return r; - /* - * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed - * sandbox info, otherwise enforce it, don't ignore protected paths and - * fail if we are enable to apply the sandbox inside the mount namespace. - */ - if (!context->dynamic_user && root_dir) - ns_info.ignore_protect_paths = true; - needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED); - if (needs_sandboxing) ns_info = (NamespaceInfo) { .ignore_protect_paths = false, @@ -2360,7 +2352,19 @@ static int apply_mount_namespace( .protect_kernel_tunables = context->protect_kernel_tunables, .protect_kernel_modules = context->protect_kernel_modules, .mount_apivfs = context->mount_apivfs, + .private_mounts = context->private_mounts, }; + else if (!context->dynamic_user && root_dir) + /* + * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed + * sandbox info, otherwise enforce it, don't ignore protected paths and + * fail if we are enable to apply the sandbox inside the mount namespace. + */ + ns_info = (NamespaceInfo) { + .ignore_protect_paths = true, + }; + else + ns_info = (NamespaceInfo) {}; r = setup_namespace(root_dir, root_image, &ns_info, context->read_write_paths, diff --git a/src/core/execute.h b/src/core/execute.h index ace94338e7..f434d10e7e 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -228,6 +228,7 @@ struct ExecContext { bool private_network; bool private_devices; bool private_users; + bool private_mounts; ProtectSystem protect_system; ProtectHome protect_home; bool protect_kernel_tunables; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 44c9978c54..15fb47838c 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -114,6 +114,7 @@ $1.ProtectKernelModules, config_parse_bool, 0, $1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups) $1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) $1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) +$1.PrivateMounts, config_parse_bool, 0, offsetof($1, exec_context.private_mounts) $1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context.protect_system) $1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context.protect_home) $1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context.mount_flags) diff --git a/src/core/namespace.c b/src/core/namespace.c index 24da3b8a64..2523c2a47f 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1133,9 +1133,9 @@ int setup_namespace( _cleanup_free_ void *root_hash = NULL; MountEntry *m, *mounts = NULL; size_t root_hash_size = 0; - bool make_slave = false; const char *root; size_t n_mounts; + bool make_slave; bool require_prefix = false; int r = 0; @@ -1200,8 +1200,7 @@ int setup_namespace( protect_home, protect_system); /* Set mount slave mode */ - if (root || n_mounts > 0) - make_slave = true; + make_slave = root || n_mounts > 0 || ns_info->private_mounts; if (n_mounts > 0) { m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry)); diff --git a/src/core/namespace.h b/src/core/namespace.h index 705eb4e13a..e0e8e09e0f 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -50,6 +50,7 @@ typedef enum ProtectSystem { struct NamespaceInfo { bool ignore_protect_paths:1; bool private_dev:1; + bool private_mounts:1; bool protect_control_groups:1; bool protect_kernel_tunables:1; bool protect_kernel_modules:1; diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 64b7ac8d69..01d820349a 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -699,7 +699,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con if (STR_IN_SET(field, "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "TTYVTDisallocate", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", - "NoNewPrivileges", "SyslogLevelPrefix", + "PrivateMounts", "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS", "CPUSchedulingResetOnFork", "LockPersonality")) From 2f2e14b251b9929e84e8b690d0187b766dfbae20 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 1 Jun 2018 11:23:51 +0200 Subject: [PATCH 2/4] man: document the new PrivateMounts= setting Also, extend the documentation on MountFlags= substantially, hopefully addressing all the questions of #4393 Fixes: #4393 --- man/systemd.exec.xml | 79 +++++++++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 19 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 2e01326bb9..de4c53c475 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1277,28 +1277,69 @@ RestrictNamespaces=~cgroup net stopped. This setting is implied if DynamicUser= is set. + + PrivateMounts= + + Takes a boolean parameter. If set, the processes of this unit will be run in their own private + file system (mount) namespace with all mount propagation from the processes towards the host's main file system + namespace turned off. This means any file system mount points established or removed by the unit's processes + will be private to them and not be visible to the host. However, file system mount points established or + removed on the host will be propagated to the unit's processes. See mount_namespaces7 for + details on file system namespaces. Defaults to off. + + When turned on, this executes three operations for each invoked process: a new + CLONE_NEWNS namespace is created, after which all existing mounts are remounted to + MS_SLAVE to disable propagation from the unit's processes to the host (but leaving + propagation in the opposite direction in effect). Finally, the mounts are remounted again to the propagation + mode configured with MountFlags=, see below. + + File system namespaces are set up individually for each process forked off by the service manager. Mounts + established in the namespace of the process created by ExecStartPre= will hence be cleaned + up automatically as soon as that process exits and will not be available to subsequent processes forked off for + ExecStart= (and similar applies to the various other commands configured for + units). Similarly, JoinsNamespaceOf= does not permit sharing kernel mount namespaces between + units, it only enables sharing of the /tmp/ and /var/tmp/ + directories. + + Other file system namespace unit settings — PrivateMounts=, + PrivateTmp=, PrivateDevices=, ProtectSystem=, + ProtectHome=, ReadOnlyPaths=, InaccessiblePaths=, + ReadWritePaths=, … — also enable file system namespacing in a fashion equivalent to this + option. Hence it is primarily useful to explicitly request this behaviour if none of the other settings are + used. + + MountFlags= - Takes a mount propagation flag: , or - , which control whether mounts in the file system namespace set up for this unit's - processes will receive or propagate mounts and unmounts. See mount2 for - details. Defaults to . Use to ensure that mounts and unmounts - are propagated from systemd's namespace to the service's namespace and vice versa. Use - to run processes so that none of their mounts and unmounts will propagate to the host. Use - to also ensure that no mounts and unmounts from the host will propagate into the unit - processes' namespace. If this is set to or , any mounts created - by spawned processes will be unmounted after the completion of the current command line of - ExecStartPre=, ExecStartPost=, ExecStart=, and - ExecStopPost=. Note that means that file systems mounted on the host - might stay mounted continuously in the unit's namespace, and thus keep the device busy. Note that the file - system namespace related options (PrivateTmp=, PrivateDevices=, - ProtectSystem=, ProtectHome=, ProtectKernelTunables=, - ProtectControlGroups=, ReadOnlyPaths=, - InaccessiblePaths=, ReadWritePaths=) require that mount and unmount - propagation from the unit's file system namespace is disabled, and hence downgrade to - . + Takes a mount propagation setting: , or + , which controls whether file system mount points in the file system namespaces set up + for this unit's processes will receive or propagate mounts and unmounts from other file system namespaces. See + mount2 + for details on mount propagation, and the three propagation flags in particular. + + This setting only controls the final propagation setting in effect on all mount + points of the file system namespace created for each process of this unit. Other file system namespacing unit + settings (see the discussion in PrivateMounts= above) will implicitly disable mount and + unmount propagation from the unit's processes towards the host by changing the propagation setting of all mount + points in the unit's file system namepace to first. Setting this option to + does not reestablish propagation in that case. Conversely, if this option is set, but + no other file system namespace setting is used, then new file system namespaces will be created for the unit's + processes and this propagation flag will be applied right away to all mounts within it, without the + intermediary application of . + + If not set – but file system namespaces are enabled through another file system namespace unit setting – + mount propagation is used, but — as mentioned — as is applied + first, propagation from the unit's processes to the host is still turned off. + + It is not recommended to to use mount propagation for units, as this means + temporary mounts (such as removable media) of the host will stay mounted and thus indefinitely busy in forked + off processes, as unmount propagation events won't be received by the file system namespace of the unit. + + Usually, it is best to leave this setting unmodified, and use higher level file system namespacing + options instead, in particular PrivateMounts=, see above. + From b2e8ae7380d009ab9f9260a34e251ac5990b01ca Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 1 Jun 2018 11:24:40 +0200 Subject: [PATCH 3/4] units: switch udev service to use PrivateMounts=yes Given that PrivateMounts=yes is the "successor" to MountFlags=slave in unit files, let's make use of it for udevd. --- units/systemd-udevd.service.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/units/systemd-udevd.service.in b/units/systemd-udevd.service.in index 8557522e7b..2b9fa69d9b 100644 --- a/units/systemd-udevd.service.in +++ b/units/systemd-udevd.service.in @@ -25,7 +25,7 @@ ExecStart=@rootlibexecdir@/systemd-udevd KillMode=mixed WatchdogSec=3min TasksMax=infinity -MountFlags=slave +PrivateMounts=yes MemoryDenyWriteExecute=yes RestrictRealtime=yes RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6 From c49a7cbd63ef9e08c4e51a2fc71736b450501828 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 12 Jun 2018 16:26:36 +0200 Subject: [PATCH 4/4] update NEWS with new PrivateMounts= blurb --- NEWS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/NEWS b/NEWS index 1e61189c38..3ad0464132 100644 --- a/NEWS +++ b/NEWS @@ -279,6 +279,15 @@ CHANGES WITH 239 in spe: query the default, built-in $PATH PID 1 will pass to the services it manages. + * A new unit file setting PrivateMounts= has been added. It's a boolean + option. If enabled the unit's processes are invoked in their own file + system namespace. Note that this behaviour is also implied if any + other file system namespacing options (such as PrivateTmp=, + PrivateDevices=, ProtectSystem=, …) are used. This option is hence + primarily useful for services that do not use any of the other file + system namespacing options. One such service is systemd-udevd.service + wher this is now used by default. + Contributions from: Adam Duskett, Alan Jenkins, Alessandro Casale, Alexander Kurtz, Alex Gartrell, Anssi Hannula, Antique, Arnaud Rebillout, Brian J. Murrell, Bruno Vernay, Chris Lesiak, Christian