diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 0b650fc67a..4cee4a508a 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -750,6 +750,15 @@ CapabilityBoundingSet=~CAP_B CAP_C Sandboxing + The following sandboxing options are an effective way to limit the exposure of the system towards the unit's + processes. It is recommended to turn on as many of these options for each unit as is possible without negatively + affecting the process' ability to operate. Note that many of these sandboxing features are gracefully turned off on + systems where the underlying security mechanism is not available. For example, ProtectSystem= + has no effect if the kernel is built without file system namespacing or if the service manager runs in a container + manager that makes file system namespacing unavailable to its payload. Similarly, + RestrictRealtime= has no effect on systems that lack support for SECCOMP system call filtering, + or in containers where support for this is turned off. + diff --git a/src/core/execute.c b/src/core/execute.c index 881c73595f..501b367eae 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2396,10 +2396,15 @@ static int apply_mount_namespace( bind_mount_free_many(bind_mounts, n_bind_mounts); - /* If we couldn't set up the namespace this is probably due to a - * missing capability. In this case, silently proceeed. */ - if (IN_SET(r, -EPERM, -EACCES)) { - log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m"); + /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports + * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively + * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a + * completely different execution environment. 
*/ + if (r == -ENOANO && + n_bind_mounts == 0 && context->n_temporary_filesystems == 0 && + !root_dir && !root_image && + !context->dynamic_user) { + log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring."); return 0; } diff --git a/src/core/namespace.c b/src/core/namespace.c index 9031061df8..c164bc5793 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1313,8 +1313,17 @@ int setup_namespace( normalize_mounts(root, mounts, &n_mounts); } + /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */ + if (unshare(CLONE_NEWNS) < 0) { r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m"); + if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS)) + /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter in place + * that doesn't allow us to create namespaces (or a missing cap), then propagate a recognizable + * error back, which the caller can use to detect this case (and only this) and optionally + * continue without namespacing applied. */ + r = -ENOANO; + goto finish; }