From 1beab8b0d0ff2d7d1436b52d4a0c3d56dc908962 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 10 Aug 2018 15:07:14 +0200
Subject: [PATCH 1/2] namespace: be more careful when handling namespacing
 failures gracefully

This makes two changes to the namespacing code:

1. We'll only gracefully skip service namespacing on access failure if
   exclusively sandboxing options were selected, and not mount-related
   options that result in a very different view of the world. For example,
   ignoring RootDirectory=, RootImage= or Bind= is really problematic,
   but ReadOnlyPaths= is just a weaker sandbox.

2. The namespacing code will now return a clearly recognizable error
   code when it cannot enforce its namespacing, so that we cannot
   confuse EPERM errors from mount() with those from unshare(). Only the
   errors from the first unshare() are now taken as a hint to gracefully
   disable namespacing.

Fixes: #9844 #9835
---
 src/core/execute.c   | 13 +++++++++----
 src/core/namespace.c |  9 +++++++++
 2 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/src/core/execute.c b/src/core/execute.c
index 67f6dbd600..ae90af9570 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -2386,10 +2386,15 @@ static int apply_mount_namespace(
 
         bind_mount_free_many(bind_mounts, n_bind_mounts);
 
-        /* If we couldn't set up the namespace this is probably due to a
-         * missing capability. In this case, silently proceeed. */
-        if (IN_SET(r, -EPERM, -EACCES)) {
-                log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
+        /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
+         * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
+         * sandboxing options were used, i.e. nothing such as RootDirectory= or BindPaths= that would result in a
+         * completely different execution environment. */
+        if (r == -ENOANO &&
+            n_bind_mounts == 0 && context->n_temporary_filesystems == 0 &&
+            !root_dir && !root_image &&
+            !context->dynamic_user) {
+                log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
                 return 0;
         }
 
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 3488758e82..62518e1c4c 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -1290,8 +1290,17 @@ int setup_namespace(
                 normalize_mounts(root, mounts, &n_mounts);
         }
 
+        /* All above is just preparation, figuring out what to do. Let's now actually start doing something. */
+
         if (unshare(CLONE_NEWNS) < 0) {
                 r = log_debug_errno(errno, "Failed to unshare the mount namespace: %m");
+                if (IN_SET(r, -EACCES, -EPERM, -EOPNOTSUPP, -ENOSYS))
+                        /* If the kernel doesn't support namespaces, or when there's a MAC or seccomp filter in place
+                         * that doesn't allow us to create namespaces (or a missing cap), then propagate a recognizable
+                         * error back, which the caller can use to detect this case (and only this) and optionally
+                         * continue without namespacing applied. */
+                        r = -ENOANO;
+
                 goto finish;
         }
 

From 2d2224e407c553d68c2b556f3abc8225f68ad803 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 10 Aug 2018 15:26:32 +0200
Subject: [PATCH 2/2] man: document that most sandboxing options are best
 effort only

---
 man/systemd.exec.xml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 0b650fc67a..4cee4a508a 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -750,6 +750,15 @@ CapabilityBoundingSet=~CAP_B CAP_C</programlisting>
   <refsect1>
     <title>Sandboxing</title>
 
+    <para>The following sandboxing options are an effective way to limit the exposure of the system towards the unit's
+    processes. It is recommended to turn on as many of these options for each unit as is possible without negatively
+    affecting the process' ability to operate. Note that many of these sandboxing features are gracefully turned off on
+    systems where the underlying security mechanism is not available. For example, <varname>ProtectSystem=</varname>
+    has no effect if the kernel is built without file system namespacing or if the service manager runs in a container
+    manager that makes file system namespacing unavailable to its payload. Similarly,
+    <varname>RestrictRealtime=</varname> has no effect on systems that lack support for SECCOMP system call filtering,
+    or in containers where support for this is turned off.</para>
+
     <variablelist>
 
       <varlistentry>