core: add a per-unit setting MountAPIVFS= for mounting /dev, /proc, /sys in conjunction with RootDirectory=

This adds a boolean unit file setting MountAPIVFS=. If set, the three main API VFS mounts will be mounted for the service. This only has an effect in conjunction with RootDirectory=, which it makes a lot more useful. (This is basically the /dev + /proc + /sys mounting code posted in the original #4727, but rebased on current git, and with the automatic logic replaced by explicit logic controlled by a unit file setting.)
2016-12-22 23:34:35 +01:00 · 2016-12-22 23:34:35 +01:00 · 5d997827e2
parent 1eb7e08e20
commit 5d997827e2
8 changed files with 129 additions and 16 deletions
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@ -132,8 +132,22 @@
        the <function>chroot()</function> jail. Note that setting this parameter might result in additional
        dependencies to be added to the unit (see above).</para>

-        <para>The <varname>PrivateUsers=</varname> setting is particularly useful in conjunction with
-        <varname>RootDirectory=</varname>. For details, see below.</para></listitem>
+        <para>The <varname>MountAPIVFS=</varname> and <varname>PrivateUsers=</varname> settings are particularly useful
+        in conjunction with <varname>RootDirectory=</varname>. For details, see below.</para></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>MountAPIVFS=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If on, a private mount namespace for the unit's processes is created
+        and the API file systems <filename>/proc</filename>, <filename>/sys</filename> and <filename>/dev</filename>
+        will be mounted inside of it, unless they are already mounted. Note that this option has no effect unless used
+        in conjunction with <varname>RootDirectory=</varname> as these three mounts are generally mounted in the host
+        anyway, and unless the root directory is changed the private mount namespace will be a 1:1 copy of the host's,
+        and include these three mounts. Note that the <filename>/dev</filename> file system of the host is bind mounted
+        if this option is used without <varname>PrivateDevices=</varname>. To run the service with a private, minimal
+        version of <filename>/dev/</filename>, combine this option with
+        <varname>PrivateDevices=</varname>.</para></listitem>
      </varlistentry>

      <varlistentry>
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@ -828,6 +828,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
        SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
        SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
        SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST),
        SD_BUS_VTABLE_END
 };

@ -1207,7 +1208,7 @@ int bus_exec_context_set_transient_property(
                              "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
                              "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
-                              "ProtectKernelModules", "ProtectControlGroups")) {
+                              "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS")) {
                int b;

                r = sd_bus_message_read(message, "b", &b);
@ -1247,6 +1248,8 @@ int bus_exec_context_set_transient_property(
                                c->protect_kernel_modules = b;
                        else if (streq(name, "ProtectControlGroups"))
                                c->protect_control_groups = b;
+                        else if (streq(name, "MountAPIVFS"))
+                                c->mount_apivfs = b;

                        unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
                }
--- a/src/core/execute.c
+++ b/src/core/execute.c
@ -1662,6 +1662,9 @@ static bool exec_needs_mount_namespace(
            context->protect_control_groups)
                return true;

+        if (context->mount_apivfs)
+                return true;
+
        return false;
 }

@ -1942,6 +1945,7 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,
                .protect_control_groups = context->protect_control_groups,
                .protect_kernel_tunables = context->protect_kernel_tunables,
                .protect_kernel_modules = context->protect_kernel_modules,
+                .mount_apivfs = context->mount_apivfs,
        };

        assert(context);
@ -3294,6 +3298,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                "%sPrivateUsers: %s\n"
                "%sProtectHome: %s\n"
                "%sProtectSystem: %s\n"
+                "%sMountAPIVFS: %s\n"
                "%sIgnoreSIGPIPE: %s\n"
                "%sMemoryDenyWriteExecute: %s\n"
                "%sRestrictRealtime: %s\n",
@ -3310,6 +3315,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                prefix, yes_no(c->private_users),
                prefix, protect_home_to_string(c->protect_home),
                prefix, protect_system_to_string(c->protect_system),
+                prefix, yes_no(c->mount_apivfs),
                prefix, yes_no(c->ignore_sigpipe),
                prefix, yes_no(c->memory_deny_write_execute),
                prefix, yes_no(c->restrict_realtime));
--- a/src/core/execute.h
+++ b/src/core/execute.h
@ -183,6 +183,7 @@ struct ExecContext {
        bool protect_kernel_tunables;
        bool protect_kernel_modules;
        bool protect_control_groups;
+        bool mount_apivfs;

        bool no_new_privileges;

--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@ -101,6 +101,7 @@ $1.PrivateUsers,                 config_parse_bool,                  0,
 $1.ProtectSystem,                config_parse_protect_system,        0,                             offsetof($1, exec_context)
 $1.ProtectHome,                  config_parse_protect_home,          0,                             offsetof($1, exec_context)
 $1.MountFlags,                   config_parse_exec_mount_flags,      0,                             offsetof($1, exec_context)
+$1.MountAPIVFS,                  config_parse_bool,                  0,                             offsetof($1, exec_context.mount_apivfs)
 $1.Personality,                  config_parse_personality,           0,                             offsetof($1, exec_context.personality)
 $1.RuntimeDirectoryMode,         config_parse_mode,                  0,                             offsetof($1, exec_context.runtime_directory_mode)
 $1.RuntimeDirectory,             config_parse_runtime_directory,     0,                             offsetof($1, exec_context.runtime_directory)
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@ -52,10 +52,13 @@ typedef enum MountMode {
        INACCESSIBLE,
        BIND_MOUNT,
        BIND_MOUNT_RECURSIVE,
-        READONLY,
        PRIVATE_TMP,
        PRIVATE_VAR_TMP,
        PRIVATE_DEV,
+        BIND_DEV,
+        SYSFS,
+        PROCFS,
+        READONLY,
        READWRITE,
 } MountMode;

@ -70,13 +73,13 @@ typedef struct MountEntry {
        char *source_malloc;
 } MountEntry;

-/*
- * The following Protect tables are to protect paths and mark some of them
- * READONLY, in case a path is covered by an option from another table, then
- * it is marked READWRITE in the current one, and the more restrictive mode is
- * applied from that other table. This way all options can be combined in a
- * safe and comprehensible way for users.
- */
+/* If MountAPIVFS= is used, let's mount /proc, /dev and /sys into the namespace, but only as a fallback if the user hasn't mounted
+ * something there already. These mounts are hence overridden by any other explicitly configured mounts. */
+static const MountEntry apivfs_table[] = {
+        { "/proc",               PROCFS,       false },
+        { "/dev",                BIND_DEV,     false },
+        { "/sys",                SYSFS,        false },
+};

 /* ProtectKernelTunables= option and the related filesystem APIs */
 static const MountEntry protect_kernel_tunables_table[] = {
@ -465,7 +468,7 @@ static void drop_outside_root(const char *root_directory, MountEntry *m, unsigne
        *n = t - m;
 }

-static int mount_dev(MountEntry *m) {
+static int mount_private_dev(MountEntry *m) {
        static const char devnodes[] =
                "/dev/null\0"
                "/dev/zero\0"
@ -604,6 +607,62 @@ fail:
        return r;
 }

+static int mount_bind_dev(MountEntry *m) {
+        int r;
+
+        assert(m);
+
+        /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
+         * /dev. This is only used when RootDirectory= is set. */
+
+        r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+        if (r < 0)
+                return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
+        if (r > 0) /* make this a NOP if /dev is already a mount point */
+                return 0;
+
+        if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
+                return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
+
+        return 1;
+}
+
+static int mount_sysfs(MountEntry *m) {
+        int r;
+
+        assert(m);
+
+        r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+        if (r < 0)
+                return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
+        if (r > 0) /* make this a NOP if /sys is already a mount point */
+                return 0;
+
+        /* Bind mount the host's version so that we get all child mounts of it, too. */
+        if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
+                return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
+
+        return 1;
+}
+
+static int mount_procfs(MountEntry *m) {
+        int r;
+
+        assert(m);
+
+        r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+        if (r < 0)
+                return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
+        if (r > 0) /* make this a NOP if /proc is already a mount point */
+                return 0;
+
+        /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
+        if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
+                return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
+
+        return 1;
+}
+
 static int mount_entry_chase(
                const char *root_directory,
                MountEntry *m,
@ -691,6 +750,7 @@ static int apply_mount(

        case BIND_MOUNT_RECURSIVE:
                /* Also chase the source mount */
+
                r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
                if (r <= 0)
                        return r;
@ -707,7 +767,16 @@ static int apply_mount(
                break;

        case PRIVATE_DEV:
-                return mount_dev(m);
+                return mount_private_dev(m);
+
+        case BIND_DEV:
+                return mount_bind_dev(m);
+
+        case SYSFS:
+                return mount_sysfs(m);
+
+        case PROCFS:
+                return mount_procfs(m);

        default:
                assert_not_reached("Unknown mode");
@ -729,7 +798,7 @@ static int make_read_only(MountEntry *m, char **blacklist) {

        if (mount_entry_read_only(m))
                r = bind_remount_recursive(mount_entry_path(m), true, blacklist);
-        else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
+        else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't*/
                if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
                        r = -errno;
        } else
@ -745,6 +814,17 @@ static int make_read_only(MountEntry *m, char **blacklist) {
        return r;
 }

+static bool namespace_info_mount_apivfs(const NameSpaceInfo *ns_info) {
+        assert(ns_info);
+
+        /* ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, since to protect the API VFS mounts,
+         * they need to be around in the first place... */
+
+        return ns_info->mount_apivfs ||
+                ns_info->protect_control_groups ||
+                ns_info->protect_kernel_tunables;
+}
+
 static unsigned namespace_calculate_mounts(
                const NameSpaceInfo *ns_info,
                char** read_write_paths,
@ -781,7 +861,8 @@ static unsigned namespace_calculate_mounts(
                (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
                (ns_info->protect_control_groups ? 1 : 0) +
                (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
-                protect_home_cnt + protect_system_cnt;
+                protect_home_cnt + protect_system_cnt +
+                (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0);
 }

 int setup_namespace(
@ -885,6 +966,12 @@ int setup_namespace(
                if (r < 0)
                        goto finish;

+                if (namespace_info_mount_apivfs(ns_info)) {
+                        r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
+                        if (r < 0)
+                                goto finish;
+                }
+
                assert(mounts + n_mounts == m);

                /* Prepend the root directory where that's necessary */
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@ -50,6 +50,7 @@ struct NameSpaceInfo {
        bool protect_control_groups:1;
        bool protect_kernel_tunables:1;
        bool protect_kernel_modules:1;
+        bool mount_apivfs:1;
 };

 struct BindMount {
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@ -208,7 +208,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
                              "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
                              "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
-                              "ProtectKernelModules", "ProtectControlGroups")) {
+                              "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS")) {

                r = parse_boolean(eq);
                if (r < 0)