From 6c47cd7d3bf35c8158a0737f34fe2c5dc95e72d6 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 28 Sep 2017 18:55:45 +0200 Subject: [PATCH] execute: make StateDirectory= and friends compatible with DynamicUser=1 and RootDirectory=/RootImage= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's clean up the interaction of StateDirectory= (and friends) to DynamicUser=1: instead of creating these directories directly below /var/lib, place them in /var/lib/private instead if DynamicUser=1 is set, making that directory 0700 and owned by root:root. This way, if a dynamic UID is later reused, access to the old run's state directory is prohibited for that user. Then, use file system namespacing inside the service to make /var/lib/private a readable tmpfs, hiding all state directories that are not listed in StateDirectory=, and making access to the actual state directory possible. Mount all directories listed in StateDirectory= to the same places inside the service (which means they'll now be mounted into the tmpfs instance). Finally, add a symlink from the state directory name in /var/lib/ to the one in /var/lib/private, so that both the host and the service can access the path under the same location. Here's an example: let's say a service runs with StateDirectory=foo. 
When DynamicUser=0 is set, it will get the following setup, and no difference between what the unit and what the host sees: /var/lib/foo (created as directory) Now, if DynamicUser=1 is set, we'll instead get this on the host: /var/lib/private (created as directory with mode 0700, root:root) /var/lib/private/foo (created as directory) /var/lib/foo → private/foo (created as symlink) And from inside the unit: /var/lib/private (a tmpfs mount with mode 0755, root:root) /var/lib/private/foo (bind mounted from the host) /var/lib/foo → private/foo (the same symlink as above) This takes inspiration from how container trees are protected below /var/lib/machines: they generally reuse UIDs/GIDs of the host, but because /var/lib/machines itself is set to 0700 host users cannot access files in the container tree even if the UIDs/GIDs are reused. However, for this commit we add one further trick: inside and outside of the unit /var/lib/private is a different thing: outside it is a plain, inaccessible directory, and inside it is a world-readable tmpfs mount with only the whitelisted subdirs below it, bind mounted in. This means, from the outside the dir acts as an access barrier, but from the inside it does not. And the symlink created in /var/lib/foo itself points across the barrier in both cases, so that root and the unit's user always have access to these dirs without knowing the details of this mounting magic. This logic resolves a major shortcoming of DynamicUser=1 units: previously they couldn't safely store persistent data. With this change they can have their own private state, log and data directories, which they can write to, but which are protected from UID recycling. With this change, if RootDirectory= or RootImage= are used it is ensured that the specified state/log/cache directories are always mounted in from the host. 
This change of semantics I think is much preferable since this means the root directory/image logic can be used easily for read-only resource bundling (as all writable data resides outside of the image). Note that this is a change of behaviour, but given that we haven't released any systemd version with StateDirectory= and friends implemented this should be a safe change to make (in particular as previously it wasn't clear what would actually happen when used in combination). Moreover, by making this change we can later add a "+" modifier to these setings too working similar to the same modifier in ReadOnlyPaths= and friends, making specified paths relative to the container itself. --- src/core/execute.c | 258 +++++++++++++++++++++++++++++++++++++++++-- src/core/namespace.c | 48 ++++++++ src/core/namespace.h | 1 + src/test/test-ns.c | 1 + 4 files changed, 297 insertions(+), 11 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index e7dfbfd010..1faa8e34dc 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1728,6 +1728,13 @@ static bool exec_needs_mount_namespace( if (context->mount_apivfs && (context->root_image || context->root_directory)) return true; + if (context->dynamic_user && + (!strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) || + !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) || + !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) || + !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths))) + return true; + return false; } @@ -1924,7 +1931,8 @@ static int setup_exec_directory( } STRV_FOREACH(rt, context->directories[type].paths) { - _cleanup_free_ char *p; + _cleanup_free_ char *p = NULL, *pp = NULL; + const char *effective; p = strjoin(params->prefix[type], "/", *rt); if (!p) { @@ -1936,12 +1944,83 @@ static int setup_exec_directory( if (r < 0) goto fail; - r = mkdir_label(p, context->directories[type].mode); - if (r < 0 && r != -EEXIST) - goto fail; + if 
(context->dynamic_user && type != EXEC_DIRECTORY_CONFIGURATION) { + _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL; + + /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we + * want to avoid leaving a directory around fully accessible that is owned by a dynamic user + * whose UID is later on reused. To lock this down we use the same trick used by container + * managers to prohibit host users to get access to files of the same UID in containers: we + * place everything inside a directory that has an access mode of 0700 and is owned root:root, + * so that it acts as security boundary for unprivileged host code. We then use fs namespacing + * to make this directory permeable for the service itself. + * + * Specifically: for a service which wants a special directory "foo/" we first create a + * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of + * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way, + * privileged host users can access "foo/" as usual, but unprivileged host users can't look + * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally + * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus + * disabling the access boundary for the service and making sure it only gets access to the + * dirs it needs but no others. Tricky? Yes, absolutely, but it works! + * + * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be + * owned by the service itself. 
*/ + + private_root = strjoin(params->prefix[type], "/private"); + if (!private_root) { + r = -ENOMEM; + goto fail; + } + + /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */ + r = mkdir_safe_label(private_root, 0700, 0, 0); + if (r < 0) + goto fail; + + pp = strjoin(private_root, "/", *rt); + if (!pp) { + r = -ENOMEM; + goto fail; + } + + /* Create all directories between the configured directory and this private root, and mark them 0755 */ + r = mkdir_parents_label(pp, 0755); + if (r < 0) + goto fail; + + /* Finally, create the actual directory for the service */ + r = mkdir_label(pp, context->directories[type].mode); + if (r < 0 && r != -EEXIST) + goto fail; + + parent = dirname_malloc(p); + if (!parent) { + r = -ENOMEM; + goto fail; + } + + r = path_make_relative(parent, pp, &relative); + if (r < 0) + goto fail; + + /* And link it up from the original place */ + r = symlink_idempotent(relative, p); + if (r < 0) + goto fail; + + effective = pp; + + } else { + r = mkdir_label(p, context->directories[type].mode); + if (r < 0 && r != -EEXIST) + goto fail; + + effective = p; + } /* First lock down the access mode */ - if (chmod(p, context->directories[type].mode) < 0) { + if (chmod(effective, context->directories[type].mode) < 0) { r = -errno; goto fail; } @@ -1952,7 +2031,7 @@ static int setup_exec_directory( continue; /* Then, change the ownership of the whole tree, if necessary */ - r = path_chown_recursive(p, uid, gid); + r = path_chown_recursive(effective, uid, gid); if (r < 0) goto fail; } @@ -2044,6 +2123,143 @@ static int compile_read_write_paths( return 0; } +static int compile_bind_mounts( + const ExecContext *context, + const ExecParameters *params, + BindMount **ret_bind_mounts, + unsigned *ret_n_bind_mounts, + char ***ret_empty_directories) { + + _cleanup_strv_free_ char **empty_directories = NULL; + BindMount *bind_mounts; + unsigned n, h = 0, i; + ExecDirectoryType t; + int r; + + assert(context); + 
assert(params); + assert(ret_bind_mounts); + assert(ret_n_bind_mounts); + assert(ret_empty_directories); + + n = context->n_bind_mounts; + for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (!params->prefix[t]) + continue; + + n += strv_length(context->directories[t].paths); + } + + if (n <= 0) { + *ret_bind_mounts = NULL; + *ret_n_bind_mounts = 0; + *ret_empty_directories = NULL; + return 0; + } + + bind_mounts = new(BindMount, n); + if (!bind_mounts) + return -ENOMEM; + + for (i = 0; context->n_bind_mounts; i++) { + BindMount *item = context->bind_mounts + i; + char *s, *d; + + s = strdup(item->source); + if (!s) { + r = -ENOMEM; + goto finish; + } + + d = strdup(item->destination); + if (!d) { + free(s); + r = -ENOMEM; + goto finish; + } + + bind_mounts[h++] = (BindMount) { + .source = s, + .destination = d, + .read_only = item->read_only, + .recursive = item->recursive, + .ignore_enoent = item->ignore_enoent, + }; + } + + for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + char **suffix; + + if (!params->prefix[t]) + continue; + + if (strv_isempty(context->directories[t].paths)) + continue; + + if (context->dynamic_user && t != EXEC_DIRECTORY_CONFIGURATION) { + char *private_root; + + /* So this is for a dynamic user, and we need to make sure the process can access its own + * directory. For that we overmount the usually inaccessible "private" subdirectory with a + * tmpfs that makes it accessible and is empty except for the submounts we do this for. 
*/ + + private_root = strjoin(params->prefix[t], "/private"); + if (!private_root) { + r = -ENOMEM; + goto finish; + } + + r = strv_consume(&empty_directories, private_root); + if (r < 0) { + r = -ENOMEM; + goto finish; + } + } + + STRV_FOREACH(suffix, context->directories[t].paths) { + char *s, *d; + + if (context->dynamic_user && t != EXEC_DIRECTORY_CONFIGURATION) + s = strjoin(params->prefix[t], "/private/", *suffix); + else + s = strjoin(params->prefix[t], "/", *suffix); + if (!s) { + r = -ENOMEM; + goto finish; + } + + d = strdup(s); + if (!d) { + free(s); + r = -ENOMEM; + goto finish; + } + + bind_mounts[h++] = (BindMount) { + .source = s, + .destination = d, + .read_only = false, + .recursive = true, + .ignore_enoent = false, + }; + } + } + + assert(h == n); + + *ret_bind_mounts = bind_mounts; + *ret_n_bind_mounts = n; + *ret_empty_directories = empty_directories; + + empty_directories = NULL; + + return (int) n; + +finish: + bind_mount_free_many(bind_mounts, h); + return r; +} + static int apply_mount_namespace( Unit *u, ExecCommand *command, @@ -2051,7 +2267,7 @@ static int apply_mount_namespace( const ExecParameters *params, ExecRuntime *runtime) { - _cleanup_strv_free_ char **rw = NULL; + _cleanup_strv_free_ char **rw = NULL, **empty_directories = NULL; char *tmp = NULL, *var = NULL; const char *root_dir = NULL, *root_image = NULL; NameSpaceInfo ns_info = { @@ -2063,6 +2279,8 @@ static int apply_mount_namespace( .mount_apivfs = context->mount_apivfs, }; bool needs_sandboxing; + BindMount *bind_mounts = NULL; + unsigned n_bind_mounts = 0; int r; assert(context); @@ -2089,6 +2307,10 @@ static int apply_mount_namespace( root_dir = context->root_directory; } + r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories); + if (r < 0) + return r; + /* * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed * sandbox info, otherwise enforce it, don't ignore protected paths and @@ -2103,8 +2325,9 @@ static int 
apply_mount_namespace( &ns_info, rw, needs_sandboxing ? context->read_only_paths : NULL, needs_sandboxing ? context->inaccessible_paths : NULL, - context->bind_mounts, - context->n_bind_mounts, + empty_directories, + bind_mounts, + n_bind_mounts, tmp, var, needs_sandboxing ? context->protect_home : PROTECT_HOME_NO, @@ -2112,6 +2335,8 @@ static int apply_mount_namespace( context->mount_flags, DISSECT_IMAGE_DISCARD_ON_LOOP); + bind_mount_free_many(bind_mounts, n_bind_mounts); + /* If we couldn't set up the namespace this is probably due to a * missing capability. In this case, silently proceeed. */ if (IN_SET(r, -EPERM, -EACCES)) { @@ -3319,10 +3544,21 @@ int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_p if (!p) return -ENOMEM; - /* We execute this synchronously, since we need to be - * sure this is gone when we start the service + /* We execute this synchronously, since we need to be sure this is gone when we start the service * next. */ (void) rm_rf(p, REMOVE_ROOT); + + /* Also destroy any matching subdirectory below /private/. This is done to support DynamicUser=1 + * setups. Note that we don't conditionalize here on that though, as the namespace is same way, and it + * makes us a bit more robust towards changing unit settings. Or to say this differently: in the worst + * case this is a NOP. 
*/ + + free(p); + p = strjoin(runtime_prefix, "/private/", *i); + if (!p) + return -ENOMEM; + + (void) rm_rf(p, REMOVE_ROOT); } return 0; diff --git a/src/core/namespace.c b/src/core/namespace.c index 03a18d45eb..932fbe5c54 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -59,6 +59,7 @@ typedef enum MountMode { PRIVATE_VAR_TMP, PRIVATE_DEV, BIND_DEV, + EMPTY_DIR, SYSFS, PROCFS, READONLY, @@ -225,6 +226,28 @@ static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) { return 0; } +static int append_empty_dir_mounts(MountEntry **p, char **strv) { + char **i; + + assert(p); + + /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the + * "/private/" boundary directories for DynamicUser=1. */ + + STRV_FOREACH(i, strv) { + + *((*p)++) = (MountEntry) { + .path_const = *i, + .mode = EMPTY_DIR, + .ignore = false, + .has_prefix = false, + .read_only = true, + }; + } + + return 0; +} + static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) { unsigned i; @@ -673,6 +696,20 @@ static int mount_procfs(MountEntry *m) { return 1; } +static int mount_empty_dir(MountEntry *m) { + assert(m); + + /* First, get rid of everything that is below if there is anything. 
Then, overmount with our new empty dir */ + + (void) mkdir_p_label(mount_entry_path(m), 0755); + (void) umount_recursive(mount_entry_path(m), 0); + + if (mount("tmpfs", mount_entry_path(m), "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0) + return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + + return 1; +} + static int mount_entry_chase( const char *root_directory, MountEntry *m, @@ -771,6 +808,9 @@ static int apply_mount( make = true; break; + case EMPTY_DIR: + return mount_empty_dir(m); + case PRIVATE_TMP: what = tmp_dir; make = true; @@ -880,6 +920,7 @@ static unsigned namespace_calculate_mounts( char** read_write_paths, char** read_only_paths, char** inaccessible_paths, + char** empty_directories, const BindMount *bind_mounts, unsigned n_bind_mounts, const char* tmp_dir, @@ -906,6 +947,7 @@ static unsigned namespace_calculate_mounts( strv_length(read_write_paths) + strv_length(read_only_paths) + strv_length(inaccessible_paths) + + strv_length(empty_directories) + n_bind_mounts + ns_info->private_dev + (ns_info->protect_kernel_tunables ? 
ELEMENTSOF(protect_kernel_tunables_table) : 0) + @@ -922,6 +964,7 @@ int setup_namespace( char** read_write_paths, char** read_only_paths, char** inaccessible_paths, + char** empty_directories, const BindMount *bind_mounts, unsigned n_bind_mounts, const char* tmp_dir, @@ -993,6 +1036,7 @@ int setup_namespace( read_write_paths, read_only_paths, inaccessible_paths, + empty_directories, bind_mounts, n_bind_mounts, tmp_dir, var_tmp_dir, protect_home, protect_system); @@ -1015,6 +1059,10 @@ int setup_namespace( if (r < 0) goto finish; + r = append_empty_dir_mounts(&m, empty_directories); + if (r < 0) + goto finish; + r = append_bind_mounts(&m, bind_mounts, n_bind_mounts); if (r < 0) goto finish; diff --git a/src/core/namespace.h b/src/core/namespace.h index f54954bd86..da8d85dbc5 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -69,6 +69,7 @@ int setup_namespace( char **read_write_paths, char **read_only_paths, char **inaccessible_paths, + char **empty_directories, const BindMount *bind_mounts, unsigned n_bind_mounts, const char *tmp_dir, diff --git a/src/test/test-ns.c b/src/test/test-ns.c index 0125d905a6..b142c3a115 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -82,6 +82,7 @@ int main(int argc, char *argv[]) { (char **) writable, (char **) readonly, (char **) inaccessible, + NULL, &(BindMount) { .source = (char*) "/usr/bin", .destination = (char*) "/etc/systemd", .read_only = true }, 1, tmp_dir, var_tmp_dir,