diff --git a/src/core/execute.c b/src/core/execute.c index e7dfbfd010..1faa8e34dc 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1728,6 +1728,13 @@ static bool exec_needs_mount_namespace( if (context->mount_apivfs && (context->root_image || context->root_directory)) return true; + if (context->dynamic_user && + (!strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) || + !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) || + !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) || + !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths))) + return true; + return false; } @@ -1924,7 +1931,8 @@ static int setup_exec_directory( } STRV_FOREACH(rt, context->directories[type].paths) { - _cleanup_free_ char *p; + _cleanup_free_ char *p = NULL, *pp = NULL; + const char *effective; p = strjoin(params->prefix[type], "/", *rt); if (!p) { @@ -1936,12 +1944,83 @@ static int setup_exec_directory( if (r < 0) goto fail; - r = mkdir_label(p, context->directories[type].mode); - if (r < 0 && r != -EEXIST) - goto fail; + if (context->dynamic_user && type != EXEC_DIRECTORY_CONFIGURATION) { + _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL; + + /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we + * want to avoid leaving a directory around fully accessible that is owned by a dynamic user + * whose UID is later on reused. To lock this down we use the same trick used by container + * managers to prohibit host users to get access to files of the same UID in containers: we + * place everything inside a directory that has an access mode of 0700 and is owned root:root, + * so that it acts as security boundary for unprivileged host code. We then use fs namespacing + * to make this directory permeable for the service itself. + * + * Specifically: for a service which wants a special directory "foo/" we first create a + * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of + * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way, + * privileged host users can access "foo/" as usual, but unprivileged host users can't look + * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally + * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus + * disabling the access boundary for the service and making sure it only gets access to the + * dirs it needs but no others. Tricky? Yes, absolutely, but it works! + * + * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be + * owned by the service itself. */ + + private_root = strjoin(params->prefix[type], "/private"); + if (!private_root) { + r = -ENOMEM; + goto fail; + } + + /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */ + r = mkdir_safe_label(private_root, 0700, 0, 0); + if (r < 0) + goto fail; + + pp = strjoin(private_root, "/", *rt); + if (!pp) { + r = -ENOMEM; + goto fail; + } + + /* Create all directories between the configured directory and this private root, and mark them 0755 */ + r = mkdir_parents_label(pp, 0755); + if (r < 0) + goto fail; + + /* Finally, create the actual directory for the service */ + r = mkdir_label(pp, context->directories[type].mode); + if (r < 0 && r != -EEXIST) + goto fail; + + parent = dirname_malloc(p); + if (!parent) { + r = -ENOMEM; + goto fail; + } + + r = path_make_relative(parent, pp, &relative); + if (r < 0) + goto fail; + + /* And link it up from the original place */ + r = symlink_idempotent(relative, p); + if (r < 0) + goto fail; + + effective = pp; + + } else { + r = mkdir_label(p, context->directories[type].mode); + if (r < 0 && r != -EEXIST) + goto fail; + + effective = p; + } /* First lock down the access mode */ - if (chmod(p, context->directories[type].mode) < 0) { + if (chmod(effective, context->directories[type].mode) < 0) { r = -errno; goto fail; } @@ -1952,7 +2031,7 @@ static int setup_exec_directory( continue; /* Then, change the ownership of the whole tree, if necessary */ - r = path_chown_recursive(p, uid, gid); + r = path_chown_recursive(effective, uid, gid); if (r < 0) goto fail; } @@ -2044,6 +2123,143 @@ static int compile_read_write_paths( return 0; } +static int compile_bind_mounts( + const ExecContext *context, + const ExecParameters *params, + BindMount **ret_bind_mounts, + unsigned *ret_n_bind_mounts, + char ***ret_empty_directories) { + + _cleanup_strv_free_ char **empty_directories = NULL; + BindMount *bind_mounts; + unsigned n, h = 0, i; + ExecDirectoryType t; + int r; + + assert(context); + assert(params); + assert(ret_bind_mounts); + assert(ret_n_bind_mounts); + assert(ret_empty_directories); + + n = context->n_bind_mounts; + for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + if (!params->prefix[t]) + continue; + + n += strv_length(context->directories[t].paths); + } + + if (n <= 0) { + *ret_bind_mounts = NULL; + *ret_n_bind_mounts = 0; + *ret_empty_directories = NULL; + return 0; + } + + bind_mounts = new(BindMount, n); + if (!bind_mounts) + return -ENOMEM; + + for (i = 0; context->n_bind_mounts; i++) { + BindMount *item = context->bind_mounts + i; + char *s, *d; + + s = strdup(item->source); + if (!s) { + r = -ENOMEM; + goto finish; + } + + d = strdup(item->destination); + if (!d) { + free(s); + r = -ENOMEM; + goto finish; + } + + bind_mounts[h++] = (BindMount) { + .source = s, + .destination = d, + .read_only = item->read_only, + .recursive = item->recursive, + .ignore_enoent = item->ignore_enoent, + }; + } + + for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + char **suffix; + + if (!params->prefix[t]) + continue; + + if (strv_isempty(context->directories[t].paths)) + continue; + + if (context->dynamic_user && t != EXEC_DIRECTORY_CONFIGURATION) { + char *private_root; + + /* So this is for a dynamic user, and we need to make sure the process can access its own + * directory. For that we overmount the usually inaccessible "private" subdirectory with a + * tmpfs that makes it accessible and is empty except for the submounts we do this for. */ + + private_root = strjoin(params->prefix[t], "/private"); + if (!private_root) { + r = -ENOMEM; + goto finish; + } + + r = strv_consume(&empty_directories, private_root); + if (r < 0) { + r = -ENOMEM; + goto finish; + } + } + + STRV_FOREACH(suffix, context->directories[t].paths) { + char *s, *d; + + if (context->dynamic_user && t != EXEC_DIRECTORY_CONFIGURATION) + s = strjoin(params->prefix[t], "/private/", *suffix); + else + s = strjoin(params->prefix[t], "/", *suffix); + if (!s) { + r = -ENOMEM; + goto finish; + } + + d = strdup(s); + if (!d) { + free(s); + r = -ENOMEM; + goto finish; + } + + bind_mounts[h++] = (BindMount) { + .source = s, + .destination = d, + .read_only = false, + .recursive = true, + .ignore_enoent = false, + }; + } + } + + assert(h == n); + + *ret_bind_mounts = bind_mounts; + *ret_n_bind_mounts = n; + *ret_empty_directories = empty_directories; + + empty_directories = NULL; + + return (int) n; + +finish: + bind_mount_free_many(bind_mounts, h); + return r; +} + static int apply_mount_namespace( Unit *u, ExecCommand *command, @@ -2051,7 +2267,7 @@ static int apply_mount_namespace( const ExecParameters *params, ExecRuntime *runtime) { - _cleanup_strv_free_ char **rw = NULL; + _cleanup_strv_free_ char **rw = NULL, **empty_directories = NULL; char *tmp = NULL, *var = NULL; const char *root_dir = NULL, *root_image = NULL; NameSpaceInfo ns_info = { @@ -2063,6 +2279,8 @@ static int apply_mount_namespace( .mount_apivfs = context->mount_apivfs, }; bool needs_sandboxing; + BindMount *bind_mounts = NULL; + unsigned n_bind_mounts = 0; int r; assert(context); @@ -2089,6 +2307,10 @@ static int apply_mount_namespace( root_dir = context->root_directory; } + r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories); + if (r < 0) + return r; + /* * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed * sandbox info, otherwise enforce it, don't ignore protected paths and @@ -2103,8 +2325,9 @@ static int apply_mount_namespace( &ns_info, rw, needs_sandboxing ? context->read_only_paths : NULL, needs_sandboxing ? context->inaccessible_paths : NULL, - context->bind_mounts, - context->n_bind_mounts, + empty_directories, + bind_mounts, + n_bind_mounts, tmp, var, needs_sandboxing ? context->protect_home : PROTECT_HOME_NO, @@ -2112,6 +2335,8 @@ static int apply_mount_namespace( context->mount_flags, DISSECT_IMAGE_DISCARD_ON_LOOP); + bind_mount_free_many(bind_mounts, n_bind_mounts); + /* If we couldn't set up the namespace this is probably due to a * missing capability. In this case, silently proceeed. */ if (IN_SET(r, -EPERM, -EACCES)) { @@ -3319,10 +3544,21 @@ int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_p if (!p) return -ENOMEM; - /* We execute this synchronously, since we need to be - * sure this is gone when we start the service + /* We execute this synchronously, since we need to be sure this is gone when we start the service * next. */ (void) rm_rf(p, REMOVE_ROOT); + + /* Also destroy any matching subdirectory below /private/. This is done to support DynamicUser=1 + * setups. Note that we don't conditionalize here on that though, as the namespace is same way, and it + * makes us a bit more robust towards changing unit settings. Or to say this differently: in the worst + * case this is a NOP. */ + + free(p); + p = strjoin(runtime_prefix, "/private/", *i); + if (!p) + return -ENOMEM; + + (void) rm_rf(p, REMOVE_ROOT); } return 0; diff --git a/src/core/namespace.c b/src/core/namespace.c index 03a18d45eb..932fbe5c54 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -59,6 +59,7 @@ typedef enum MountMode { PRIVATE_VAR_TMP, PRIVATE_DEV, BIND_DEV, + EMPTY_DIR, SYSFS, PROCFS, READONLY, @@ -225,6 +226,28 @@ static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) { return 0; } +static int append_empty_dir_mounts(MountEntry **p, char **strv) { + char **i; + + assert(p); + + /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the + * "/private/" boundary directories for DynamicUser=1. */ + + STRV_FOREACH(i, strv) { + + *((*p)++) = (MountEntry) { + .path_const = *i, + .mode = EMPTY_DIR, + .ignore = false, + .has_prefix = false, + .read_only = true, + }; + } + + return 0; +} + static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) { unsigned i; @@ -673,6 +696,20 @@ static int mount_procfs(MountEntry *m) { return 1; } +static int mount_empty_dir(MountEntry *m) { + assert(m); + + /* First, get rid of everything that is below if there is anything. Then, overmount with our new empty dir */ + + (void) mkdir_p_label(mount_entry_path(m), 0755); + (void) umount_recursive(mount_entry_path(m), 0); + + if (mount("tmpfs", mount_entry_path(m), "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0) + return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + + return 1; +} + static int mount_entry_chase( const char *root_directory, MountEntry *m, @@ -771,6 +808,9 @@ static int apply_mount( make = true; break; + case EMPTY_DIR: + return mount_empty_dir(m); + case PRIVATE_TMP: what = tmp_dir; make = true; @@ -880,6 +920,7 @@ static unsigned namespace_calculate_mounts( char** read_write_paths, char** read_only_paths, char** inaccessible_paths, + char** empty_directories, const BindMount *bind_mounts, unsigned n_bind_mounts, const char* tmp_dir, @@ -906,6 +947,7 @@ static unsigned namespace_calculate_mounts( strv_length(read_write_paths) + strv_length(read_only_paths) + strv_length(inaccessible_paths) + + strv_length(empty_directories) + n_bind_mounts + ns_info->private_dev + (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + @@ -922,6 +964,7 @@ int setup_namespace( char** read_write_paths, char** read_only_paths, char** inaccessible_paths, + char** empty_directories, const BindMount *bind_mounts, unsigned n_bind_mounts, const char* tmp_dir, @@ -993,6 +1036,7 @@ int setup_namespace( read_write_paths, read_only_paths, inaccessible_paths, + empty_directories, bind_mounts, n_bind_mounts, tmp_dir, var_tmp_dir, protect_home, protect_system); @@ -1015,6 +1059,10 @@ int setup_namespace( if (r < 0) goto finish; + r = append_empty_dir_mounts(&m, empty_directories); + if (r < 0) + goto finish; + r = append_bind_mounts(&m, bind_mounts, n_bind_mounts); if (r < 0) goto finish; diff --git a/src/core/namespace.h b/src/core/namespace.h index f54954bd86..da8d85dbc5 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -69,6 +69,7 @@ int setup_namespace( char **read_write_paths, char **read_only_paths, char **inaccessible_paths, + char **empty_directories, const BindMount *bind_mounts, unsigned n_bind_mounts, const char *tmp_dir, diff --git a/src/test/test-ns.c b/src/test/test-ns.c index 0125d905a6..b142c3a115 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -82,6 +82,7 @@ int main(int argc, char *argv[]) { (char **) writable, (char **) readonly, (char **) inaccessible, + NULL, &(BindMount) { .source = (char*) "/usr/bin", .destination = (char*) "/etc/systemd", .read_only = true }, 1, tmp_dir, var_tmp_dir,