From 9b68367b3a076c168c9f0e8659d0122237a55197 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Sun, 29 Jul 2018 00:38:36 +0900 Subject: [PATCH 1/4] core/namespace: drop conditions depends on `root` is empty or not After 0722b359342d2a9f9e0d453875624387a0ba1be2, the variable `root` is always set. --- src/core/namespace.c | 52 +++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index e4930db15c..c1ee84779a 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1029,24 +1029,21 @@ static int make_read_only(const MountEntry *m, char **blacklist, FILE *proc_self return r; } -static bool namespace_info_mount_apivfs(const char *root_directory, const NamespaceInfo *ns_info) { +static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) { assert(ns_info); /* * ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, * since to protect the API VFS mounts, they need to be around in the - * first place... and RootDirectory= or RootImage= need to be set. + * first place... */ - /* root_directory should point to a mount point */ - return root_directory && - (ns_info->mount_apivfs || - ns_info->protect_control_groups || - ns_info->protect_kernel_tunables); + return ns_info->mount_apivfs || + ns_info->protect_control_groups || + ns_info->protect_kernel_tunables; } static size_t namespace_calculate_mounts( - const char* root_directory, const NamespaceInfo *ns_info, char** read_write_paths, char** read_only_paths, @@ -1088,10 +1085,11 @@ static size_t namespace_calculate_mounts( (ns_info->protect_control_groups ? 1 : 0) + (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) + protect_home_cnt + protect_system_cnt + - (namespace_info_mount_apivfs(root_directory, ns_info) ? ELEMENTSOF(apivfs_table) : 0); + (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0); } static void normalize_mounts(const char *root_directory, MountEntry *mounts, size_t *n_mounts) { + assert(root_directory); assert(n_mounts); assert(mounts || *n_mounts == 0); @@ -1127,11 +1125,9 @@ int setup_namespace( _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; _cleanup_free_ void *root_hash = NULL; MountEntry *m, *mounts = NULL; - size_t root_hash_size = 0; - const char *root; - size_t n_mounts; - bool make_slave; + size_t n_mounts, root_hash_size = 0; bool require_prefix = false; + const char *root; int r = 0; assert(ns_info); @@ -1181,7 +1177,6 @@ int setup_namespace( } n_mounts = namespace_calculate_mounts( - root, ns_info, read_write_paths, read_only_paths, @@ -1192,9 +1187,6 @@ int setup_namespace( tmp_dir, var_tmp_dir, protect_home, protect_system); - /* Set mount slave mode */ - make_slave = root || n_mounts > 0 || ns_info->private_mounts; - if (n_mounts > 0) { m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry)); r = append_access_mounts(&m, read_write_paths, READWRITE, require_prefix); @@ -1271,7 +1263,7 @@ int setup_namespace( if (r < 0) goto finish; - if (namespace_info_mount_apivfs(root, ns_info)) { + if (namespace_info_mount_apivfs(ns_info)) { r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths); if (r < 0) goto finish; @@ -1292,13 +1284,11 @@ int setup_namespace( goto finish; } - if (make_slave) { - /* Remount / as SLAVE so that nothing now mounted in the namespace - shows up in the parent */ - if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { - r = -errno; - goto finish; - } + /* Remount / as SLAVE so that nothing now mounted in the namespace + * shows up in the parent */ + if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { + r = -errno; + goto finish; } if (root_image) { @@ -1328,7 +1318,7 @@ int setup_namespace( } } - } else if (root) { + } else { /* Let's mount the main root directory to the root directory to use */ if (mount("/", root, NULL, MS_BIND|MS_REC, NULL) < 0) { @@ -1402,12 +1392,10 @@ int setup_namespace( } } - if (root) { - /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */ - r = mount_move_root(root); - if (r < 0) - goto finish; - } + /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */ + r = mount_move_root(root); + if (r < 0) + goto finish; /* Remount / as the desired mode. Note that this will not * reestablish propagation from our side to the host, since From 839f18775317bb2e9f8f46588d6f79ca09547e8a Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Sun, 29 Jul 2018 00:42:41 +0900 Subject: [PATCH 2/4] core/namespace: drop mount points outside of root even if RootDirectory= is not set --- src/core/namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index c1ee84779a..201192a5a5 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1276,7 +1276,7 @@ int setup_namespace( if (r < 0) goto finish; - normalize_mounts(root_directory, mounts, &n_mounts); + normalize_mounts(root, mounts, &n_mounts); } if (unshare(CLONE_NEWNS) < 0) { @@ -1375,7 +1375,7 @@ int setup_namespace( if (!again) break; - normalize_mounts(root_directory, mounts, &n_mounts); + normalize_mounts(root, mounts, &n_mounts); } /* Create a blacklist we can pass to bind_mount_recursive() */ From fd870bac25c2dd36affaed0251b5a7023f635306 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Mon, 6 Aug 2018 13:42:14 +0900 Subject: [PATCH 3/4] core: introduce cgroup_add_device_allow() --- src/core/cgroup.c | 29 +++++++++++++++++++++++++++++ src/core/cgroup.h | 2 ++ src/core/load-fragment.c | 13 +------------ 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index eea30b21ff..0ea1b54040 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -304,6 +304,35 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { } } +int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) { + _cleanup_free_ CGroupDeviceAllow *a = NULL; + _cleanup_free_ char *d = NULL; + + assert(c); + assert(dev); + assert(isempty(mode) || in_charset(mode, "rwm")); + + a = new(CGroupDeviceAllow, 1); + if (!a) + return -ENOMEM; + + d = strdup(dev); + if (!d) + return -ENOMEM; + + *a = (CGroupDeviceAllow) { + .path = TAKE_PTR(d), + .r = isempty(mode) || !!strchr(mode, 'r'), + .w = isempty(mode) || !!strchr(mode, 'w'), + .m = isempty(mode) || !!strchr(mode, 'm'), + }; + + LIST_PREPEND(device_allow, c->device_allow, a); + TAKE_PTR(a); + + return 0; +} + static int lookup_block_device(const char *p, dev_t *ret) { struct stat st; int r; diff --git a/src/core/cgroup.h b/src/core/cgroup.h index bda139c30b..2ce87f3b99 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -137,6 +137,8 @@ void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit * void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w); void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b); +int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode); + CGroupMask unit_get_own_mask(Unit *u); CGroupMask unit_get_delegate_mask(Unit *u); CGroupMask unit_get_members_mask(Unit *u); diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 908d16c9f2..07fcbaacc0 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -3212,7 +3212,6 @@ int config_parse_device_allow( _cleanup_free_ char *path = NULL, *resolved = NULL; CGroupContext *c = data; - CGroupDeviceAllow *a; const char *p = rvalue; int r; @@ -3261,17 +3260,7 @@ int config_parse_device_allow( return 0; } - a = new0(CGroupDeviceAllow, 1); - if (!a) - return log_oom(); - - a->path = TAKE_PTR(resolved); - a->r = isempty(p) || !!strchr(p, 'r'); - a->w = isempty(p) || !!strchr(p, 'w'); - a->m = isempty(p) || !!strchr(p, 'm'); - - LIST_PREPEND(device_allow, c->device_allow, a); - return 0; + return cgroup_add_device_allow(c, resolved, p); } int config_parse_io_device_weight( From fe65e88ba6ad876baf759461fd99162f706dd35e Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Mon, 6 Aug 2018 14:02:28 +0900 Subject: [PATCH 4/4] namespace: implicitly adds DeviceAllow= when RootImage= is set RootImage= may require the following settings ``` DeviceAllow=/dev/loop-control rw DeviceAllow=block-loop rwm DeviceAllow=block-blkext rwm ``` This adds the following settings implicitly when RootImage= is specified. Fixes #9737. --- man/systemd.exec.xml | 11 ++++++++++- src/core/unit.c | 22 +++++++++++++++++++--- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index c898d226a7..0b650fc67a 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -124,7 +124,16 @@ partition table, or a file system within an MBR/MS-DOS or GPT partition table with only a single Linux-compatible partition, or a set of file systems within a GPT partition table that follows the Discoverable Partitions - Specification. + Specification. + + When DevicePolicy= is set to closed or strict, + or set to auto and DeviceAllow= is set, then this setting adds + /dev/loop-control with rw mode, block-loop and + block-blkext with rwm mode to DeviceAllow=. See + systemd.resource-control5 + for the details about DevicePolicy= or DeviceAllow=. Also, see + PrivateDevices= below, as it may change the setting of DevicePolicy=. + diff --git a/src/core/unit.c b/src/core/unit.c index 23433be31c..17f4ff3ebd 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -4143,12 +4143,28 @@ int unit_patch_contexts(Unit *u) { } cc = unit_get_cgroup_context(u); - if (cc) { + if (cc && ec) { - if (ec && - ec->private_devices && + if (ec->private_devices && cc->device_policy == CGROUP_AUTO) cc->device_policy = CGROUP_CLOSED; + + if (ec->root_image && + (cc->device_policy != CGROUP_AUTO || cc->device_allow)) { + + /* When RootImage= is specified, the following devices are touched. */ + r = cgroup_add_device_allow(cc, "/dev/loop-control", "rw"); + if (r < 0) + return r; + + r = cgroup_add_device_allow(cc, "block-loop", "rwm"); + if (r < 0) + return r; + + r = cgroup_add_device_allow(cc, "block-blkext", "rwm"); + if (r < 0) + return r; + } } return 0;