From 1eb7e08e20a329b1f074968c88fee5d8adf3bbaf Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Dec 2016 17:11:06 +0100 Subject: [PATCH 1/9] core: fix minor memleak in namespace.c The source_malloc field wants to be freed, too. --- src/core/namespace.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index 834883267c..d0fdc3d8c5 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -176,6 +176,13 @@ static const char *mount_entry_source(const MountEntry *p) { return p->source_malloc ?: p->source_const; } +static void mount_entry_done(MountEntry *p) { + assert(p); + + p->path_malloc = mfree(p->path_malloc); + p->source_malloc = mfree(p->source_malloc); +} + static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) { char **i; @@ -351,7 +358,7 @@ static void drop_duplicates(MountEntry *m, unsigned *n) { if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) { log_debug("%s is duplicate.", mount_entry_path(f)); previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */ - f->path_malloc = mfree(f->path_malloc); + mount_entry_done(f); continue; } @@ -379,7 +386,7 @@ static void drop_inaccessible(MountEntry *m, unsigned *n) { * it, as inaccessible paths really should drop the entire subtree. */ if (clear && path_startswith(mount_entry_path(f), clear)) { log_debug("%s is masked by %s.", mount_entry_path(f), clear); - f->path_malloc = mfree(f->path_malloc); + mount_entry_done(f); continue; } @@ -419,7 +426,7 @@ static void drop_nop(MountEntry *m, unsigned *n) { /* We found it, let's see if it's the same mode, if so, we can drop this entry */ if (found && p->mode == f->mode) { log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p)); - f->path_malloc = mfree(f->path_malloc); + mount_entry_done(f); continue; } } @@ -447,7 +454,7 @@ static void drop_outside_root(const char *root_directory, MountEntry *m, unsigne if (!path_startswith(mount_entry_path(f), root_directory)) { log_debug("%s is outside of root directory.", mount_entry_path(f)); - f->path_malloc = mfree(f->path_malloc); + mount_entry_done(f); continue; } @@ -964,7 +971,7 @@ int setup_namespace( finish: for (m = mounts; m < mounts + n_mounts; m++) - free(m->path_malloc); + mount_entry_done(m); return r; } From 5d997827e2ebe5d4f438748d1ac87c10c29045c6 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 22 Dec 2016 23:34:35 +0100 Subject: [PATCH 2/9] core: add a per-unit setting MountAPIVFS= for mounting /dev, /proc, /sys in conjunction with RootDirectory= This adds a boolean unit file setting MountAPIVFS=. If set, the three main API VFS mounts will be mounted for the service. This only has an effect on RootDirectory=, which it makes a ton times more useful. (This is basically the /dev + /proc + /sys mounting code posted in the original #4727, but rebased on current git, and with the automatic logic replaced by explicit logic controlled by a unit file setting) --- man/systemd.exec.xml | 18 ++++- src/core/dbus-execute.c | 5 +- src/core/execute.c | 6 ++ src/core/execute.h | 1 + src/core/load-fragment-gperf.gperf.m4 | 1 + src/core/namespace.c | 111 +++++++++++++++++++++++--- src/core/namespace.h | 1 + src/shared/bus-unit-util.c | 2 +- 8 files changed, 129 insertions(+), 16 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index bb38ea2467..e594dc1b0c 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -132,8 +132,22 @@ the chroot() jail. Note that setting this parameter might result in additional dependencies to be added to the unit (see above). - The PrivateUsers= setting is particularly useful in conjunction with - RootDirectory=. For details, see below. + The MountAPIVFS= and PrivateUsers= settings are particularly useful + in conjunction with RootDirectory=. For details, see below. + + + + MountAPIVFS= + + Takes a boolean argument. If on, a private mount namespace for the unit's processes is created + and the API file systems /proc, /sys and /dev + will be mounted inside of it, unless they are already mounted. Note that this option has no effect unless used + in conjunction with RootDirectory= as these three mounts are generally mounted in the host + anyway, and unless the root directory is changed the private mount namespace will be a 1:1 copy of the host's, + and include these three mounts. Note that the /dev file system of the host is bind mounted + if this option is used without PrivateDevices=. To run the service with a private, minimal + version of /dev/, combine this option with + PrivateDevices=. diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index cc10e2d8e7..60b0288bb0 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -828,6 +828,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_VTABLE_END }; @@ -1207,7 +1208,7 @@ int bus_exec_context_set_transient_property( "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", - "ProtectKernelModules", "ProtectControlGroups")) { + "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS")) { int b; r = sd_bus_message_read(message, "b", &b); @@ -1247,6 +1248,8 @@ int bus_exec_context_set_transient_property( c->protect_kernel_modules = b; else if (streq(name, "ProtectControlGroups")) c->protect_control_groups = b; + else if (streq(name, "MountAPIVFS")) + c->mount_apivfs = b; unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b)); } diff --git a/src/core/execute.c b/src/core/execute.c index aa0ddb564e..54f6418c5a 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1662,6 +1662,9 @@ static bool exec_needs_mount_namespace( context->protect_control_groups) return true; + if (context->mount_apivfs) + return true; + return false; } @@ -1942,6 +1945,7 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context, .protect_control_groups = context->protect_control_groups, .protect_kernel_tunables = context->protect_kernel_tunables, .protect_kernel_modules = context->protect_kernel_modules, + .mount_apivfs = context->mount_apivfs, }; assert(context); @@ -3294,6 +3298,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { "%sPrivateUsers: %s\n" "%sProtectHome: %s\n" "%sProtectSystem: %s\n" + "%sMountAPIVFS: %s\n" "%sIgnoreSIGPIPE: %s\n" "%sMemoryDenyWriteExecute: %s\n" "%sRestrictRealtime: %s\n", @@ -3310,6 +3315,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { prefix, yes_no(c->private_users), prefix, protect_home_to_string(c->protect_home), prefix, protect_system_to_string(c->protect_system), + prefix, yes_no(c->mount_apivfs), prefix, yes_no(c->ignore_sigpipe), prefix, yes_no(c->memory_deny_write_execute), prefix, yes_no(c->restrict_realtime)); diff --git a/src/core/execute.h b/src/core/execute.h index f8694ef520..6fd5a6e5ce 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -183,6 +183,7 @@ struct ExecContext { bool protect_kernel_tunables; bool protect_kernel_modules; bool protect_control_groups; + bool mount_apivfs; bool no_new_privileges; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 15f22a2681..07f2a70c8f 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -101,6 +101,7 @@ $1.PrivateUsers, config_parse_bool, 0, $1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context) $1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context) $1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context) +$1.MountAPIVFS, config_parse_bool, 0, offsetof($1, exec_context.mount_apivfs) $1.Personality, config_parse_personality, 0, offsetof($1, exec_context.personality) $1.RuntimeDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.runtime_directory_mode) $1.RuntimeDirectory, config_parse_runtime_directory, 0, offsetof($1, exec_context.runtime_directory) diff --git a/src/core/namespace.c b/src/core/namespace.c index d0fdc3d8c5..10917f7b70 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -52,10 +52,13 @@ typedef enum MountMode { INACCESSIBLE, BIND_MOUNT, BIND_MOUNT_RECURSIVE, - READONLY, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV, + BIND_DEV, + SYSFS, + PROCFS, + READONLY, READWRITE, } MountMode; @@ -70,13 +73,13 @@ typedef struct MountEntry { char *source_malloc; } MountEntry; -/* - * The following Protect tables are to protect paths and mark some of them - * READONLY, in case a path is covered by an option from another table, then - * it is marked READWRITE in the current one, and the more restrictive mode is - * applied from that other table. This way all options can be combined in a - * safe and comprehensible way for users. - */ +/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted + * something there already. These mounts are hence overriden by any other explicitly configured mounts. */ +static const MountEntry apivfs_table[] = { + { "/proc", PROCFS, false }, + { "/dev", BIND_DEV, false }, + { "/sys", SYSFS, false }, +}; /* ProtectKernelTunables= option and the related filesystem APIs */ static const MountEntry protect_kernel_tunables_table[] = { @@ -465,7 +468,7 @@ static void drop_outside_root(const char *root_directory, MountEntry *m, unsigne *n = t - m; } -static int mount_dev(MountEntry *m) { +static int mount_private_dev(MountEntry *m) { static const char devnodes[] = "/dev/null\0" "/dev/zero\0" @@ -604,6 +607,62 @@ fail: return r; } +static int mount_bind_dev(MountEntry *m) { + int r; + + assert(m); + + /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's + * /dev. This is only used when RootDirectory= is set. */ + + r = path_is_mount_point(mount_entry_path(m), NULL, 0); + if (r < 0) + return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m"); + if (r > 0) /* make this a NOP if /dev is already a mount point */ + return 0; + + if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0) + return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m)); + + return 1; +} + +static int mount_sysfs(MountEntry *m) { + int r; + + assert(m); + + r = path_is_mount_point(mount_entry_path(m), NULL, 0); + if (r < 0) + return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m"); + if (r > 0) /* make this a NOP if /sys is already a mount point */ + return 0; + + /* Bind mount the host's version so that we get all child mounts of it, too. */ + if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0) + return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + + return 1; +} + +static int mount_procfs(MountEntry *m) { + int r; + + assert(m); + + r = path_is_mount_point(mount_entry_path(m), NULL, 0); + if (r < 0) + return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m"); + if (r > 0) /* make this a NOP if /proc is already a mount point */ + return 0; + + /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */ + if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0) + return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + + return 1; +} + static int mount_entry_chase( const char *root_directory, MountEntry *m, @@ -691,6 +750,7 @@ static int apply_mount( case BIND_MOUNT_RECURSIVE: /* Also chase the source mount */ + r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc); if (r <= 0) return r; @@ -707,7 +767,16 @@ static int apply_mount( break; case PRIVATE_DEV: - return mount_dev(m); + return mount_private_dev(m); + + case BIND_DEV: + return mount_bind_dev(m); + + case SYSFS: + return mount_sysfs(m); + + case PROCFS: + return mount_procfs(m); default: assert_not_reached("Unknown mode"); @@ -729,7 +798,7 @@ static int make_read_only(MountEntry *m, char **blacklist) { if (mount_entry_read_only(m)) r = bind_remount_recursive(mount_entry_path(m), true, blacklist); - else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/ + else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't*/ if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0) r = -errno; } else @@ -745,6 +814,17 @@ static int make_read_only(MountEntry *m, char **blacklist) { return r; } +static bool namespace_info_mount_apivfs(const NameSpaceInfo *ns_info) { + assert(ns_info); + + /* ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, since to protect the API VFS mounts, + * they need to be around in the first place... */ + + return ns_info->mount_apivfs || + ns_info->protect_control_groups || + ns_info->protect_kernel_tunables; +} + static unsigned namespace_calculate_mounts( const NameSpaceInfo *ns_info, char** read_write_paths, @@ -781,7 +861,8 @@ static unsigned namespace_calculate_mounts( (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + (ns_info->protect_control_groups ? 1 : 0) + (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) + - protect_home_cnt + protect_system_cnt; + protect_home_cnt + protect_system_cnt + + (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0); } int setup_namespace( @@ -885,6 +966,12 @@ int setup_namespace( if (r < 0) goto finish; + if (namespace_info_mount_apivfs(ns_info)) { + r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths); + if (r < 0) + goto finish; + } + assert(mounts + n_mounts == m); /* Prepend the root directory where that's necessary */ diff --git a/src/core/namespace.h b/src/core/namespace.h index de3edc419c..bb9de9857c 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -50,6 +50,7 @@ struct NameSpaceInfo { bool protect_control_groups:1; bool protect_kernel_tunables:1; bool protect_kernel_modules:1; + bool mount_apivfs:1; }; struct BindMount { diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 829be2c6da..b6da20a04a 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -208,7 +208,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", - "ProtectKernelModules", "ProtectControlGroups")) { + "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS")) { r = parse_boolean(eq); if (r < 0) From 20b7a0070c2f5a31442f59bece47f7f0875da3cd Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Dec 2016 01:16:43 +0100 Subject: [PATCH 3/9] core: actually make "+" prefix in ReadOnlyPaths=, InaccessiblePaths=, ReadWritablePaths= work 5327c910d2fc1ae91bd0b891be92b30379c7467b claimed to add support for "+" for prefixing paths with the configured RootDirectory=. But actually it only implemented it in the backend, it did not add support for it to the configuration file parsers. Fix that now. --- src/core/dbus-execute.c | 12 +++++++----- src/core/load-fragment.c | 19 +++++++++++++++---- src/shared/bus-unit-util.c | 4 +++- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 60b0288bb0..c57af5aaaf 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -1498,12 +1498,15 @@ int bus_exec_context_set_transient_property( return r; STRV_FOREACH(p, l) { - int offset; - if (!utf8_is_valid(*p)) + const char *i = *p; + size_t offset; + + if (!utf8_is_valid(i)) return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name); - offset = **p == '-'; - if (!path_is_absolute(*p + offset)) + offset = i[0] == '-'; + offset += i[offset] == '+'; + if (!path_is_absolute(i + offset)) return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name); } @@ -1522,7 +1525,6 @@ int bus_exec_context_set_transient_property( unit_write_drop_in_private_format(u, mode, name, "%s=", name); } else { r = strv_extend_strv(dirs, l, true); - if (r < 0) return -ENOMEM; diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 243c288885..5b7471c0d0 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -3839,7 +3839,8 @@ int config_parse_namespace_path_strv( cur = rvalue; for (;;) { _cleanup_free_ char *word = NULL, *resolved = NULL, *joined = NULL; - bool ignore_enoent; + const char *w; + bool ignore_enoent = false, shall_prefix = false; r = extract_first_word(&cur, &word, NULL, EXTRACT_QUOTES); if (r == 0) @@ -3856,9 +3857,17 @@ int config_parse_namespace_path_strv( continue; } - ignore_enoent = word[0] == '-'; + w = word; + if (startswith(w, "-")) { + ignore_enoent = true; + w++; + } + if (startswith(w, "+")) { + shall_prefix = true; + w++; + } - r = unit_full_printf(u, word + ignore_enoent, &resolved); + r = unit_full_printf(u, w, &resolved); if (r < 0) { log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve specifiers in %s: %m", word); continue; @@ -3871,7 +3880,9 @@ int config_parse_namespace_path_strv( path_kill_slashes(resolved); - joined = strjoin(ignore_enoent ? "-" : "", resolved); + joined = strjoin(ignore_enoent ? "-" : "", + shall_prefix ? "+" : "", + resolved); r = strv_push(sv, joined); if (r < 0) diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index b6da20a04a..a4677bef27 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -484,7 +484,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen for (p = eq;;) { _cleanup_free_ char *word = NULL; - int offset; + size_t offset; r = extract_first_word(&p, &word, NULL, EXTRACT_QUOTES); if (r < 0) { @@ -500,6 +500,8 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen } offset = word[0] == '-'; + offset += word[offset] == '+'; + if (!path_is_absolute(word + offset)) { log_error("Failed to parse %s value %s", field, eq); return -EINVAL; From 2eedfd2d8b3441e8cf6dae4bdc9afaefbda19c39 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Dec 2016 11:09:47 +0100 Subject: [PATCH 4/9] dissect: make sure to manually follow symlinks when mounting dissected image If the dissected image contains symlinks for the mount points we need we need to make sure to follow this with chase_symlinks() so that we don't leave the image. --- src/shared/dissect-image.c | 39 +++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index 878cb008aa..5fc2ce25f0 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -28,6 +28,7 @@ #include "blkid-util.h" #include "dissect-image.h" #include "fd-util.h" +#include "fs-util.h" #include "gpt.h" #include "mount-util.h" #include "path-util.h" @@ -35,6 +36,7 @@ #include "stdio-util.h" #include "string-table.h" #include "string-util.h" +#include "strv.h" #include "udev-util.h" static int probe_filesystem(const char *node, char **ret_fstype) { @@ -624,7 +626,9 @@ static int mount_partition( DissectImageFlags flags) { const char *p, *options = NULL, *node, *fstype; + _cleanup_free_ char *chased = NULL; bool rw; + int r; assert(m); assert(where); @@ -641,9 +645,13 @@ static int mount_partition( rw = m->rw && !(flags & DISSECT_IMAGE_READ_ONLY); - if (directory) - p = strjoina(where, directory); - else + if (directory) { + r = chase_symlinks(directory, where, CHASE_PREFIX_ROOT, &chased); + if (r < 0) + return r; + + p = chased; + } else p = where; /* If requested, turn on discard support. */ @@ -677,22 +685,23 @@ int dissected_image_mount(DissectedImage *m, const char *where, DissectImageFlag return r; if (m->partitions[PARTITION_ESP].found) { - const char *mp, *x; + const char *mp; /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */ - mp = "/efi"; - x = strjoina(where, mp); - r = dir_is_empty(x); - if (r == -ENOENT) { - mp = "/boot"; - x = strjoina(where, mp); - r = dir_is_empty(x); - } - if (r > 0) { - r = mount_partition(m->partitions + PARTITION_ESP, where, mp, flags); + FOREACH_STRING(mp, "/efi", "/boot") { + _cleanup_free_ char *p = NULL; + + r = chase_symlinks(mp, where, CHASE_PREFIX_ROOT, &p); if (r < 0) - return r; + continue; + + r = dir_is_empty(p); + if (r > 0) { + r = mount_partition(m->partitions + PARTITION_ESP, where, mp, flags); + if (r < 0) + return r; + } } } From 915e6d1676cf73c4f927f3bbfa21ee82640b1832 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Dec 2016 14:26:05 +0100 Subject: [PATCH 5/9] core: add RootImage= setting for using a specific image file as root directory for a service This is similar to RootDirectory= but mounts the root file system from a block device or loopback file instead of another directory. This reuses the image dissector code now used by nspawn and gpt-auto-discovery. --- man/systemd.exec.xml | 53 ++++++++++++++++----------- src/core/dbus-execute.c | 5 ++- src/core/execute.c | 33 ++++++++++++----- src/core/execute.h | 2 +- src/core/load-fragment-gperf.gperf.m4 | 1 + src/core/namespace.c | 44 +++++++++++++++++++++- src/core/namespace.h | 5 ++- src/core/unit.c | 6 +++ src/shared/bus-unit-util.c | 2 +- src/test/test-ns.c | 2 + 10 files changed, 117 insertions(+), 36 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index e594dc1b0c..09e78c6786 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -86,12 +86,10 @@ A few execution parameters result in additional, automatic dependencies to be added. - Units with WorkingDirectory= or - RootDirectory= set automatically gain - dependencies of type Requires= and - After= on all mount units required to access - the specified paths. This is equivalent to having them listed - explicitly in RequiresMountsFor=. + Units with WorkingDirectory=, RootDirectory= or + RootImage= set automatically gain dependencies of type Requires= and + After= on all mount units required to access the specified paths. This is equivalent to having + them listed explicitly in RequiresMountsFor=. Similar, units with PrivateTmp= enabled automatically get mount unit dependencies for all mounts required to access /tmp and /var/tmp. They will also gain an @@ -117,9 +115,10 @@ User= is used. If not set, defaults to the root directory when systemd is running as a system instance and the respective user's home directory if run as user. If the setting is prefixed with the - character, a missing working directory is not considered fatal. If - RootDirectory= is not set, then WorkingDirectory= is relative to the root - of the system running the service manager. Note that setting this parameter might result in additional - dependencies to be added to the unit (see above). + RootDirectory=/RootImage= is not set, then + WorkingDirectory= is relative to the root of the system running the service manager. Note + that setting this parameter might result in additional dependencies to be added to the unit (see + above). @@ -136,13 +135,24 @@ in conjunction with RootDirectory=. For details, see below. + + RootImage= + Takes a path to a block device node or regular file as argument. This call is similar to + RootDirectory= however mounts a file system hierarchy from a block device node or loopack + file instead of a directory. The device node or file system image file needs to contain a file system without a + partition table, or a file system within an MBR/MS-DOS or GPT partition table with only a single + Linux-compatible partition, or a set of file systems within a GPT partition table that follows the Discoverable Partitions + Specification. + + MountAPIVFS= Takes a boolean argument. If on, a private mount namespace for the unit's processes is created and the API file systems /proc, /sys and /dev will be mounted inside of it, unless they are already mounted. Note that this option has no effect unless used - in conjunction with RootDirectory= as these three mounts are generally mounted in the host + in conjunction with RootDirectory=/RootImage= as these three mounts are generally mounted in the host anyway, and unless the root directory is changed the private mount namespace will be a 1:1 copy of the host's, and include these three mounts. Note that the /dev file system of the host is bind mounted if this option is used without PrivateDevices=. To run the service with a private, minimal @@ -952,7 +962,7 @@ access a process might have to the file system hierarchy. Each setting takes a space-separated list of paths relative to the host's root directory (i.e. the system running the service manager). Note that if paths contain symlinks, they are resolved relative to the root directory set with - RootDirectory=. + RootDirectory=/RootImage=. Paths listed in ReadWritePaths= are accessible from within the namespace with the same access modes as from outside of it. Paths listed in ReadOnlyPaths= are accessible for @@ -971,9 +981,10 @@ Paths in ReadWritePaths=, ReadOnlyPaths= and InaccessiblePaths= may be prefixed with -, in which case they will be ignored when they do not exist. If prefixed with + the paths are taken relative to the root - directory of the unit, as configured with RootDirectory=, instead of relative to the root - directory of the host (see above). When combining - and + on the same - path make sure to specify - first, and + second. + directory of the unit, as configured with RootDirectory=/RootImage=, + instead of relative to the root directory of the host (see above). When combining - and + + on the same path make sure to specify - first, and + + second. Note that using this setting will disconnect propagation of mounts from the service to the host (propagation in the opposite direction continues to work). This means that this setting may not be used for @@ -1004,9 +1015,9 @@ that in this case both read-only and regular bind mounts are reset, regardless which of the two settings is used. - This option is particularly useful when RootDirectory= is used. In this case the - source path refers to a path on the host file system, while the destination path refers to a path below the - root directory of the unit. + This option is particularly useful when RootDirectory=/RootImage= + is used. In this case the source path refers to a path on the host file system, while the destination path + refers to a path below the root directory of the unit. @@ -1094,10 +1105,10 @@ such as CapabilityBoundingSet= will affect only the latter, and there's no way to acquire additional capabilities in the host's user namespace. Defaults to off. - This setting is particularly useful in conjunction with RootDirectory=, as the need to - synchronize the user and group databases in the root directory and on the host is reduced, as the only users - and groups who need to be matched are root, nobody and the unit's own - user and group. + This setting is particularly useful in conjunction with + RootDirectory=/RootImage=, as the need to synchronize the user and group + databases in the root directory and on the host is reduced, as the only users and groups who need to be matched + are root, nobody and the unit's own user and group. diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index c57af5aaaf..7df4cab3f6 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -758,6 +758,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("LimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("WorkingDirectory", "s", property_get_working_directory, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(ExecContext, root_directory), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootImage", "s", NULL, offsetof(ExecContext, root_image), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("OOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Nice", "i", property_get_nice, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("IOScheduling", "i", property_get_ioprio, 0, SD_BUS_VTABLE_PROPERTY_CONST), @@ -1048,7 +1049,7 @@ int bus_exec_context_set_transient_property( return 1; - } else if (STR_IN_SET(name, "TTYPath", "RootDirectory")) { + } else if (STR_IN_SET(name, "TTYPath", "RootDirectory", "RootImage")) { const char *s; r = sd_bus_message_read(message, "s", &s); @@ -1061,6 +1062,8 @@ int bus_exec_context_set_transient_property( if (mode != UNIT_CHECK) { if (streq(name, "TTYPath")) r = free_and_strdup(&c->tty_path, s); + else if (streq(name, "RootImage")) + r = free_and_strdup(&c->root_image, s); else { assert(streq(name, "RootDirectory")); r = free_and_strdup(&c->root_directory, s); diff --git a/src/core/execute.c b/src/core/execute.c index 54f6418c5a..f57eb26388 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1640,6 +1640,9 @@ static bool exec_needs_mount_namespace( assert(context); assert(params); + if (context->root_image) + return true; + if (!strv_isempty(context->read_write_paths) || !strv_isempty(context->read_only_paths) || !strv_isempty(context->inaccessible_paths)) @@ -1938,7 +1941,7 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context, int r; _cleanup_strv_free_ char **rw = NULL; char *tmp = NULL, *var = NULL; - const char *root_dir = NULL; + const char *root_dir = NULL, *root_image = NULL; NameSpaceInfo ns_info = { .ignore_protect_paths = false, .private_dev = context->private_devices, @@ -1965,8 +1968,12 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context, if (r < 0) return r; - if (params->flags & EXEC_APPLY_CHROOT) - root_dir = context->root_directory; + if (params->flags & EXEC_APPLY_CHROOT) { + root_image = context->root_image; + + if (!root_image) + root_dir = context->root_directory; + } /* * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed @@ -1976,7 +1983,8 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context, if (!context->dynamic_user && root_dir) ns_info.ignore_protect_paths = true; - r = setup_namespace(root_dir, &ns_info, rw, + r = setup_namespace(root_dir, root_image, + &ns_info, rw, context->read_only_paths, context->inaccessible_paths, context->bind_mounts, @@ -1985,7 +1993,8 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context, var, context->protect_home, context->protect_system, - context->mount_flags); + context->mount_flags, + DISSECT_IMAGE_DISCARD_ON_LOOP); /* If we couldn't set up the namespace this is probably due to a * missing capability. In this case, silently proceeed. */ @@ -1999,10 +2008,12 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context, return r; } -static int apply_working_directory(const ExecContext *context, - const ExecParameters *params, - const char *home, - const bool needs_mount_ns) { +static int apply_working_directory( + const ExecContext *context, + const ExecParameters *params, + const char *home, + const bool needs_mount_ns) { + const char *d; const char *wd; @@ -2983,6 +2994,7 @@ void exec_context_done(ExecContext *c) { c->working_directory = mfree(c->working_directory); c->root_directory = mfree(c->root_directory); + c->root_image = mfree(c->root_image); c->tty_path = mfree(c->tty_path); c->syslog_identifier = mfree(c->syslog_identifier); c->user = mfree(c->user); @@ -3320,6 +3332,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { prefix, yes_no(c->memory_deny_write_execute), prefix, yes_no(c->restrict_realtime)); + if (c->root_image) + fprintf(f, "%sRootImage: %s\n", prefix, c->root_image); + STRV_FOREACH(e, c->environment) fprintf(f, "%sEnvironment: %s\n", prefix, *e); diff --git a/src/core/execute.h b/src/core/execute.h index 6fd5a6e5ce..9f2b6fd39e 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -106,7 +106,7 @@ struct ExecContext { char **pass_environment; struct rlimit *rlimit[_RLIMIT_MAX]; - char *working_directory, *root_directory; + char *working_directory, *root_directory, *root_image; bool working_directory_missing_ok; bool working_directory_home; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 07f2a70c8f..cb9e6fea27 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -19,6 +19,7 @@ m4_dnl Define the context options only once m4_define(`EXEC_CONTEXT_CONFIG_ITEMS', `$1.WorkingDirectory, config_parse_working_directory, 0, offsetof($1, exec_context) $1.RootDirectory, config_parse_unit_path_printf, 0, offsetof($1, exec_context.root_directory) +$1.RootImage, config_parse_unit_path_printf, 0, offsetof($1, exec_context.root_image) $1.User, config_parse_user_group, 0, offsetof($1, exec_context.user) $1.Group, config_parse_user_group, 0, offsetof($1, exec_context.group) $1.SupplementaryGroups, config_parse_user_group_strv, 0, offsetof($1, exec_context.supplementary_groups) diff --git a/src/core/namespace.c b/src/core/namespace.c index 10917f7b70..0ae5f704c7 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -30,6 +30,7 @@ #include "dev-setup.h" #include "fd-util.h" #include "fs-util.h" +#include "loop-util.h" #include "loopback-setup.h" #include "missing.h" #include "mkdir.h" @@ -867,6 +868,7 @@ static unsigned namespace_calculate_mounts( int setup_namespace( const char* root_directory, + const char* root_image, const NameSpaceInfo *ns_info, char** read_write_paths, char** read_only_paths, @@ -877,16 +879,46 @@ int setup_namespace( const char* var_tmp_dir, ProtectHome protect_home, ProtectSystem protect_system, - unsigned long mount_flags) { + unsigned long mount_flags, + DissectImageFlags dissect_image_flags) { + _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; MountEntry *m, *mounts = NULL; bool make_slave = false; unsigned n_mounts; int r = 0; + assert(ns_info); + if (mount_flags == 0) mount_flags = MS_SHARED; + if (root_image) { + dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT; + + if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths)) + dissect_image_flags |= DISSECT_IMAGE_READ_ONLY; + + r = loop_device_make_by_path(root_image, + dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR, + &loop_device); + if (r < 0) + return r; + + r = dissect_image(loop_device->fd, NULL, 0, dissect_image_flags, &dissected_image); + if (r < 0) + return r; + + if (!root_directory) { + /* Create a mount point for the image, if it's still missing. We use the same mount point for + * all images, which is safe, since they all live in their own namespaces after all, and hence + * won't see each other. */ + root_directory = "/run/systemd/unit-root"; + (void) mkdir(root_directory, 0700); + } + } + n_mounts = namespace_calculate_mounts( ns_info, read_write_paths, @@ -1001,7 +1033,15 @@ int setup_namespace( } } - if (root_directory) { + if (root_image) { + r = dissected_image_mount(dissected_image, root_directory, dissect_image_flags); + if (r < 0) + goto finish; + + loop_device_relinquish(loop_device); + + } else if (root_directory) { + /* Turn directory into bind mount, if it isn't one yet */ r = path_is_mount_point(root_directory, NULL, AT_SYMLINK_FOLLOW); if (r < 0) diff --git a/src/core/namespace.h b/src/core/namespace.h index bb9de9857c..f54954bd86 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -25,6 +25,7 @@ typedef struct BindMount BindMount; #include +#include "dissect-image.h" #include "macro.h" typedef enum ProtectHome { @@ -63,6 +64,7 @@ struct BindMount { int setup_namespace( const char *root_directory, + const char *root_image, const NameSpaceInfo *ns_info, char **read_write_paths, char **read_only_paths, @@ -73,7 +75,8 @@ int setup_namespace( const char *var_tmp_dir, ProtectHome protect_home, ProtectSystem protect_system, - unsigned long mount_flags); + unsigned long mount_flags, + DissectImageFlags dissected_image_flags); int setup_tmp_dirs( const char *id, diff --git a/src/core/unit.c b/src/core/unit.c index 44f1d5e206..90d7eea956 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -862,6 +862,12 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) { return r; } + if (c->root_image) { + r = unit_require_mounts_for(u, c->root_image); + if (r < 0) + return r; + } + if (!MANAGER_IS_SYSTEM(u->manager)) return 0; diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index a4677bef27..20c1085697 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -266,7 +266,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen "StandardInput", "StandardOutput", "StandardError", "Description", "Slice", "Type", "WorkingDirectory", "RootDirectory", "SyslogIdentifier", "ProtectSystem", - "ProtectHome", "SELinuxContext", "Restart")) + "ProtectHome", "SELinuxContext", "Restart", "RootImage")) r = sd_bus_message_append(m, "v", "s", eq); else if (streq(field, "SyslogLevel")) { diff --git a/src/test/test-ns.c b/src/test/test-ns.c index c99bcb371b..0125d905a6 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -77,6 +77,7 @@ int main(int argc, char *argv[]) { log_info("Not chrooted"); r = setup_namespace(root_directory, + NULL, &ns_info, (char **) writable, (char **) readonly, @@ -86,6 +87,7 @@ int main(int argc, char *argv[]) { var_tmp_dir, PROTECT_HOME_NO, PROTECT_SYSTEM_NO, + 0, 0); if (r < 0) { log_error_errno(r, "Failed to setup namespace: %m"); From 78ebe98061eb527f17691929f470f262a7ab2c8f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Dec 2016 17:10:42 +0100 Subject: [PATCH 6/9] core,nspawn,dissect: make nspawn's .roothash file search reusable This makes nspawn's logic of automatically discovering the root hash of an image file generic, and then reuses it in systemd-dissect and in PID1's RootImage= logic, so that verity is automatically set up whenever we can. --- src/core/namespace.c | 17 +++++++++++- src/dissect/dissect.c | 8 ++++++ src/nspawn/nspawn.c | 57 +++++--------------------------------- src/shared/dissect-image.c | 51 ++++++++++++++++++++++++++++++++++ src/shared/dissect-image.h | 2 ++ 5 files changed, 84 insertions(+), 51 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index 0ae5f704c7..75dca5b791 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -883,8 +883,11 @@ int setup_namespace( DissectImageFlags dissect_image_flags) { _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; + _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL; _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; + _cleanup_free_ void *root_hash = NULL; MountEntry *m, *mounts = NULL; + size_t root_hash_size = 0; bool make_slave = false; unsigned n_mounts; int r = 0; @@ -906,7 +909,15 @@ int setup_namespace( if (r < 0) return r; - r = dissect_image(loop_device->fd, NULL, 0, dissect_image_flags, &dissected_image); + r = root_hash_load(root_image, &root_hash, &root_hash_size); + if (r < 0) + return r; + + r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image); + if (r < 0) + return r; + + r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image); if (r < 0) return r; @@ -1038,6 +1049,10 @@ int setup_namespace( if (r < 0) goto finish; + r = decrypted_image_relinquish(decrypted_image); + if (r < 0) + goto finish; + loop_device_relinquish(loop_device); } else if (root_directory) { diff --git a/src/dissect/dissect.c b/src/dissect/dissect.c index fd9db5ba87..59bd7d9e84 100644 --- a/src/dissect/dissect.c +++ b/src/dissect/dissect.c @@ -191,6 +191,14 @@ int main(int argc, char *argv[]) { goto finish; } + if (!arg_root_hash) { + r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size); + if (r < 0) { + log_error_errno(r, "Failed to read root hash file for %s: %m", arg_image); + goto finish; + } + } + r = dissect_image(d->fd, arg_root_hash, arg_root_hash_size, arg_flags, &m); if (r == -ENOPKG) { log_error_errno(r, "Couldn't identify a suitable partition table or file system in %s.", arg_image); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 5594b87efa..213f50f796 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -3480,53 +3480,6 @@ static int run(int master, return 1; /* loop again */ } -static int load_root_hash(const char *image) { - _cleanup_free_ char *text = NULL, *fn = NULL; - char *n, *e; - void *k; - size_t l; - int r; - - assert_se(image); - - /* Try to load the root hash from a file next to the image file if it exists. */ - - if (arg_root_hash) - return 0; - - fn = new(char, strlen(image) + strlen(".roothash") + 1); - if (!fn) - return log_oom(); - - n = stpcpy(fn, image); - e = endswith(fn, ".raw"); - if (e) - n = e; - - strcpy(n, ".roothash"); - - r = read_one_line_file(fn, &text); - if (r == -ENOENT) - return 0; - if (r < 0) { - log_warning_errno(r, "Failed to read %s, ignoring: %m", fn); - return 0; - } - - r = unhexmem(text, strlen(text), &k, &l); - if (r < 0) - return log_error_errno(r, "Invalid root hash: %s", text); - if (l < sizeof(sd_id128_t)) { - free(k); - return log_error_errno(r, "Root hash too short: %s", text); - } - - arg_root_hash = k; - arg_root_hash_size = l; - - return 0; -} - int main(int argc, char *argv[]) { _cleanup_free_ char *console = NULL; @@ -3742,9 +3695,13 @@ int main(int argc, char *argv[]) { goto finish; } - r = load_root_hash(arg_image); - if (r < 0) - goto finish; + if (!arg_root_hash) { + r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size); + if (r < 0) { + log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image); + goto finish; + } + } } if (!mkdtemp(tmprootdir)) { diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index 5fc2ce25f0..f3cd663602 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -28,8 +28,10 @@ #include "blkid-util.h" #include "dissect-image.h" #include "fd-util.h" +#include "fileio.h" #include "fs-util.h" #include "gpt.h" +#include "hexdecoct.h" #include "mount-util.h" #include "path-util.h" #include "stat-util.h" @@ -1087,6 +1089,55 @@ int decrypted_image_relinquish(DecryptedImage *d) { return 0; } +int root_hash_load(const char *image, void **ret, size_t *ret_size) { + _cleanup_free_ char *text = NULL; + _cleanup_free_ void *k = NULL; + char *fn, *e, *n; + size_t l; + int r; + + assert(image); + assert(ret); + assert(ret_size); + + if (is_device_path(image)) { + /* If we are asked to load the root hash for a device node, exit early */ + *ret = NULL; + *ret_size = 0; + return 0; + } + + fn = newa(char, strlen(image) + strlen(".roothash") + 1); + n = stpcpy(fn, image); + e = endswith(fn, ".raw"); + if (e) + n = e; + + strcpy(n, ".roothash"); + + r = read_one_line_file(fn, &text); + if (r == -ENOENT) { + *ret = NULL; + *ret_size = 0; + return 0; + } + if (r < 0) + return r; + + r = unhexmem(text, strlen(text), &k, &l); + if (r < 0) + return r; + if (l < sizeof(sd_id128_t)) + return -EINVAL; + + *ret = k; + *ret_size = l; + + k = NULL; + + return 1; +} + static const char *const partition_designator_table[] = { [PARTITION_ROOT] = "root", [PARTITION_ROOT_SECONDARY] = "root-secondary", diff --git a/src/shared/dissect-image.h b/src/shared/dissect-image.h index 26319bd8e7..cdb083be6f 100644 --- a/src/shared/dissect-image.h +++ b/src/shared/dissect-image.h @@ -94,3 +94,5 @@ int decrypted_image_relinquish(DecryptedImage *d); const char* partition_designator_to_string(int i) _const_; int partition_designator_from_string(const char *name) _pure_; + +int root_hash_load(const char *image, void **ret, size_t *ret_size); From 41488e1f7acf5f4b5e11ff992a05ee1baa537d54 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Dec 2016 17:38:12 +0100 Subject: [PATCH 7/9] dissect: try to read roothash value off user.verity.roothash xattr of image file This slightly extends the roothash loading logic to first check for a user.verity.roothash extended attribute on the image file. If it exists, it is used as Verity root hash and the ".roothash" file is not used. This should improve the chance that the roothash is retained when the file is moved around, as the data snippet is attached directly to the image file. The field is still detached from the file payload however, in order to make sure it may be trusted independently. This does not replace the ".roothash" file loading, it simply adds a second way to retrieve the data. Extended attributes are often a poor choice for storing metadata like this as it is usually difficult to discover for admins and users, and hard to fix if it ever gets out of sync. However, in this case I think it's safe as verity implies read-only access, and thus there's little chance of it to get out of sync. --- man/systemd-nspawn.xml | 10 +++++++--- src/shared/dissect-image.c | 36 ++++++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index f6b3f57fc7..b8cae62818 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -257,9 +257,13 @@ Takes a data integrity (dm-verity) root hash specified in hexadecimal. This option enables data integrity checks using dm-verity, if the used image contains the appropriate integrity data (see above). The specified hash must match the root hash of integrity data, and is usually at least 256bits (and hence 64 - hexadecimal characters) long (in case of SHA256 for example). If this option is not specified, but a file with - the .roothash suffix is found next to the image file, bearing otherwise the same name the - root hash is read from it and automatically used. + formatted hexadecimal characters) long (in case of SHA256 for example). If this option is not specified, but + the image file carries the user.verity.roothash extended file attribute (see xattr7), then the root + hash is read from it, also as formatted hexadecimal characters. If the extended file attribute is not found (or + not supported by the underlying file system), but a file with the .roothash suffix is + found next to the image file, bearing otherwise the same name the root hash is read from it and automatically + used (again, as formatted hexadecimal characters). diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index f3cd663602..66ddf3a872 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -40,6 +40,7 @@ #include "string-util.h" #include "strv.h" #include "udev-util.h" +#include "xattr-util.h" static int probe_filesystem(const char *node, char **ret_fstype) { #ifdef HAVE_BLKID @@ -1092,7 +1093,6 @@ int decrypted_image_relinquish(DecryptedImage *d) { int root_hash_load(const char *image, void **ret, size_t *ret_size) { _cleanup_free_ char *text = NULL; _cleanup_free_ void *k = NULL; - char *fn, *e, *n; size_t l; int r; @@ -1107,22 +1107,30 @@ int root_hash_load(const char *image, void **ret, size_t *ret_size) { return 0; } - fn = newa(char, strlen(image) + strlen(".roothash") + 1); - n = stpcpy(fn, image); - e = endswith(fn, ".raw"); - if (e) - n = e; + r = getxattr_malloc(image, "user.verity.roothash", &text, true); + if (r < 0) { + char *fn, *e, *n; - strcpy(n, ".roothash"); + if (!IN_SET(r, -ENODATA, -EOPNOTSUPP, -ENOENT)) + return r; - r = read_one_line_file(fn, &text); - if (r == -ENOENT) { - *ret = NULL; - *ret_size = 0; - return 0; + fn = newa(char, strlen(image) + strlen(".roothash") + 1); + n = stpcpy(fn, image); + e = endswith(fn, ".raw"); + if (e) + n = e; + + strcpy(n, ".roothash"); + + r = read_one_line_file(fn, &text); + if (r == -ENOENT) { + *ret = NULL; + *ret_size = 0; + return 0; + } + if (r < 0) + return r; } - if (r < 0) - return r; r = unhexmem(text, strlen(text), &k, &l); if (r < 0) From 08fe86d5bef3ac827880a9859b46a5d1363201a2 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Dec 2016 17:43:58 +0100 Subject: [PATCH 8/9] update TODO --- TODO | 4 ---- 1 file changed, 4 deletions(-) diff --git a/TODO b/TODO index 48f320093e..891d01b173 100644 --- a/TODO +++ b/TODO @@ -120,8 +120,6 @@ Features: * switch to ProtectSystem=strict for all our long-running services where that's possible -* If RootDirectory= is used, mount /proc, /sys, /dev into it, if not mounted yet - * Permit masking specific netlink APIs with RestrictAddressFamily= * nspawn: start UID allocation loop from hash of container name @@ -151,8 +149,6 @@ Features: * Add DataDirectory=, CacheDirectory= and LogDirectory= to match RuntimeDirectory=, and create it as necessary when starting a service, owned by the right user. -* Add RootImage= for mounting a disk image or file as root directory - * make sure the ratelimit object can deal with USEC_INFINITY as way to turn off things * journalctl: make sure -f ends when the container indicated by -M terminates From ef3116b5d4b9f12ae9f0fc25c8b40a04712c6d56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Wed, 8 Feb 2017 22:53:16 -0500 Subject: [PATCH 9/9] man: add more commas for clarify and reword a few sentences --- man/systemd-nspawn.xml | 8 ++++---- man/systemd.exec.xml | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index b8cae62818..a14992f0d9 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -256,14 +256,14 @@ Takes a data integrity (dm-verity) root hash specified in hexadecimal. This option enables data integrity checks using dm-verity, if the used image contains the appropriate integrity data (see above). The - specified hash must match the root hash of integrity data, and is usually at least 256bits (and hence 64 + specified hash must match the root hash of integrity data, and is usually at least 256 bits (and hence 64 formatted hexadecimal characters) long (in case of SHA256 for example). If this option is not specified, but the image file carries the user.verity.roothash extended file attribute (see xattr7), then the root hash is read from it, also as formatted hexadecimal characters. If the extended file attribute is not found (or - not supported by the underlying file system), but a file with the .roothash suffix is - found next to the image file, bearing otherwise the same name the root hash is read from it and automatically - used (again, as formatted hexadecimal characters). + is not supported by the underlying file system), but a file with the .roothash suffix is + found next to the image file, bearing otherwise the same name, the root hash is read from it and automatically + used, also as formatted hexadecimal characters. diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 09e78c6786..e95321f3c9 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -150,13 +150,13 @@ MountAPIVFS= Takes a boolean argument. If on, a private mount namespace for the unit's processes is created - and the API file systems /proc, /sys and /dev - will be mounted inside of it, unless they are already mounted. Note that this option has no effect unless used - in conjunction with RootDirectory=/RootImage= as these three mounts are generally mounted in the host - anyway, and unless the root directory is changed the private mount namespace will be a 1:1 copy of the host's, - and include these three mounts. Note that the /dev file system of the host is bind mounted - if this option is used without PrivateDevices=. To run the service with a private, minimal - version of /dev/, combine this option with + and the API file systems /proc, /sys, and /dev + are mounted inside of it, unless they are already mounted. Note that this option has no effect unless used in + conjunction with RootDirectory=/RootImage= as these three mounts are + generally mounted in the host anyway, and unless the root directory is changed, the private mount namespace + will be a 1:1 copy of the host's, and include these three mounts. Note that the /dev file + system of the host is bind mounted if this option is used without PrivateDevices=. To run + the service with a private, minimal version of /dev/, combine this option with PrivateDevices=.