diff --git a/TODO b/TODO index 723292cde0..33afe39783 100644 --- a/TODO +++ b/TODO @@ -56,11 +56,10 @@ Features: * ProtectKeyRing= to take keyring calls away -* PrivateUsers= which maps the all user ids except root and the one specified - in User= to nobody - * ProtectControlGroups= which mounts all of /sys/fs/cgroup read-only +* RemoveKeyRing= to remove all keyring entries of the specified user + * Add DataDirectory=, CacheDirectory= and LogDirectory= to match RuntimeDirectory=, and create it as necessary when starting a service, owned by the right user. @@ -80,6 +79,11 @@ Features: * expose the "privileged" flag of ExecCommand on the bus, and open it up to transient units +* in nss-systemd, if we run inside of RootDirectory= with PrivateUsers= set, + find a way to map the User=/Group= of the service to the right name. This way + a user/group for a service only has to exist on the host for the right + mapping to work. + * allow attaching additional journald log fields to cgroups * rework fopen_temporary() to make use of open_tmpfile_linkable() (problem: the diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 0fc658f180..2495998295 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -107,36 +107,29 @@ WorkingDirectory= - Takes a directory path relative to the service's root - directory specified by RootDirectory=, or the - special value ~. Sets the working directory - for executed processes. If set to ~, the - home directory of the user specified in - User= is used. If not set, defaults to the - root directory when systemd is running as a system instance - and the respective user's home directory if run as user. If - the setting is prefixed with the - - character, a missing working directory is not considered - fatal. If RootDirectory= is not set, then - WorkingDirectory= is relative to the root of - the system running the service manager. - Note that setting this parameter might result in - additional dependencies to be added to the unit (see - above). + Takes a directory path relative to the service's root directory specified by + RootDirectory=, or the special value ~. Sets the working directory for + executed processes. If set to ~, the home directory of the user specified in + User= is used. If not set, defaults to the root directory when systemd is running as a + system instance and the respective user's home directory if run as user. If the setting is prefixed with the + - character, a missing working directory is not considered fatal. If + RootDirectory= is not set, then WorkingDirectory= is relative to the root + of the system running the service manager. Note that setting this parameter might result in additional + dependencies to be added to the unit (see above). RootDirectory= - Takes a directory path relative to the host's root directory - (i.e. the root of the system running the service manager). Sets the - root directory for executed processes, with the chroot2 - system call. If this is used, it must be ensured that the - process binary and all its auxiliary files are available in - the chroot() jail. Note that setting this - parameter might result in additional dependencies to be added - to the unit (see above). + Takes a directory path relative to the host's root directory (i.e. the root of the system + running the service manager). Sets the root directory for executed processes, with the chroot2 system + call. If this is used, it must be ensured that the process binary and all its auxiliary files are available in + the chroot() jail. Note that setting this parameter might result in additional + dependencies to be added to the unit (see above). + + The PrivateUsers= setting is particularly useful in conjunction with + RootDirectory=. For details, see below. @@ -998,6 +991,28 @@ accessible). + + PrivateUsers= + + Takes a boolean argument. If true, sets up a new user namespace for the executed processes and + configures a minimal user and group mapping, that maps the root user and group as well as + the unit's own user and group to themselves and everything else to the nobody user and + group. This is useful to securely detach the user and group databases used by the unit from the rest of the + system, and thus to create an effective sandbox environment. All files, directories, processes, IPC objects and + other resources owned by users/groups not equalling root or the unit's own will stay visible + from within the unit but appear owned by the nobody user and group. If this mode is enabled, + all unit processes are run without privileges in the host user namespace (regardless if the unit's own + user/group is root or not). Specifically this means that the process will have zero process + capabilities on the host's user namespace, but full capabilities within the service's user namespace. Settings + such as CapabilityBoundingSet= will affect only the latter, and there's no way to acquire + additional capabilities in the host's user namespace. Defaults to off. + + This setting is particularly useful in conjunction with RootDirectory=, as the need to + synchronize the user and group databases in the root directory and on the host is reduced, as the only users + and groups who need to be matched are root, nobody and the unit's own + user and group. + + ProtectSystem= diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 346c8b973e..e35d3ccd2e 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -705,8 +705,9 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), - SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectSystem", "s", bus_property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1068,7 +1069,7 @@ int bus_exec_context_set_transient_property( } else if (STR_IN_SET(name, "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", - "PrivateTmp", "PrivateDevices", "PrivateNetwork", + "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser")) { int b; @@ -1090,6 +1091,8 @@ int bus_exec_context_set_transient_property( c->private_devices = b; else if (streq(name, "PrivateNetwork")) c->private_network = b; + else if (streq(name, "PrivateUsers")) + c->private_users = b; else if (streq(name, "NoNewPrivileges")) c->no_new_privileges = b; else if (streq(name, "SyslogLevelPrefix")) diff --git a/src/core/execute.c b/src/core/execute.c index 7aafb1b058..6019df7ea6 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -1552,6 +1553,159 @@ static bool exec_needs_mount_namespace( return false; } +static int setup_private_users(uid_t uid, gid_t gid) { + _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; + _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 }; + _cleanup_close_ int unshare_ready_fd = -1; + _cleanup_(sigkill_waitp) pid_t pid = 0; + uint64_t c = 1; + siginfo_t si; + ssize_t n; + int r; + + /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to + * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which + * we however lack after opening the user namespace. To work around this we fork() a temporary child process, + * which waits for the parent to create the new user namespace while staying in the original namespace. The + * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and + * continues execution normally. */ + + if (uid != 0 && uid_is_valid(uid)) + asprintf(&uid_map, + "0 0 1\n" /* Map root → root */ + UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */ + uid, uid); /* The case where the above is the same */ + else + uid_map = strdup("0 0 1\n"); + if (!uid_map) + return -ENOMEM; + + if (gid != 0 && gid_is_valid(gid)) + asprintf(&gid_map, + "0 0 1\n" /* Map root → root */ + GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */ + gid, gid); + else + gid_map = strdup("0 0 1\n"); /* The case where the above is the same */ + if (!gid_map) + return -ENOMEM; + + /* Create a communication channel so that the parent can tell the child when it finished creating the user + * namespace. */ + unshare_ready_fd = eventfd(0, EFD_CLOEXEC); + if (unshare_ready_fd < 0) + return -errno; + + /* Create a communication channel so that the child can tell the parent a proper error code in case it + * failed. */ + if (pipe2(errno_pipe, O_CLOEXEC) < 0) + return -errno; + + pid = fork(); + if (pid < 0) + return -errno; + + if (pid == 0) { + _cleanup_close_ int fd = -1; + const char *a; + pid_t ppid; + + /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from + * here, after the parent opened its own user namespace. */ + + ppid = getppid(); + errno_pipe[0] = safe_close(errno_pipe[0]); + + /* Wait until the parent unshared the user namespace */ + if (read(unshare_ready_fd, &c, sizeof(c)) < 0) { + r = -errno; + goto child_fail; + } + + /* Disable the setgroups() system call in the child user namespace, for good. */ + a = procfs_file_alloca(ppid, "setgroups"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + if (errno != ENOENT) { + r = -errno; + goto child_fail; + } + + /* If the file is missing the kernel is too old, let's continue anyway. */ + } else { + if (write(fd, "deny\n", 5) < 0) { + r = -errno; + goto child_fail; + } + + fd = safe_close(fd); + } + + /* First write the GID map */ + a = procfs_file_alloca(ppid, "gid_map"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto child_fail; + } + if (write(fd, gid_map, strlen(gid_map)) < 0) { + r = -errno; + goto child_fail; + } + fd = safe_close(fd); + + /* The write the UID map */ + a = procfs_file_alloca(ppid, "uid_map"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto child_fail; + } + if (write(fd, uid_map, strlen(uid_map)) < 0) { + r = -errno; + goto child_fail; + } + + _exit(EXIT_SUCCESS); + + child_fail: + (void) write(errno_pipe[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + errno_pipe[1] = safe_close(errno_pipe[1]); + + if (unshare(CLONE_NEWUSER) < 0) + return -errno; + + /* Let the child know that the namespace is ready now */ + if (write(unshare_ready_fd, &c, sizeof(c)) < 0) + return -errno; + + /* Try to read an error code from the child */ + n = read(errno_pipe[0], &r, sizeof(r)); + if (n < 0) + return -errno; + if (n == sizeof(r)) { /* an error code was sent to us */ + if (r < 0) + return r; + return -EIO; + } + if (n != 0) /* on success we should have read 0 bytes */ + return -EIO; + + r = wait_for_terminate(pid, &si); + if (r < 0) + return r; + pid = 0; + + /* If something strange happened with the child, let's consider this fatal, too */ + if (si.si_code != CLD_EXITED || si.si_status != 0) + return -EIO; + + return 0; +} + static void append_socket_pair(int *array, unsigned *n, int pair[2]) { assert(array); assert(n); @@ -2079,6 +2233,14 @@ static int exec_child( } #endif + if ((params->flags & EXEC_APPLY_PERMISSIONS) && context->private_users) { + r = setup_private_users(uid, gid); + if (r < 0) { + *exit_status = EXIT_USER; + return r; + } + } + /* We repeat the fd closing here, to make sure that * nothing is leaked from the PAM modules. Note that * we are more aggressive this time since socket_fd @@ -2640,8 +2802,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { "%sRootDirectory: %s\n" "%sNonBlocking: %s\n" "%sPrivateTmp: %s\n" - "%sPrivateNetwork: %s\n" "%sPrivateDevices: %s\n" + "%sPrivateNetwork: %s\n" + "%sPrivateUsers: %s\n" "%sProtectHome: %s\n" "%sProtectSystem: %s\n" "%sIgnoreSIGPIPE: %s\n" @@ -2652,8 +2815,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { prefix, c->root_directory ? c->root_directory : "/", prefix, yes_no(c->non_blocking), prefix, yes_no(c->private_tmp), - prefix, yes_no(c->private_network), prefix, yes_no(c->private_devices), + prefix, yes_no(c->private_network), + prefix, yes_no(c->private_users), prefix, protect_home_to_string(c->protect_home), prefix, protect_system_to_string(c->protect_system), prefix, yes_no(c->ignore_sigpipe), diff --git a/src/core/execute.h b/src/core/execute.h index 2d05ca39aa..106154f81a 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -171,6 +171,7 @@ struct ExecContext { bool private_tmp; bool private_network; bool private_devices; + bool private_users; ProtectSystem protect_system; ProtectHome protect_home; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 396f847213..251155b428 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -88,8 +88,9 @@ $1.ReadWritePaths, config_parse_namespace_path_strv, 0, $1.ReadOnlyPaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_only_paths) $1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths) $1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp) -$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) $1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices) +$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) +$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) $1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context) $1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context) $1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context) diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index c3a5233532..7774d607c7 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -202,7 +202,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen "CPUAccounting", "MemoryAccounting", "IOAccounting", "BlockIOAccounting", "TasksAccounting", "SendSIGHUP", "SendSIGKILL", "WakeSystem", "DefaultDependencies", "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit", - "PrivateTmp", "PrivateDevices", "PrivateNetwork", "NoNewPrivileges", + "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser")) {