diff --git a/TODO b/TODO
index 723292cde0..33afe39783 100644
--- a/TODO
+++ b/TODO
@@ -56,11 +56,10 @@ Features:
* ProtectKeyRing= to take keyring calls away
-* PrivateUsers= which maps the all user ids except root and the one specified
- in User= to nobody
-
* ProtectControlGroups= which mounts all of /sys/fs/cgroup read-only
+* RemoveKeyRing= to remove all keyring entries of the specified user
+
* Add DataDirectory=, CacheDirectory= and LogDirectory= to match
RuntimeDirectory=, and create it as necessary when starting a service, owned by the right user.
@@ -80,6 +79,11 @@ Features:
* expose the "privileged" flag of ExecCommand on the bus, and open it up to
transient units
+* in nss-systemd, if we run inside of RootDirectory= with PrivateUsers= set,
+ find a way to map the User=/Group= of the service to the right name. This way
+ a user/group for a service only has to exist on the host for the right
+ mapping to work.
+
* allow attaching additional journald log fields to cgroups
* rework fopen_temporary() to make use of open_tmpfile_linkable() (problem: the
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 0fc658f180..2495998295 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -107,36 +107,29 @@
WorkingDirectory=
- Takes a directory path relative to the service's root
- directory specified by RootDirectory=, or the
- special value ~. Sets the working directory
- for executed processes. If set to ~, the
- home directory of the user specified in
- User= is used. If not set, defaults to the
- root directory when systemd is running as a system instance
- and the respective user's home directory if run as user. If
- the setting is prefixed with the -
- character, a missing working directory is not considered
- fatal. If RootDirectory= is not set, then
- WorkingDirectory= is relative to the root of
- the system running the service manager.
- Note that setting this parameter might result in
- additional dependencies to be added to the unit (see
- above).
+ Takes a directory path relative to the service's root directory specified by
+ RootDirectory=, or the special value ~. Sets the working directory for
+ executed processes. If set to ~, the home directory of the user specified in
+ User= is used. If not set, defaults to the root directory when systemd is running as a
+ system instance and the respective user's home directory if run as user. If the setting is prefixed with the
+ - character, a missing working directory is not considered fatal. If
+ RootDirectory= is not set, then WorkingDirectory= is relative to the root
+ of the system running the service manager. Note that setting this parameter might result in additional
+ dependencies to be added to the unit (see above).
RootDirectory=
- Takes a directory path relative to the host's root directory
- (i.e. the root of the system running the service manager). Sets the
- root directory for executed processes, with the chroot2
- system call. If this is used, it must be ensured that the
- process binary and all its auxiliary files are available in
- the chroot() jail. Note that setting this
- parameter might result in additional dependencies to be added
- to the unit (see above).
+ Takes a directory path relative to the host's root directory (i.e. the root of the system
+ running the service manager). Sets the root directory for executed processes, with the chroot2 system
+ call. If this is used, it must be ensured that the process binary and all its auxiliary files are available in
+ the chroot() jail. Note that setting this parameter might result in additional
+ dependencies to be added to the unit (see above).
+
+ The PrivateUsers= setting is particularly useful in conjunction with
+ RootDirectory=. For details, see below.
@@ -998,6 +991,28 @@
accessible).
+
+ PrivateUsers=
+
+ Takes a boolean argument. If true, sets up a new user namespace for the executed processes and
+ configures a minimal user and group mapping, that maps the root user and group as well as
+ the unit's own user and group to themselves and everything else to the nobody user and
+ group. This is useful to securely detach the user and group databases used by the unit from the rest of the
+ system, and thus to create an effective sandbox environment. All files, directories, processes, IPC objects and
+ other resources owned by users/groups not equalling root or the unit's own will stay visible
+ from within the unit but appear owned by the nobody user and group. If this mode is enabled,
+ all unit processes are run without privileges in the host user namespace (regardless if the unit's own
+ user/group is root or not). Specifically this means that the process will have zero process
+ capabilities on the host's user namespace, but full capabilities within the service's user namespace. Settings
+ such as CapabilityBoundingSet= will affect only the latter, and there's no way to acquire
+ additional capabilities in the host's user namespace. Defaults to off.
+
+ This setting is particularly useful in conjunction with RootDirectory=, as the need to
+ synchronize the user and group databases in the root directory and on the host is reduced, as the only users
+ and groups who need to be matched are root, nobody and the unit's own
+ user and group.
+
+
ProtectSystem=
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 346c8b973e..e35d3ccd2e 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -705,8 +705,9 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
- SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectSystem", "s", bus_property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1068,7 +1069,7 @@ int bus_exec_context_set_transient_property(
} else if (STR_IN_SET(name,
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
- "PrivateTmp", "PrivateDevices", "PrivateNetwork",
+ "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
"RestrictRealtime", "DynamicUser")) {
int b;
@@ -1090,6 +1091,8 @@ int bus_exec_context_set_transient_property(
c->private_devices = b;
else if (streq(name, "PrivateNetwork"))
c->private_network = b;
+ else if (streq(name, "PrivateUsers"))
+ c->private_users = b;
else if (streq(name, "NoNewPrivileges"))
c->no_new_privileges = b;
else if (streq(name, "SyslogLevelPrefix"))
diff --git a/src/core/execute.c b/src/core/execute.c
index 7aafb1b058..6019df7ea6 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -25,6 +25,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -1552,6 +1553,159 @@ static bool exec_needs_mount_namespace(
return false;
}
+static int setup_private_users(uid_t uid, gid_t gid) {
+ _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
+ _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
+ _cleanup_close_ int unshare_ready_fd = -1;
+ _cleanup_(sigkill_waitp) pid_t pid = 0;
+ uint64_t c = 1;
+ siginfo_t si;
+ ssize_t n;
+ int r;
+
+ /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
+ * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
+ * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
+ * which waits for the parent to create the new user namespace while staying in the original namespace. The
+ * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
+ * continues execution normally. */
+
+ if (uid != 0 && uid_is_valid(uid))
+ asprintf(&uid_map,
+ "0 0 1\n" /* Map root → root */
+ UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
+ uid, uid); /* The case where the above is the same */
+ else
+ uid_map = strdup("0 0 1\n");
+ if (!uid_map)
+ return -ENOMEM;
+
+ if (gid != 0 && gid_is_valid(gid))
+ asprintf(&gid_map,
+ "0 0 1\n" /* Map root → root */
+ GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
+ gid, gid);
+ else
+ gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
+ if (!gid_map)
+ return -ENOMEM;
+
+ /* Create a communication channel so that the parent can tell the child when it finished creating the user
+ * namespace. */
+ unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
+ if (unshare_ready_fd < 0)
+ return -errno;
+
+ /* Create a communication channel so that the child can tell the parent a proper error code in case it
+ * failed. */
+ if (pipe2(errno_pipe, O_CLOEXEC) < 0)
+ return -errno;
+
+ pid = fork();
+ if (pid < 0)
+ return -errno;
+
+ if (pid == 0) {
+ _cleanup_close_ int fd = -1;
+ const char *a;
+ pid_t ppid;
+
+ /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
+ * here, after the parent opened its own user namespace. */
+
+ ppid = getppid();
+ errno_pipe[0] = safe_close(errno_pipe[0]);
+
+ /* Wait until the parent unshared the user namespace */
+ if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ /* Disable the setgroups() system call in the child user namespace, for good. */
+ a = procfs_file_alloca(ppid, "setgroups");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ if (errno != ENOENT) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ /* If the file is missing the kernel is too old, let's continue anyway. */
+ } else {
+ if (write(fd, "deny\n", 5) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ fd = safe_close(fd);
+ }
+
+ /* First write the GID map */
+ a = procfs_file_alloca(ppid, "gid_map");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ if (write(fd, gid_map, strlen(gid_map)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ fd = safe_close(fd);
+
+ /* The write the UID map */
+ a = procfs_file_alloca(ppid, "uid_map");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ if (write(fd, uid_map, strlen(uid_map)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ _exit(EXIT_SUCCESS);
+
+ child_fail:
+ (void) write(errno_pipe[1], &r, sizeof(r));
+ _exit(EXIT_FAILURE);
+ }
+
+ errno_pipe[1] = safe_close(errno_pipe[1]);
+
+ if (unshare(CLONE_NEWUSER) < 0)
+ return -errno;
+
+ /* Let the child know that the namespace is ready now */
+ if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
+ return -errno;
+
+ /* Try to read an error code from the child */
+ n = read(errno_pipe[0], &r, sizeof(r));
+ if (n < 0)
+ return -errno;
+ if (n == sizeof(r)) { /* an error code was sent to us */
+ if (r < 0)
+ return r;
+ return -EIO;
+ }
+ if (n != 0) /* on success we should have read 0 bytes */
+ return -EIO;
+
+ r = wait_for_terminate(pid, &si);
+ if (r < 0)
+ return r;
+ pid = 0;
+
+ /* If something strange happened with the child, let's consider this fatal, too */
+ if (si.si_code != CLD_EXITED || si.si_status != 0)
+ return -EIO;
+
+ return 0;
+}
+
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
assert(array);
assert(n);
@@ -2079,6 +2233,14 @@ static int exec_child(
}
#endif
+ if ((params->flags & EXEC_APPLY_PERMISSIONS) && context->private_users) {
+ r = setup_private_users(uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return r;
+ }
+ }
+
/* We repeat the fd closing here, to make sure that
* nothing is leaked from the PAM modules. Note that
* we are more aggressive this time since socket_fd
@@ -2640,8 +2802,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
"%sRootDirectory: %s\n"
"%sNonBlocking: %s\n"
"%sPrivateTmp: %s\n"
- "%sPrivateNetwork: %s\n"
"%sPrivateDevices: %s\n"
+ "%sPrivateNetwork: %s\n"
+ "%sPrivateUsers: %s\n"
"%sProtectHome: %s\n"
"%sProtectSystem: %s\n"
"%sIgnoreSIGPIPE: %s\n"
@@ -2652,8 +2815,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
prefix, c->root_directory ? c->root_directory : "/",
prefix, yes_no(c->non_blocking),
prefix, yes_no(c->private_tmp),
- prefix, yes_no(c->private_network),
prefix, yes_no(c->private_devices),
+ prefix, yes_no(c->private_network),
+ prefix, yes_no(c->private_users),
prefix, protect_home_to_string(c->protect_home),
prefix, protect_system_to_string(c->protect_system),
prefix, yes_no(c->ignore_sigpipe),
diff --git a/src/core/execute.h b/src/core/execute.h
index 2d05ca39aa..106154f81a 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -171,6 +171,7 @@ struct ExecContext {
bool private_tmp;
bool private_network;
bool private_devices;
+ bool private_users;
ProtectSystem protect_system;
ProtectHome protect_home;
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index 396f847213..251155b428 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -88,8 +88,9 @@ $1.ReadWritePaths, config_parse_namespace_path_strv, 0,
$1.ReadOnlyPaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_only_paths)
$1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths)
$1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp)
-$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
$1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices)
+$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
+$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users)
$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context)
$1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context)
$1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context)
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index c3a5233532..7774d607c7 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -202,7 +202,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
"CPUAccounting", "MemoryAccounting", "IOAccounting", "BlockIOAccounting", "TasksAccounting",
"SendSIGHUP", "SendSIGKILL", "WakeSystem", "DefaultDependencies",
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
- "PrivateTmp", "PrivateDevices", "PrivateNetwork", "NoNewPrivileges",
+ "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
"SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
"RestrictRealtime", "DynamicUser")) {