core: add new bus call for migrating foreign processes to scope/service units

This adds a new bus call to service and scope units called
AttachProcesses() that moves arbitrary processes into the cgroup of the
unit. The primary user for this new API is systemd itself: the systemd
--user instance uses this call on the systemd --system instance to
migrate processes when it is itself asked to migrate processes and
the kernel refuses this due to access restrictions.

The primary use-case of this is to make "systemd-run --scope --user …"
invoked from user session scopes work correctly on pure cgroupsv2
environments. There, the kernel refuses to migrate processes between two
unprivileged-owned cgroups unless the requestor as well as the ownership
of the closest parent cgroup all match. This however is not the case
between the session-XYZ.scope unit of a login session and the
user@ABC.service of the systemd --user instance.

The new logic always tries to move the processes on its own, but if
that doesn't work when being the user manager, then the system manager
is asked to do it instead.

The new operation is relatively restrictive: it will only allow moving
the processes like this if the caller is root, or the UID of the target
unit, caller and process all match. Note that this means that
unprivileged users cannot attach processes to scope units, as those do
not have "owning" users (i.e. they have no User= field).

Fixes: #3388
This commit is contained in:
Lennart Poettering 2018-02-07 22:52:52 +01:00
parent 931e47547d
commit 6592b9759c
10 changed files with 346 additions and 17 deletions

View File

@ -24,6 +24,7 @@
#include "alloc-util.h"
#include "blockdev-util.h"
#include "bpf-firewall.h"
#include "bus-error.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fd-util.h"
@ -1303,13 +1304,12 @@ void unit_update_cgroup_members_masks(Unit *u) {
}
}
static const char *migrate_callback(CGroupMask mask, void *userdata) {
Unit *u = userdata;
const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
assert(mask != 0);
assert(u);
/* Returns the realized cgroup path of the specified unit where all specified controllers are available. */
while (u) {
if (u->cgroup_path &&
u->cgroup_realized &&
(u->cgroup_realized_mask & mask) == mask)
@ -1321,6 +1321,10 @@ static const char *migrate_callback(CGroupMask mask, void *userdata) {
return NULL;
}
/* Callback adapter: interprets userdata as the Unit and forwards to unit_get_realized_cgroup_path(). */
static const char *migrate_callback(CGroupMask mask, void *userdata) {
        Unit *unit = userdata;

        return unit_get_realized_cgroup_path(unit, mask);
}
char *unit_default_cgroup_path(Unit *u) {
_cleanup_free_ char *escaped = NULL, *slice = NULL;
int r;
@ -1503,19 +1507,142 @@ static int unit_create_cgroup(
return 0;
}
int unit_attach_pids_to_cgroup(Unit *u) {
/* Asks the service manager on the system bus to move a single process into this unit's cgroup, optionally
 * below suffix_path, by calling AttachProcessesToUnit() on org.freedesktop.systemd1.
 *
 * Returns 0 on success. Fails with -EINVAL if we are the system manager ourselves, if the unit has no
 * cgroup path, or if that path does not start with our cgroup root; fails with -EIO if there is no system
 * bus connection. If the bus call fails, the failure is logged at debug level and the result of the log
 * call is returned. */
static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
        _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
        char *relative, *target;
        int k;

        assert(u);

        if (MANAGER_IS_SYSTEM(u->manager))
                return -EINVAL;
        if (!u->manager->system_bus)
                return -EIO;
        if (!u->cgroup_path)
                return -EINVAL;

        /* Strip our own cgroup root off the unit's cgroup path, so that we get the path relative to it */
        relative = path_startswith(u->cgroup_path, u->manager->cgroup_root);
        if (!relative)
                return -EINVAL;

        /* Build the absolute-looking path we pass along, and collapse any duplicate slashes in it */
        target = strjoina("/", relative, suffix_path);
        path_kill_slashes(target);

        k = sd_bus_call_method(
                        u->manager->system_bus,
                        "org.freedesktop.systemd1",
                        "/org/freedesktop/systemd1",
                        "org.freedesktop.systemd1.Manager",
                        "AttachProcessesToUnit",
                        &bus_error,
                        NULL,
                        "ssau",
                        NULL /* empty unit name means client's unit, i.e. us */,
                        target,
                        1,
                        (uint32_t) pid);
        if (k >= 0)
                return 0;

        return log_unit_debug_errno(u, k, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&bus_error, k));
}
/* Moves all processes listed in 'pids' into the cgroup of unit 'u', optionally into the sub-cgroup
 * 'suffix_path' below it.
 *
 * Returns 0 if the set is empty or all processes were moved in the main hierarchy, -EINVAL if the unit has
 * no cgroup context, or a negative errno otherwise. Failures to move individual processes in the main
 * hierarchy do not abort the loop; the first such error is remembered and returned at the end. Failures in
 * the per-controller legacy hierarchies are only logged. */
int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
        CGroupMask delegated_mask;
        const char *p;
        Iterator i;
        void *pidp;
        int r, q;

        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        if (set_isempty(pids))
                return 0;

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        /* Determine the cgroup path the processes shall end up in */
        if (isempty(suffix_path))
                p = u->cgroup_path;
        else
                p = strjoina(u->cgroup_path, "/", suffix_path);

        delegated_mask = unit_get_delegate_mask(u);

        r = 0;
        SET_FOREACH(pidp, pids, i) {
                pid_t pid = PTR_TO_PID(pidp);
                CGroupController c;

                /* First, attach the PID to the main cgroup hierarchy */
                q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid);
                if (q < 0) {
                        log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p);

                        if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) {
                                int z;

                                /* If we are in a user instance, and we can't move the process ourselves due to
                                 * permission problems, let's ask the system instance about it instead. Since it's more
                                 * privileged it might be able to move the process across the leaves of a subtree whose
                                 * top node is not owned by us. */

                                z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
                                if (z < 0)
                                        log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p);
                                else
                                        continue; /* When attaching via the bus worked we are fully done for this PID. */
                        }

                        if (r >= 0)
                                r = q; /* Remember first error */

                        continue;
                }

                /* On the unified hierarchy there is only one tree, hence nothing more to do for this PID */
                q = cg_all_unified();
                if (q < 0)
                        return q;
                if (q > 0)
                        continue;

                /* In the legacy hierarchy, attach the process to the requested cgroup if possible, and if not to the
                 * innermost realized one */

                for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                        const char *realized;

                        if (!(u->manager->cgroup_supported & bit))
                                continue;

                        /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */
                        if (delegated_mask & u->cgroup_realized_mask & bit) {
                                q = cg_attach(cgroup_controller_to_string(c), p, pid);
                                if (q >= 0)
                                        continue; /* Success! */

                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
                                                     pid, p, cgroup_controller_to_string(c));
                        }

                        /* So this controller is either not delegated or not realized, or something else weird
                         * happened. In that case let's attach the PID at least to the closest cgroup up the tree
                         * that is realized. */
                        realized = unit_get_realized_cgroup_path(u, bit);
                        if (!realized)
                                continue; /* Not even realized in the root slice? Then let's not bother */

                        q = cg_attach(cgroup_controller_to_string(c), realized, pid);
                        if (q < 0)
                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
                                                     pid, realized, cgroup_controller_to_string(c));
                }
        }

        return r;
}
static void cgroup_xattr_apply(Unit *u) {

View File

@ -167,6 +167,7 @@ bool unit_get_needs_bpf(Unit *u);
void unit_update_cgroup_members_masks(Unit *u);
const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
char *unit_default_cgroup_path(Unit *u);
int unit_set_cgroup_path(Unit *u, const char *path);
int unit_pick_cgroup_path(Unit *u);
@ -178,7 +179,7 @@ int unit_watch_cgroup(Unit *u);
void unit_add_to_cgroup_empty_queue(Unit *u);
int unit_attach_pids_to_cgroup(Unit *u);
int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
int manager_setup_cgroup(Manager *m);
void manager_shutdown_cgroup(Manager *m, bool delete);

View File

@ -863,6 +863,26 @@ static int method_get_unit_processes(sd_bus_message *message, void *userdata, sd
return bus_unit_method_get_processes(message, u, error);
}
/* Manager-level bus method AttachProcessesToUnit(): reads the unit name from the message, resolves it to a
 * unit via bus_get_unit_by_name() and hands the rest of the message to bus_unit_method_attach_processes(). */
static int method_attach_processes_to_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
        Manager *manager = userdata;
        const char *unit_name;
        Unit *unit;
        int k;

        assert(message);
        assert(manager);

        /* First argument: the name of the unit to attach the processes to */
        k = sd_bus_message_read(message, "s", &unit_name);
        if (k < 0)
                return k;

        k = bus_get_unit_by_name(manager, message, unit_name, &unit, error);
        if (k < 0)
                return k;

        /* The remaining arguments (cgroup sub-path and PID array) are parsed by the unit-level handler */
        return bus_unit_method_attach_processes(message, unit, error);
}
static int transient_unit_from_message(
Manager *m,
sd_bus_message *message,
@ -2504,6 +2524,7 @@ const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_METHOD("UnrefUnit", "s", NULL, method_unref_unit, SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD("StartTransientUnit", "ssa(sv)a(sa(sv))", "o", method_start_transient_unit, SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD("GetUnitProcesses", "s", "a(sus)", method_get_unit_processes, SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD("AttachProcessesToUnit", "ssau", NULL, method_attach_processes_to_unit, SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD("GetJob", "u", "o", method_get_job, SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD("GetJobAfter", "u", "a(usssoo)", method_get_job_waiting, SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD("GetJobBefore", "u", "a(usssoo)", method_get_job_waiting, SD_BUS_VTABLE_UNPRIVILEGED),

View File

@ -89,17 +89,39 @@ static int bus_scope_set_transient_property(
return bus_set_transient_usec(UNIT(s), name, &s->timeout_stop_usec, message, flags, error);
if (streq(name, "PIDs")) {
_cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
unsigned n = 0;
uint32_t pid;
r = sd_bus_message_enter_container(message, 'a', "u");
if (r < 0)
return r;
while ((r = sd_bus_message_read(message, "u", &pid)) > 0) {
for (;;) {
uint32_t upid;
pid_t pid;
if (pid <= 1)
return -EINVAL;
r = sd_bus_message_read(message, "u", &upid);
if (r < 0)
return r;
if (r == 0)
break;
if (upid == 0) {
if (!creds) {
r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
if (r < 0)
return r;
}
r = sd_bus_creds_get_pid(creds, &pid);
if (r < 0)
return r;
} else
pid = (uid_t) upid;
r = unit_pid_attachable(UNIT(s), pid, error);
if (r < 0)
return r;
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
r = unit_watch_pid(UNIT(s), pid);
@ -109,8 +131,6 @@ static int bus_scope_set_transient_property(
n++;
}
if (r < 0)
return r;
r = sd_bus_message_exit_container(message);
if (r < 0)

View File

@ -1127,6 +1127,118 @@ static int property_get_ip_counter(
return sd_bus_message_append(reply, "t", value);
}
/* Bus method AttachProcesses(): takes a cgroup sub-path ("s") and an array of PIDs ("au") and migrates the
 * listed processes into the cgroup of the unit passed as userdata.
 *
 * A PID of 0 refers to the calling process. Each PID is validated with unit_pid_attachable(); unless the
 * sender is root or runs under our own UID, the process must be owned by the sender's UID and by the UID
 * referenced by the unit. Returns the result of sending the (empty) method reply on success, or a negative
 * errno / bus error otherwise. */
int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) {
        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
        _cleanup_(set_freep) Set *pids = NULL;
        Unit *u = userdata;
        const char *path;
        int r;

        assert(message);

        /* This migrates the processes with the specified PIDs into the cgroup of this unit, optionally below a
         * specified cgroup path. Obviously this only works for units that actually maintain a cgroup
         * representation. If a process is already in the cgroup no operation is executed – in this case the
         * specified subcgroup path has no effect! */

        r = mac_selinux_unit_access_check(u, message, "start", error);
        if (r < 0)
                return r;

        r = sd_bus_message_read(message, "s", &path);
        if (r < 0)
                return r;

        /* An empty path means "the unit's own cgroup"; anything else must be absolute and normalized */
        path = empty_to_null(path);
        if (path) {
                if (!path_is_absolute(path))
                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path);

                if (!path_is_normalized(path))
                        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path);
        }

        if (!unit_cgroup_delegate(u))
                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process migration not available on non-delegated units.");

        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u)))
                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not active, refusing.");

        r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID, &creds);
        if (r < 0)
                return r;

        r = sd_bus_message_enter_container(message, 'a', "u");
        if (r < 0)
                return r;

        for (;;) {
                uid_t process_uid, sender_uid;
                uint32_t upid;
                pid_t pid;

                r = sd_bus_message_read(message, "u", &upid);
                if (r < 0)
                        return r;
                if (r == 0) /* End of array */
                        break;

                if (upid == 0) {
                        /* PID 0 is shorthand for the caller's own PID */
                        r = sd_bus_creds_get_pid(creds, &pid);
                        if (r < 0)
                                return r;
                } else
                        pid = (pid_t) upid; /* This is a process ID, hence convert to pid_t, not uid_t */

                /* Filter out duplicates */
                if (set_contains(pids, PID_TO_PTR(pid)))
                        continue;

                /* Check if this process is suitable for attaching to this unit */
                r = unit_pid_attachable(u, pid, error);
                if (r < 0)
                        return r;

                /* Let's query the sender's UID, so that we can make our security decisions */
                r = sd_bus_creds_get_euid(creds, &sender_uid);
                if (r < 0)
                        return r;

                /* Let's validate security: if the sender is root or runs under our own UID, then all is OK.
                 * If the sender is any other user, then the process' UID and the target unit's UID have to
                 * match the sender's UID */
                if (sender_uid != 0 && sender_uid != getuid()) {
                        r = get_process_uid(pid, &process_uid);
                        if (r < 0)
                                return sd_bus_error_set_errnof(error, r, "Failed to retrieve process UID: %m");

                        if (process_uid != sender_uid)
                                return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pid);
                        if (process_uid != u->ref_uid)
                                return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pid);
                }

                /* Allocate the set lazily, on the first PID that passed all checks */
                if (!pids) {
                        pids = set_new(NULL);
                        if (!pids)
                                return -ENOMEM;
                }

                r = set_put(pids, PID_TO_PTR(pid));
                if (r < 0)
                        return r;
        }

        r = sd_bus_message_exit_container(message);
        if (r < 0)
                return r;

        r = unit_attach_pids_to_cgroup(u, pids, path);
        if (r < 0)
                return sd_bus_error_set_errnof(error, r, "Failed to attach processes to control group: %m");

        return sd_bus_reply_method_return(message, NULL);
}
const sd_bus_vtable bus_unit_cgroup_vtable[] = {
SD_BUS_VTABLE_START(0),
SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0),
@ -1139,6 +1251,7 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = {
SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0),
SD_BUS_PROPERTY("IPEgressPackets", "t", property_get_ip_counter, 0, 0),
SD_BUS_METHOD("GetProcesses", NULL, "a(sus)", bus_unit_method_get_processes, SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD("AttachProcesses", "sau", NULL, bus_unit_method_attach_processes, SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_VTABLE_END
};

View File

@ -37,6 +37,7 @@ int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus
int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags flags, bool commit, sd_bus_error *error);
int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error);

View File

@ -234,6 +234,10 @@
send_interface="org.freedesktop.systemd1.Manager"
send_member="StartTransientUnit"/>
<allow send_destination="org.freedesktop.systemd1"
send_interface="org.freedesktop.systemd1.Manager"
send_member="AttachProcessesToUnit"/>
<allow send_destination="org.freedesktop.systemd1"
send_interface="org.freedesktop.systemd1.Manager"
send_member="CancelJob"/>
@ -366,6 +370,18 @@
send_interface="org.freedesktop.systemd1.Unit"
send_member="Unref"/>
<!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Service interface -->
<allow send_destination="org.freedesktop.systemd1"
send_interface="org.freedesktop.systemd1.Service"
send_member="AttachProcesses"/>
<!-- Managed via polkit or other criteria: org.freedesktop.systemd1.Scope interface -->
<allow send_destination="org.freedesktop.systemd1"
send_interface="org.freedesktop.systemd1.Scope"
send_member="AttachProcesses"/>
<allow receive_sender="org.freedesktop.systemd1"/>
</policy>

View File

@ -343,7 +343,7 @@ static int scope_start(Unit *u) {
unit_export_state_files(UNIT(s));
r = unit_attach_pids_to_cgroup(u);
r = unit_attach_pids_to_cgroup(u, UNIT(s)->pids, NULL);
if (r < 0) {
log_unit_warning_errno(UNIT(s), r, "Failed to add PIDs to scope's control group: %m");
scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);

View File

@ -5362,6 +5362,34 @@ const char *unit_label_path(Unit *u) {
return p;
}
/* Checks whether the specified PID is generally good for attaching to a unit, i.e. a valid PID, not PID 1 or
 * our own process, and not a kernel thread either.
 *
 * Returns 0 if the PID is acceptable. Otherwise sets 'error' and returns the result of the respective
 * sd_bus_error_setf()/sd_bus_error_set_errnof() call. */
int unit_pid_attachable(Unit *u, pid_t pid, sd_bus_error *error) {
        int r;

        assert(u);

        /* First, a simple range check */
        if (!pid_is_valid(pid))
                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process identifier " PID_FMT " is not valid.", pid);

        /* Some extra safety check */
        if (pid == 1 || pid == getpid_cached())
                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a manager process, refusing.", pid);

        /* Don't even begin to bother with kernel threads */
        r = is_kernel_thread(pid);
        if (r == -ESRCH)
                return sd_bus_error_setf(error, SD_BUS_ERROR_UNIX_PROCESS_ID_UNKNOWN, "Process with ID " PID_FMT " does not exist.", pid);
        if (r < 0)
                return sd_bus_error_set_errnof(error, r, "Failed to determine whether process " PID_FMT " is a kernel thread: %m", pid);
        if (r > 0)
                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a kernel thread, refusing.", pid);

        return 0;
}
static const char* const collect_mode_table[_COLLECT_MODE_MAX] = {
[COLLECT_INACTIVE] = "inactive",
[COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed",

View File

@ -806,6 +806,8 @@ bool unit_needs_console(Unit *u);
const char *unit_label_path(Unit *u);
int unit_pid_attachable(Unit *unit, pid_t pid, sd_bus_error *error);
/* Macros which append UNIT= or USER_UNIT= to the message */
#define log_unit_full(unit, level, error, ...) \