core: rework how we track which PIDs to watch for a unit

Previously, we'd maintain two hashmaps keyed by PIDs, pointing to Unit
interested in SIGCHLD events for them. This scheme allowed a specific
PID to be watched by exactly 0, 1 or 2 units.

With this rework this is replaced by a single hashmap which is primarily
keyed by the PID and points to a Unit interested in it. However, it
optionally also keyed by the negated PID, in which case it points to a
NULL terminated array of additional Unit objects also interested. This
scheme means arbitrary numbers of Units may now watch the same PID.

Runtime and memory behaviour should not be impact by this change, as for
the common case (i.e. each PID only watched by a single unit) behaviour
stays the same, but for the uncommon case (a PID watched by more than
one unit) we only pay with a single additional memory allocation for the
array.

Why this all? Primarily, because allowing exactly two units to watch a
specific PID is not sufficient for some niche cases, as processes can
belong to more than one unit these days:

1. sd_notify() with MAINPID= can be used to attach a process from a
   different cgroup to multiple units.

2. Similar, the PIDFile= setting in unit files can be used for similar
   setups,

3. By creating a scope unit a main process of a service may join a
   different unit, too.

4. On cgroupsv1 we frequently end up watching all processes remaining in
   a scope, and if a process opens lots of scopes one after the other it
   might thus end up being watch by many of them.

This patch hence removes the 2-unit-per-PID limit. It also makes a
couple of other changes, some of them quite relevant:

- manager_get_unit_by_pid() (and the bus call wrapping it) when there's
  ambiguity will prefer returning the Unit the process belongs to based on
  cgroup membership, and only check the watch-pids hashmap if that
  fails. This change in logic is probably more in line with what people
  expect and makes things more stable as each process can belong to
  exactly one cgroup only.

- Every SIGCHLD event is now dispatched to all units interested in its
  PID. Previously, there was some magic conditionalization: the SIGCHLD
  would only be dispatched to the unit if it was only interested in a
  single PID only, or the PID belonged to the control or main PID or we
  didn't dispatch a signle SIGCHLD to the unit in the current event loop
  iteration yet. These rules were quite arbitrary and also redundant as
  the the per-unit handlers would filter the PIDs anyway a second time.
  With this change we'll hence relax the rules: all we do now is
  dispatch every SIGCHLD event exactly once to each unit interested in
  it, and it's up to the unit to then use or ignore this. We use a
  generation counter in the unit to ensure that we only invoke the unit
  handler once for each event, protecting us from confusion if a unit is
  both associated with a specific PID through cgroup membership and
  through the "watch_pids" logic. It also protects us from being
  confused if the "watch_pids" hashmap is altered while we are
  dispatching to it (which is a very likely case).

- sd_notify() message dispatching has been reworked to be very similar
  to SIGCHLD handling now. A generation counter is used for dispatching
  as well.

This also adds a new test that validates that "watch_pid" registration
and unregstration works correctly.
This commit is contained in:
Lennart Poettering 2018-01-12 13:41:05 +01:00
parent 11aef522c1
commit 62a769136d
7 changed files with 302 additions and 88 deletions

View File

@ -2186,40 +2186,46 @@ Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
_cleanup_free_ char *cgroup = NULL;
int r;
assert(m);
if (pid <= 0)
if (!pid_is_valid(pid))
return NULL;
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
if (r < 0)
if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0)
return NULL;
return manager_get_unit_by_cgroup(m, cgroup);
}
Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
Unit *u;
Unit *u, **array;
assert(m);
if (pid <= 0)
/* Note that a process might be owned by multiple units, we return only one here, which is good enough for most
* cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
* relevant one as children of the process will be assigned to that one, too, before all else. */
if (!pid_is_valid(pid))
return NULL;
if (pid == getpid_cached())
return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid));
u = manager_get_unit_by_pid_cgroup(m, pid);
if (u)
return u;
u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid));
u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
if (u)
return u;
return manager_get_unit_by_pid_cgroup(m, pid);
array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
if (array)
return array[0];
return NULL;
}
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {

View File

@ -1205,8 +1205,7 @@ Manager* manager_free(Manager *m) {
hashmap_free(m->units);
hashmap_free(m->units_by_invocation_id);
hashmap_free(m->jobs);
hashmap_free(m->watch_pids1);
hashmap_free(m->watch_pids2);
hashmap_free(m->watch_pids);
hashmap_free(m->watch_bus);
set_free(m->startup_units);
@ -1915,22 +1914,27 @@ static void manager_invoke_notify_message(
const char *buf,
FDSet *fds) {
_cleanup_strv_free_ char **tags = NULL;
assert(m);
assert(u);
assert(ucred);
assert(buf);
tags = strv_split(buf, NEWLINE);
if (!tags) {
log_oom();
if (u->notifygen == m->notifygen) /* Already invoked on this same unit in this same iteration? */
return;
}
u->notifygen = m->notifygen;
if (UNIT_VTABLE(u)->notify_message) {
_cleanup_strv_free_ char **tags = NULL;
tags = strv_split(buf, NEWLINE);
if (!tags) {
log_oom();
return;
}
if (UNIT_VTABLE(u)->notify_message)
UNIT_VTABLE(u)->notify_message(u, ucred, tags, fds);
else if (DEBUG_LOGGING) {
} else if (DEBUG_LOGGING) {
_cleanup_free_ char *x = NULL, *y = NULL;
x = ellipsize(buf, 20, 90);
@ -1964,9 +1968,11 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t
struct cmsghdr *cmsg;
struct ucred *ucred = NULL;
Unit *u1, *u2, *u3;
_cleanup_free_ Unit **array_copy = NULL;
Unit *u1, *u2, **array;
int r, *fd_array = NULL;
unsigned n_fds = 0;
bool found = false;
ssize_t n;
assert(m);
@ -2033,22 +2039,41 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t
/* Make sure it's NUL-terminated. */
buf[n] = 0;
/* Notify every unit that might be interested, but try
* to avoid notifying the same one multiple times. */
/* Increase the generation counter used for filtering out duplicate unit invocations. */
m->notifygen++;
/* Notify every unit that might be interested, which might be multiple. */
u1 = manager_get_unit_by_pid_cgroup(m, ucred->pid);
if (u1)
u2 = hashmap_get(m->watch_pids, PID_TO_PTR(ucred->pid));
array = hashmap_get(m->watch_pids, PID_TO_PTR(-ucred->pid));
if (array) {
size_t k = 0;
while (array[k])
k++;
array_copy = newdup(Unit*, array, k+1);
if (!array_copy)
log_oom();
}
/* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle duplicate units
* make sure we only invoke each unit's handler once. */
if (u1) {
manager_invoke_notify_message(m, u1, ucred, buf, fds);
u2 = hashmap_get(m->watch_pids1, PID_TO_PTR(ucred->pid));
if (u2 && u2 != u1)
found = true;
}
if (u2) {
manager_invoke_notify_message(m, u2, ucred, buf, fds);
found = true;
}
if (array_copy)
for (size_t i = 0; array_copy[i]; i++) {
manager_invoke_notify_message(m, array_copy[i], ucred, buf, fds);
found = true;
}
u3 = hashmap_get(m->watch_pids2, PID_TO_PTR(ucred->pid));
if (u3 && u3 != u2 && u3 != u1)
manager_invoke_notify_message(m, u3, ucred, buf, fds);
if (!u1 && !u2 && !u3)
log_warning("Cannot find unit for notify message of PID "PID_FMT".", ucred->pid);
if (!found)
log_warning("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid);
if (fdset_size(fds) > 0)
log_warning("Got extra auxiliary fds with notification message, closing them.");
@ -2056,29 +2081,25 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t
return 0;
}
static void invoke_sigchld_event(Manager *m, Unit *u, const siginfo_t *si) {
uint64_t iteration;
static void manager_invoke_sigchld_event(
Manager *m,
Unit *u,
const siginfo_t *si) {
assert(m);
assert(u);
assert(si);
sd_event_get_iteration(m->event, &iteration);
log_unit_debug(u, "Child "PID_FMT" belongs to %s", si->si_pid, u->id);
/* Already invoked the handler of this unit in this iteration? Then don't process this again */
if (u->sigchldgen == m->sigchldgen)
return;
u->sigchldgen = m->sigchldgen;
log_unit_debug(u, "Child "PID_FMT" belongs to %s.", si->si_pid, u->id);
unit_unwatch_pid(u, si->si_pid);
if (UNIT_VTABLE(u)->sigchld_event) {
if (set_size(u->pids) <= 1 ||
iteration != u->sigchldgen ||
unit_main_pid(u) == si->si_pid ||
unit_control_pid(u) == si->si_pid) {
UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status);
u->sigchldgen = iteration;
} else
log_debug("%s already issued a sigchld this iteration %" PRIu64 ", skipping. Pids still being watched %d", u->id, iteration, set_size(u->pids));
}
if (UNIT_VTABLE(u)->sigchld_event)
UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status);
}
static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) {
@ -2105,8 +2126,9 @@ static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) {
goto turn_off;
if (IN_SET(si.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) {
_cleanup_free_ Unit **array_copy = NULL;
_cleanup_free_ char *name = NULL;
Unit *u1, *u2, *u3;
Unit *u1, *u2, **array;
(void) get_process_comm(si.si_pid, &name);
@ -2118,17 +2140,36 @@ static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) {
? exit_status_to_string(si.si_status, EXIT_STATUS_FULL)
: signal_to_string(si.si_status)));
/* And now figure out the unit this belongs
* to, it might be multiple... */
/* Increase the generation counter used for filtering out duplicate unit invocations */
m->sigchldgen++;
/* And now figure out the unit this belongs to, it might be multiple... */
u1 = manager_get_unit_by_pid_cgroup(m, si.si_pid);
u2 = hashmap_get(m->watch_pids, PID_TO_PTR(si.si_pid));
array = hashmap_get(m->watch_pids, PID_TO_PTR(-si.si_pid));
if (array) {
size_t n = 0;
/* Cound how many entries the array has */
while (array[n])
n++;
/* Make a copy of the array so that we don't trip up on the array changing beneath us */
array_copy = newdup(Unit*, array, n+1);
if (!array_copy)
log_oom();
}
/* Finally, execute them all. Note that u1, u2 and the array might contain duplicates, but
* that's fine, manager_invoke_sigchld_event() will ensure we only invoke the handlers once for
* each iteration. */
if (u1)
invoke_sigchld_event(m, u1, &si);
u2 = hashmap_get(m->watch_pids1, PID_TO_PTR(si.si_pid));
if (u2 && u2 != u1)
invoke_sigchld_event(m, u2, &si);
u3 = hashmap_get(m->watch_pids2, PID_TO_PTR(si.si_pid));
if (u3 && u3 != u2 && u3 != u1)
invoke_sigchld_event(m, u3, &si);
manager_invoke_sigchld_event(m, u1, &si);
if (u2)
manager_invoke_sigchld_event(m, u2, &si);
if (array_copy)
for (size_t i = 0; array_copy[i]; i++)
manager_invoke_sigchld_event(m, array_copy[i], &si);
}
/* And now, we actually reap the zombie. */

View File

@ -145,14 +145,14 @@ struct Manager {
sd_event *event;
/* We use two hash tables here, since the same PID might be
* watched by two different units: once the unit that forked
* it off, and possibly a different unit to which it was
* joined as cgroup member. Since we know that it is either
* one or two units for each PID we just use to hashmaps
* here. */
Hashmap *watch_pids1; /* pid => Unit object n:1 */
Hashmap *watch_pids2; /* pid => Unit object n:1 */
/* This maps PIDs we care about to units that are interested in. We allow multiple units to he interested in
* the same PID and multiple PIDs to be relevant to the same unit. Since in most cases only a single unit will
* be interested in the same PID we use a somewhat special encoding here: the first unit interested in a PID is
* stored directly in the hashmap, keyed by the PID unmodified. If there are other units interested too they'll
* be stored in a NULL-terminated array, and keyed by the negative PID. This is safe as pid_t is signed and
* negative PIDs are not used for regular processes but process groups, which we don't care about in this
* context, but this allows us to use the negative range for our own purposes. */
Hashmap *watch_pids; /* pid => unit as well as -pid => array of units */
/* A set contains all units which cgroup should be refreshed after startup */
Set *startup_units;
@ -350,8 +350,13 @@ struct Manager {
int first_boot; /* tri-state */
/* prefixes of e.g. RuntimeDirectory= */
/* Prefixes of e.g. RuntimeDirectory= */
char *prefix[_EXEC_DIRECTORY_TYPE_MAX];
/* Used in the SIGCHLD and sd_notify() message invocation logic to avoid that we dispatch the same event
* multiple times on the same unit. */
unsigned sigchldgen;
unsigned notifygen;
};
#define MANAGER_IS_SYSTEM(m) ((m)->unit_file_scope == UNIT_FILE_SYSTEM)

View File

@ -2534,44 +2534,97 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su
}
int unit_watch_pid(Unit *u, pid_t pid) {
int q, r;
int r;
assert(u);
assert(pid >= 1);
assert(pid_is_valid(pid));
/* Watch a specific PID. We only support one or two units
* watching each PID for now, not more. */
/* Watch a specific PID */
r = set_ensure_allocated(&u->pids, NULL);
if (r < 0)
return r;
r = hashmap_ensure_allocated(&u->manager->watch_pids1, NULL);
r = hashmap_ensure_allocated(&u->manager->watch_pids, NULL);
if (r < 0)
return r;
r = hashmap_put(u->manager->watch_pids1, PID_TO_PTR(pid), u);
if (r == -EEXIST) {
r = hashmap_ensure_allocated(&u->manager->watch_pids2, NULL);
if (r < 0)
return r;
/* First try, let's add the unit keyed by "pid". */
r = hashmap_put(u->manager->watch_pids, PID_TO_PTR(pid), u);
if (r == -EEXIST) {
Unit **array;
bool found = false;
size_t n = 0;
r = hashmap_put(u->manager->watch_pids2, PID_TO_PTR(pid), u);
}
/* OK, the "pid" key is already assigned to a different unit. Let's see if the "-pid" key (which points
* to an array of Units rather than just a Unit), lists us already. */
q = set_put(u->pids, PID_TO_PTR(pid));
if (q < 0)
return q;
array = hashmap_get(u->manager->watch_pids, PID_TO_PTR(-pid));
if (array)
for (; array[n]; n++)
if (array[n] == u)
found = true;
return r;
if (found) /* Found it already? if so, do nothing */
r = 0;
else {
Unit **new_array;
/* Allocate a new array */
new_array = new(Unit*, n + 2);
if (!new_array)
return -ENOMEM;
memcpy_safe(new_array, array, sizeof(Unit*) * n);
new_array[n] = u;
new_array[n+1] = NULL;
/* Add or replace the old array */
r = hashmap_replace(u->manager->watch_pids, PID_TO_PTR(-pid), new_array);
if (r < 0) {
free(new_array);
return r;
}
free(array);
}
} else if (r < 0)
return r;
r = set_put(u->pids, PID_TO_PTR(pid));
if (r < 0)
return r;
return 0;
}
void unit_unwatch_pid(Unit *u, pid_t pid) {
assert(u);
assert(pid >= 1);
Unit **array;
assert(u);
assert(pid_is_valid(pid));
/* First let's drop the unit in case it's keyed as "pid". */
(void) hashmap_remove_value(u->manager->watch_pids, PID_TO_PTR(pid), u);
/* Then, let's also drop the unit, in case it's in the array keyed by -pid */
array = hashmap_get(u->manager->watch_pids, PID_TO_PTR(-pid));
if (array) {
size_t n, m = 0;
/* Let's iterate through the array, dropping our own entry */
for (n = 0; array[n]; n++)
if (array[n] != u)
array[m++] = array[n];
array[m] = NULL;
if (m == 0) {
/* The array is now empty, remove the entire entry */
assert(hashmap_remove(u->manager->watch_pids, PID_TO_PTR(-pid)) == array);
free(array);
}
}
(void) hashmap_remove_value(u->manager->watch_pids1, PID_TO_PTR(pid), u);
(void) hashmap_remove_value(u->manager->watch_pids2, PID_TO_PTR(pid), u);
(void) set_remove(u->pids, PID_TO_PTR(pid));
}

View File

@ -236,8 +236,10 @@ struct Unit {
* process SIGCHLD for */
Set *pids;
/* Used in sigchld event invocation to avoid repeat events being invoked */
uint64_t sigchldgen;
/* Used in SIGCHLD and sd_notify() message event invocation logic to avoid that we dispatch the same event
* multiple times on the same unit. */
unsigned sigchldgen;
unsigned notifygen;
/* Used during GC sweeps */
unsigned gc_marker;

View File

@ -377,6 +377,17 @@ tests += [
libselinux,
libblkid]],
[['src/test/test-watch-pid.c',
'src/test/test-helper.c'],
[libcore,
libshared],
[libmount,
threads,
librt,
libseccomp,
libselinux,
libblkid]],
[['src/test/test-hashmap.c',
'src/test/test-hashmap-plain.c',
test_hashmap_ordered_c],

96
src/test/test-watch-pid.c Normal file
View File

@ -0,0 +1,96 @@
/* SPDX-License-Identifier: LGPL-2.1+ */
#include "log.h"
#include "manager.h"
#include "rm-rf.h"
#include "test-helper.h"
#include "tests.h"
int main(int argc, char *argv[]) {
_cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL;
Unit *a, *b, *c, *u;
Manager *m;
int r;
log_set_max_level(LOG_DEBUG);
log_parse_environment();
log_open();
if (getuid() != 0) {
log_notice("Not running as root, skipping kernel related tests.");
return EXIT_TEST_SKIP;
}
r = enter_cgroup_subroot();
if (r == -ENOMEDIUM) {
log_notice("cgroupfs not available, skipping tests");
return EXIT_TEST_SKIP;
}
assert_se(set_unit_path(get_testdata_dir("")) >= 0);
assert_se(runtime_dir = setup_fake_runtime_dir());
assert_se(manager_new(UNIT_FILE_USER, true, &m) >= 0);
assert_se(manager_startup(m, NULL, NULL) >= 0);
assert_se(a = unit_new(m, sizeof(Service)));
assert_se(unit_add_name(a, "a.service") >= 0);
assert_se(set_isempty(a->pids));
assert_se(b = unit_new(m, sizeof(Service)));
assert_se(unit_add_name(b, "b.service") >= 0);
assert_se(set_isempty(b->pids));
assert_se(c = unit_new(m, sizeof(Service)));
assert_se(unit_add_name(c, "c.service") >= 0);
assert_se(set_isempty(c->pids));
assert_se(hashmap_isempty(m->watch_pids));
assert_se(manager_get_unit_by_pid(m, 4711) == NULL);
assert_se(unit_watch_pid(a, 4711) >= 0);
assert_se(manager_get_unit_by_pid(m, 4711) == a);
assert_se(unit_watch_pid(a, 4711) >= 0);
assert_se(manager_get_unit_by_pid(m, 4711) == a);
assert_se(unit_watch_pid(b, 4711) >= 0);
u = manager_get_unit_by_pid(m, 4711);
assert_se(u == a || u == b);
assert_se(unit_watch_pid(b, 4711) >= 0);
u = manager_get_unit_by_pid(m, 4711);
assert_se(u == a || u == b);
assert_se(unit_watch_pid(c, 4711) >= 0);
u = manager_get_unit_by_pid(m, 4711);
assert_se(u == a || u == b || u == c);
assert_se(unit_watch_pid(c, 4711) >= 0);
u = manager_get_unit_by_pid(m, 4711);
assert_se(u == a || u == b || u == c);
unit_unwatch_pid(b, 4711);
u = manager_get_unit_by_pid(m, 4711);
assert_se(u == a || u == c);
unit_unwatch_pid(b, 4711);
u = manager_get_unit_by_pid(m, 4711);
assert_se(u == a || u == c);
unit_unwatch_pid(a, 4711);
assert_se(manager_get_unit_by_pid(m, 4711) == c);
unit_unwatch_pid(a, 4711);
assert_se(manager_get_unit_by_pid(m, 4711) == c);
unit_unwatch_pid(c, 4711);
assert_se(manager_get_unit_by_pid(m, 4711) == NULL);
unit_unwatch_pid(c, 4711);
assert_se(manager_get_unit_by_pid(m, 4711) == NULL);
manager_free(m);
return 0;
}