fix race between daemon-reload and other commands

When "systemctl daemon-reload" is run at the same time as "systemctl
start foo", the latter might hang. That's because commands like start
wait for the JobRemoved signal to know when the job has finished. But if
the job finishes while the manager is reloading, the signal is never sent.

The hang can be easily reproduced by running

    # for ((N=1; N>0; N++)) ; do echo $N ; systemctl daemon-reload ; done
    # for ((N=1; N>0; N++)) ; do echo $N ; systemctl start systemd-coredump.socket ; done

in two different terminals. The start command will hang after 1-2
iterations.

This commit keeps track of jobs that were started before a reload and
finished during it, and sends JobRemoved for them once the reload has
finished.
This commit is contained in:
David Tardon 2018-04-24 15:19:38 +02:00 committed by Zbigniew Jędrzejewski-Szmek
parent 90bc77af29
commit a7a7163df7
4 changed files with 67 additions and 11 deletions

View File

@ -43,6 +43,7 @@ Job* job_new_raw(Unit *unit) {
j->manager = unit->manager;
j->unit = unit;
j->type = _JOB_TYPE_INVALID;
j->reloaded = false;
return j;
}
@ -64,6 +65,32 @@ Job* job_new(Unit *unit, JobType type) {
return j;
}
/* Detach a job from the manager-side bookkeeping without freeing it.
 *
 * The job must not be installed and must carry no transaction links or
 * subject/object list entries; all of that is asserted below. For each of the
 * run, D-Bus and GC queues the job is taken off the queue only if its
 * matching in_*_queue flag is set, and the flag is cleared afterwards, so a
 * later call skips that removal. Finally the timer event source is
 * unreferenced and the field is overwritten with the value returned by
 * sd_event_source_unref(). */
void job_unlink(Job *j) {
        assert(j);

        /* Preconditions: not installed, not part of any transaction. */
        assert(!j->installed);
        assert(!j->transaction_prev);
        assert(!j->transaction_next);
        assert(!j->subject_list);
        assert(!j->object_list);

        /* Run queue */
        if (j->in_run_queue) {
                LIST_REMOVE(run_queue, j->manager->run_queue, j);
                j->in_run_queue = false;
        }

        /* D-Bus notification queue */
        if (j->in_dbus_queue) {
                LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j);
                j->in_dbus_queue = false;
        }

        /* Garbage collection queue */
        if (j->in_gc_queue) {
                LIST_REMOVE(gc_queue, j->manager->gc_job_queue, j);
                j->in_gc_queue = false;
        }

        /* Drop our reference and keep whatever unref hands back in the field. */
        j->timer_event_source = sd_event_source_unref(j->timer_event_source);
}
void job_free(Job *j) {
assert(j);
assert(!j->installed);
@ -72,16 +99,7 @@ void job_free(Job *j) {
assert(!j->subject_list);
assert(!j->object_list);
if (j->in_run_queue)
LIST_REMOVE(run_queue, j->manager->run_queue, j);
if (j->in_dbus_queue)
LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j);
if (j->in_gc_queue)
LIST_REMOVE(gc_queue, j->manager->gc_job_queue, j);
sd_event_source_unref(j->timer_event_source);
job_unlink(j);
sd_bus_track_unref(j->bus_track);
strv_free(j->deserialized_clients);
@ -241,6 +259,7 @@ int job_install_deserialized(Job *j) {
*pj = j;
j->installed = true;
j->reloaded = true;
if (j->state == JOB_RUNNING)
j->unit->manager->n_running_jobs++;
@ -844,6 +863,19 @@ static void job_fail_dependencies(Unit *u, UnitDependency d) {
}
}
/* Park a finished job in the manager's pending_finished_jobs set.
 *
 * Makes sure the set exists, unlinks the job via job_unlink() and then adds it
 * to the set. Returns the negative result of set_ensure_allocated() if that
 * fails (the job is left untouched in that case), otherwise whatever set_put()
 * returns. */
static int job_save_pending_finished_job(Job *j) {
        int r;

        assert(j);

        r = set_ensure_allocated(&j->manager->pending_finished_jobs, NULL);
        if (r >= 0) {
                /* Only unlink once we know there is a set to store the job in. */
                job_unlink(j);
                r = set_put(j->manager->pending_finished_jobs, j);
        }

        return r;
}
int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already) {
Unit *u;
Unit *other;
@ -883,7 +915,12 @@ int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool alr
j->manager->n_failed_jobs++;
job_uninstall(j);
job_free(j);
/* Remember jobs started before the reload */
if (MANAGER_IS_RELOADING(j->manager) && j->reloaded) {
if (job_save_pending_finished_job(j) < 0)
job_free(j);
} else
job_free(j);
/* Fail depending jobs on failure */
if (result != JOB_DONE && recursive) {

View File

@ -162,10 +162,12 @@ struct Job {
bool irreversible:1;
bool in_gc_queue:1;
bool ref_by_private_bus:1;
bool reloaded:1;
};
Job* job_new(Unit *unit, JobType type);
Job* job_new_raw(Unit *unit);
void job_unlink(Job *job);
void job_free(Job *job);
Job* job_install(Job *j);
int job_install_deserialized(Job *j);

View File

@ -3186,6 +3186,17 @@ finish:
return r;
}
/* Drain the manager's pending_finished_jobs set.
 *
 * Every job stolen from the set is passed to bus_job_send_removed_signal()
 * and then freed with job_free(). Once set_steal_first() yields nothing more,
 * the set itself is released with set_free() and the field is overwritten
 * with that call's return value. */
static void manager_flush_finished_jobs(Manager *m) {
        for (;;) {
                Job *j = set_steal_first(m->pending_finished_jobs);

                if (!j)
                        break;

                bus_job_send_removed_signal(j);
                job_free(j);
        }

        m->pending_finished_jobs = set_free(m->pending_finished_jobs);
}
int manager_reload(Manager *m) {
int r, q;
_cleanup_fclose_ FILE *f = NULL;
@ -3294,6 +3305,9 @@ int manager_reload(Manager *m) {
if (q < 0 && r >= 0)
r = q;
if (!MANAGER_IS_RELOADING(m))
manager_flush_finished_jobs(m);
m->send_reloading_done = true;
return r;

View File

@ -300,6 +300,9 @@ struct Manager {
/* non-zero if we are reloading or reexecuting, */
int n_reloading;
/* A set which contains all jobs that started before reload and finished
* during it */
Set *pending_finished_jobs;
unsigned n_installed_jobs;
unsigned n_failed_jobs;