core: send sigabrt on watchdog timeout to get the stacktrace

if sigabrt doesn't do the job, follow regular shutdown
routine, sigterm > sigkill.
This commit is contained in:
Umut Tezduyar Lindskog 2014-10-28 16:35:40 +01:00 committed by Lennart Poettering
parent f10af76de5
commit db2cb23b5b
11 changed files with 63 additions and 27 deletions

2
TODO
View File

@ -48,8 +48,6 @@ Features:
* consider showing the unit names during boot up in the status output, not just the unit descriptions
* send SIGABRT when a service watchdog is triggered, by default, so that we acquire a backtrace of the hang.
* dhcp: do we allow configuring dhcp routes on interfaces that are not the one we got the dhcp info from?
* maybe allow timer units with an empty Units= setting, so that they

View File

@ -593,8 +593,9 @@
(i.e. the "keep-alive ping"). If the time
between two such calls is larger than
the configured time, then the service
is placed in a failed state. By
setting <varname>Restart=</varname> to
is placed in a failed state and it will
be terminated with <varname>SIGABRT</varname>.
By setting <varname>Restart=</varname> to
<option>on-failure</option> or
<option>always</option>, the service
will be automatically restarted. The

View File

@ -446,7 +446,7 @@ static void busname_enter_signal(BusName *n, BusNameState state, BusNameResult f
r = unit_kill_context(UNIT(n),
&kill_context,
state != BUSNAME_SIGTERM,
state != BUSNAME_SIGTERM ? KILL_KILL : KILL_TERMINATE,
-1,
n->control_pid,
false);

View File

@ -775,7 +775,8 @@ static void mount_enter_signal(Mount *m, MountState state, MountResult f) {
r = unit_kill_context(
UNIT(m),
&m->kill_context,
state != MOUNT_MOUNTING_SIGTERM && state != MOUNT_UNMOUNTING_SIGTERM && state != MOUNT_REMOUNTING_SIGTERM,
(state != MOUNT_MOUNTING_SIGTERM && state != MOUNT_UNMOUNTING_SIGTERM && state != MOUNT_REMOUNTING_SIGTERM) ?
KILL_KILL : KILL_TERMINATE,
-1,
m->control_pid,
false);

View File

@ -243,7 +243,7 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
r = unit_kill_context(
UNIT(s),
&s->kill_context,
state != SCOPE_STOP_SIGTERM,
state != SCOPE_STOP_SIGTERM ? KILL_KILL : KILL_TERMINATE,
-1, -1, false);
if (r < 0)
goto fail;

View File

@ -56,6 +56,7 @@ static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = {
[SERVICE_EXITED] = UNIT_ACTIVE,
[SERVICE_RELOAD] = UNIT_RELOADING,
[SERVICE_STOP] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGABRT] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
[SERVICE_STOP_POST] = UNIT_DEACTIVATING,
@ -76,6 +77,7 @@ static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] =
[SERVICE_EXITED] = UNIT_ACTIVE,
[SERVICE_RELOAD] = UNIT_RELOADING,
[SERVICE_STOP] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGABRT] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
[SERVICE_STOP_POST] = UNIT_DEACTIVATING,
@ -663,7 +665,7 @@ static void service_set_state(Service *s, ServiceState state) {
SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST,
SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
SERVICE_AUTO_RESTART))
s->timer_event_source = sd_event_source_unref(s->timer_event_source);
@ -672,7 +674,7 @@ static void service_set_state(Service *s, ServiceState state) {
SERVICE_START, SERVICE_START_POST,
SERVICE_RUNNING, SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST,
SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
service_unwatch_main_pid(s);
s->main_command = NULL;
@ -682,7 +684,7 @@ static void service_set_state(Service *s, ServiceState state) {
SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST,
SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
service_unwatch_control_pid(s);
s->control_command = NULL;
@ -696,7 +698,7 @@ static void service_set_state(Service *s, ServiceState state) {
SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
SERVICE_RUNNING, SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) &&
SERVICE_STOP_SIGABRT, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) &&
!(state == SERVICE_DEAD && UNIT(s)->job)) {
service_close_socket_fd(s);
service_connection_unref(s);
@ -750,7 +752,7 @@ static int service_coldplug(Unit *u) {
SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST,
SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
usec_t k;
@ -779,7 +781,7 @@ static int service_coldplug(Unit *u) {
SERVICE_START, SERVICE_START_POST,
SERVICE_RUNNING, SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST,
SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) {
r = unit_watch_pid(UNIT(s), s->main_pid);
if (r < 0)
@ -791,7 +793,7 @@ static int service_coldplug(Unit *u) {
SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST,
SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
r = unit_watch_pid(UNIT(s), s->control_pid);
if (r < 0)
@ -1181,7 +1183,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
r = unit_kill_context(
UNIT(s),
&s->kill_context,
state != SERVICE_STOP_SIGTERM && state != SERVICE_FINAL_SIGTERM,
(state != SERVICE_STOP_SIGTERM && state != SERVICE_FINAL_SIGTERM && state != SERVICE_STOP_SIGABRT) ?
KILL_KILL : (state == SERVICE_STOP_SIGABRT ? KILL_ABORT : KILL_TERMINATE),
s->main_pid,
s->control_pid,
s->main_pid_alien);
@ -1197,7 +1200,7 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
}
service_set_state(s, state);
} else if (state == SERVICE_STOP_SIGTERM)
} else if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGABRT)
service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS);
else if (state == SERVICE_STOP_SIGKILL)
service_enter_stop_post(s, SERVICE_SUCCESS);
@ -1211,7 +1214,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
fail:
log_warning_unit(UNIT(s)->id, "%s failed to kill processes: %s", UNIT(s)->id, strerror(-r));
if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGKILL)
if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGKILL ||
state == SERVICE_STOP_SIGABRT)
service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES);
else
service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true);
@ -1637,6 +1641,7 @@ static int service_start(Unit *u) {
/* We cannot fulfill this request right now, try again later
* please! */
if (s->state == SERVICE_STOP ||
s->state == SERVICE_STOP_SIGABRT ||
s->state == SERVICE_STOP_SIGTERM ||
s->state == SERVICE_STOP_SIGKILL ||
s->state == SERVICE_STOP_POST ||
@ -1695,6 +1700,7 @@ static int service_stop(Unit *u) {
/* Already on it */
if (s->state == SERVICE_STOP ||
s->state == SERVICE_STOP_SIGABRT ||
s->state == SERVICE_STOP_SIGTERM ||
s->state == SERVICE_STOP_SIGKILL ||
s->state == SERVICE_STOP_POST ||
@ -2126,6 +2132,7 @@ static void service_notify_cgroup_empty_event(Unit *u) {
service_enter_running(s, SERVICE_SUCCESS);
break;
case SERVICE_STOP_SIGABRT:
case SERVICE_STOP_SIGTERM:
case SERVICE_STOP_SIGKILL:
@ -2252,6 +2259,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
service_enter_running(s, f);
break;
case SERVICE_STOP_SIGABRT:
case SERVICE_STOP_SIGTERM:
case SERVICE_STOP_SIGKILL:
@ -2392,6 +2400,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
break;
case SERVICE_STOP_SIGABRT:
case SERVICE_STOP_SIGTERM:
case SERVICE_STOP_SIGKILL:
if (main_pid_good(s) <= 0)
@ -2461,6 +2470,12 @@ static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *us
service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT);
break;
case SERVICE_STOP_SIGABRT:
log_warning_unit(UNIT(s)->id,
"%s stop-sigabrt timed out. Terminating.", UNIT(s)->id);
service_enter_signal(s, SERVICE_STOP_SIGTERM, s->result);
break;
case SERVICE_STOP_SIGTERM:
if (s->kill_context.send_sigkill) {
log_warning_unit(UNIT(s)->id, "%s stop-sigterm timed out. Killing.", UNIT(s)->id);
@ -2528,7 +2543,7 @@ static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void
log_error_unit(UNIT(s)->id, "%s watchdog timeout (limit %s)!", UNIT(s)->id,
format_timespan(t, sizeof(t), s->watchdog_usec, 1));
service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_WATCHDOG);
service_enter_signal(s, SERVICE_STOP_SIGABRT, SERVICE_FAILURE_WATCHDOG);
return 0;
}

View File

@ -39,6 +39,7 @@ typedef enum ServiceState {
SERVICE_EXITED, /* Nothing is running anymore, but RemainAfterExit is true hence this is OK */
SERVICE_RELOAD,
SERVICE_STOP, /* No STOP_PRE state, instead just register multiple STOP executables */
SERVICE_STOP_SIGABRT, /* Watchdog timeout */
SERVICE_STOP_SIGTERM,
SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST,

View File

@ -1578,7 +1578,8 @@ static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) {
r = unit_kill_context(
UNIT(s),
&s->kill_context,
state != SOCKET_STOP_PRE_SIGTERM && state != SOCKET_FINAL_SIGTERM,
(state != SOCKET_STOP_PRE_SIGTERM && state != SOCKET_FINAL_SIGTERM) ?
KILL_KILL : KILL_TERMINATE,
-1,
s->control_pid,
false);

View File

@ -687,7 +687,8 @@ static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) {
r = unit_kill_context(
UNIT(s),
&s->kill_context,
state != SWAP_ACTIVATING_SIGTERM && state != SWAP_DEACTIVATING_SIGTERM,
(state != SWAP_ACTIVATING_SIGTERM && state != SWAP_DEACTIVATING_SIGTERM) ?
KILL_KILL : KILL_TERMINATE,
-1,
s->control_pid,
false);

View File

@ -3313,7 +3313,7 @@ int unit_make_transient(Unit *u) {
int unit_kill_context(
Unit *u,
KillContext *c,
bool sigkill,
KillOperation k,
pid_t main_pid,
pid_t control_pid,
bool main_pid_alien) {
@ -3326,7 +3326,19 @@ int unit_kill_context(
if (c->kill_mode == KILL_NONE)
return 0;
sig = sigkill ? SIGKILL : c->kill_signal;
switch (k) {
case KILL_KILL:
sig = SIGKILL;
break;
case KILL_ABORT:
sig = SIGABRT;
break;
case KILL_TERMINATE:
sig = c->kill_signal;
break;
default:
assert_not_reached("KillOperation unknown");
}
if (main_pid > 0) {
r = kill_and_sigcont(main_pid, sig);
@ -3340,7 +3352,7 @@ int unit_kill_context(
if (!main_pid_alien)
wait_for_exit = true;
if (c->send_sighup && !sigkill)
if (c->send_sighup && k != KILL_KILL)
kill(main_pid, SIGHUP);
}
}
@ -3356,12 +3368,12 @@ int unit_kill_context(
} else {
wait_for_exit = true;
if (c->send_sighup && !sigkill)
if (c->send_sighup && k != KILL_KILL)
kill(control_pid, SIGHUP);
}
}
if ((c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && sigkill)) && u->cgroup_path) {
if ((c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL)) && u->cgroup_path) {
_cleanup_set_free_ Set *pid_set = NULL;
/* Exclude the main/control pids from being killed via the cgroup */
@ -3385,7 +3397,7 @@ int unit_kill_context(
/* wait_for_exit = true; */
if (c->send_sighup && !sigkill) {
if (c->send_sighup && k != KILL_KILL) {
set_free(pid_set);
pid_set = unit_pid_set(main_pid, control_pid);

View File

@ -54,6 +54,12 @@ enum UnitActiveState {
_UNIT_ACTIVE_STATE_INVALID = -1
};
typedef enum KillOperation {
KILL_TERMINATE,
KILL_KILL,
KILL_ABORT,
} KillOperation;
static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) {
return t == UNIT_ACTIVE || t == UNIT_RELOADING;
}
@ -576,7 +582,7 @@ int unit_write_drop_in_private_format(Unit *u, UnitSetPropertiesMode mode, const
int unit_remove_drop_in(Unit *u, UnitSetPropertiesMode mode, const char *name);
int unit_kill_context(Unit *u, KillContext *c, bool sigkill, pid_t main_pid, pid_t control_pid, bool main_pid_alien);
int unit_kill_context(Unit *u, KillContext *c, KillOperation k, pid_t main_pid, pid_t control_pid, bool main_pid_alien);
int unit_make_transient(Unit *u);