core: send SIGABRT on watchdog timeout to get a stack trace

If SIGABRT doesn't do the job, follow the regular shutdown
routine: SIGTERM first, then SIGKILL.
This commit is contained in:
Umut Tezduyar Lindskog 2014-10-28 16:35:40 +01:00 committed by Lennart Poettering
parent f10af76de5
commit db2cb23b5b
11 changed files with 63 additions and 27 deletions

2
TODO
View File

@ -48,8 +48,6 @@ Features:
* consider showing the unit names during boot up in the status output, not just the unit descriptions * consider showing the unit names during boot up in the status output, not just the unit descriptions
* send SIGABRT when a service watchdog is triggered, by default, so that we acquire a backtrace of the hang.
* dhcp: do we allow configuring dhcp routes on interfaces that are not the one we got the dhcp info from? * dhcp: do we allow configuring dhcp routes on interfaces that are not the one we got the dhcp info from?
* maybe allow timer units with an empty Units= setting, so that they * maybe allow timer units with an empty Units= setting, so that they

View File

@ -593,8 +593,9 @@
(i.e. the "keep-alive ping"). If the time (i.e. the "keep-alive ping"). If the time
between two such calls is larger than between two such calls is larger than
the configured time, then the service the configured time, then the service
is placed in a failed state. By is placed in a failed state and it will
setting <varname>Restart=</varname> to be terminated with <varname>SIGABRT</varname>.
By setting <varname>Restart=</varname> to
<option>on-failure</option> or <option>on-failure</option> or
<option>always</option>, the service <option>always</option>, the service
will be automatically restarted. The will be automatically restarted. The

View File

@ -446,7 +446,7 @@ static void busname_enter_signal(BusName *n, BusNameState state, BusNameResult f
r = unit_kill_context(UNIT(n), r = unit_kill_context(UNIT(n),
&kill_context, &kill_context,
state != BUSNAME_SIGTERM, state != BUSNAME_SIGTERM ? KILL_KILL : KILL_TERMINATE,
-1, -1,
n->control_pid, n->control_pid,
false); false);

View File

@ -775,7 +775,8 @@ static void mount_enter_signal(Mount *m, MountState state, MountResult f) {
r = unit_kill_context( r = unit_kill_context(
UNIT(m), UNIT(m),
&m->kill_context, &m->kill_context,
state != MOUNT_MOUNTING_SIGTERM && state != MOUNT_UNMOUNTING_SIGTERM && state != MOUNT_REMOUNTING_SIGTERM, (state != MOUNT_MOUNTING_SIGTERM && state != MOUNT_UNMOUNTING_SIGTERM && state != MOUNT_REMOUNTING_SIGTERM) ?
KILL_KILL : KILL_TERMINATE,
-1, -1,
m->control_pid, m->control_pid,
false); false);

View File

@ -243,7 +243,7 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
r = unit_kill_context( r = unit_kill_context(
UNIT(s), UNIT(s),
&s->kill_context, &s->kill_context,
state != SCOPE_STOP_SIGTERM, state != SCOPE_STOP_SIGTERM ? KILL_KILL : KILL_TERMINATE,
-1, -1, false); -1, -1, false);
if (r < 0) if (r < 0)
goto fail; goto fail;

View File

@ -56,6 +56,7 @@ static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = {
[SERVICE_EXITED] = UNIT_ACTIVE, [SERVICE_EXITED] = UNIT_ACTIVE,
[SERVICE_RELOAD] = UNIT_RELOADING, [SERVICE_RELOAD] = UNIT_RELOADING,
[SERVICE_STOP] = UNIT_DEACTIVATING, [SERVICE_STOP] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGABRT] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
[SERVICE_STOP_POST] = UNIT_DEACTIVATING, [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
@ -76,6 +77,7 @@ static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] =
[SERVICE_EXITED] = UNIT_ACTIVE, [SERVICE_EXITED] = UNIT_ACTIVE,
[SERVICE_RELOAD] = UNIT_RELOADING, [SERVICE_RELOAD] = UNIT_RELOADING,
[SERVICE_STOP] = UNIT_DEACTIVATING, [SERVICE_STOP] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGABRT] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING, [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
[SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING, [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
[SERVICE_STOP_POST] = UNIT_DEACTIVATING, [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
@ -663,7 +665,7 @@ static void service_set_state(Service *s, ServiceState state) {
SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
SERVICE_RELOAD, SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST, SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
SERVICE_AUTO_RESTART)) SERVICE_AUTO_RESTART))
s->timer_event_source = sd_event_source_unref(s->timer_event_source); s->timer_event_source = sd_event_source_unref(s->timer_event_source);
@ -672,7 +674,7 @@ static void service_set_state(Service *s, ServiceState state) {
SERVICE_START, SERVICE_START_POST, SERVICE_START, SERVICE_START_POST,
SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RUNNING, SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST, SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) { SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
service_unwatch_main_pid(s); service_unwatch_main_pid(s);
s->main_command = NULL; s->main_command = NULL;
@ -682,7 +684,7 @@ static void service_set_state(Service *s, ServiceState state) {
SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
SERVICE_RELOAD, SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST, SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) { SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
service_unwatch_control_pid(s); service_unwatch_control_pid(s);
s->control_command = NULL; s->control_command = NULL;
@ -696,7 +698,7 @@ static void service_set_state(Service *s, ServiceState state) {
SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RUNNING, SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST, SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) && SERVICE_STOP_SIGABRT, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) &&
!(state == SERVICE_DEAD && UNIT(s)->job)) { !(state == SERVICE_DEAD && UNIT(s)->job)) {
service_close_socket_fd(s); service_close_socket_fd(s);
service_connection_unref(s); service_connection_unref(s);
@ -750,7 +752,7 @@ static int service_coldplug(Unit *u) {
SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
SERVICE_RELOAD, SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST, SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) { SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
usec_t k; usec_t k;
@ -779,7 +781,7 @@ static int service_coldplug(Unit *u) {
SERVICE_START, SERVICE_START_POST, SERVICE_START, SERVICE_START_POST,
SERVICE_RUNNING, SERVICE_RELOAD, SERVICE_RUNNING, SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST, SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) { SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) {
r = unit_watch_pid(UNIT(s), s->main_pid); r = unit_watch_pid(UNIT(s), s->main_pid);
if (r < 0) if (r < 0)
@ -791,7 +793,7 @@ static int service_coldplug(Unit *u) {
SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
SERVICE_RELOAD, SERVICE_RELOAD,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST, SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) { SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
r = unit_watch_pid(UNIT(s), s->control_pid); r = unit_watch_pid(UNIT(s), s->control_pid);
if (r < 0) if (r < 0)
@ -1181,7 +1183,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
r = unit_kill_context( r = unit_kill_context(
UNIT(s), UNIT(s),
&s->kill_context, &s->kill_context,
state != SERVICE_STOP_SIGTERM && state != SERVICE_FINAL_SIGTERM, (state != SERVICE_STOP_SIGTERM && state != SERVICE_FINAL_SIGTERM && state != SERVICE_STOP_SIGABRT) ?
KILL_KILL : (state == SERVICE_STOP_SIGABRT ? KILL_ABORT : KILL_TERMINATE),
s->main_pid, s->main_pid,
s->control_pid, s->control_pid,
s->main_pid_alien); s->main_pid_alien);
@ -1197,7 +1200,7 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
} }
service_set_state(s, state); service_set_state(s, state);
} else if (state == SERVICE_STOP_SIGTERM) } else if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGABRT)
service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS); service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS);
else if (state == SERVICE_STOP_SIGKILL) else if (state == SERVICE_STOP_SIGKILL)
service_enter_stop_post(s, SERVICE_SUCCESS); service_enter_stop_post(s, SERVICE_SUCCESS);
@ -1211,7 +1214,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
fail: fail:
log_warning_unit(UNIT(s)->id, "%s failed to kill processes: %s", UNIT(s)->id, strerror(-r)); log_warning_unit(UNIT(s)->id, "%s failed to kill processes: %s", UNIT(s)->id, strerror(-r));
if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGKILL) if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGKILL ||
state == SERVICE_STOP_SIGABRT)
service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES); service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES);
else else
service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true); service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true);
@ -1637,6 +1641,7 @@ static int service_start(Unit *u) {
/* We cannot fulfill this request right now, try again later /* We cannot fulfill this request right now, try again later
* please! */ * please! */
if (s->state == SERVICE_STOP || if (s->state == SERVICE_STOP ||
s->state == SERVICE_STOP_SIGABRT ||
s->state == SERVICE_STOP_SIGTERM || s->state == SERVICE_STOP_SIGTERM ||
s->state == SERVICE_STOP_SIGKILL || s->state == SERVICE_STOP_SIGKILL ||
s->state == SERVICE_STOP_POST || s->state == SERVICE_STOP_POST ||
@ -1695,6 +1700,7 @@ static int service_stop(Unit *u) {
/* Already on it */ /* Already on it */
if (s->state == SERVICE_STOP || if (s->state == SERVICE_STOP ||
s->state == SERVICE_STOP_SIGABRT ||
s->state == SERVICE_STOP_SIGTERM || s->state == SERVICE_STOP_SIGTERM ||
s->state == SERVICE_STOP_SIGKILL || s->state == SERVICE_STOP_SIGKILL ||
s->state == SERVICE_STOP_POST || s->state == SERVICE_STOP_POST ||
@ -2126,6 +2132,7 @@ static void service_notify_cgroup_empty_event(Unit *u) {
service_enter_running(s, SERVICE_SUCCESS); service_enter_running(s, SERVICE_SUCCESS);
break; break;
case SERVICE_STOP_SIGABRT:
case SERVICE_STOP_SIGTERM: case SERVICE_STOP_SIGTERM:
case SERVICE_STOP_SIGKILL: case SERVICE_STOP_SIGKILL:
@ -2252,6 +2259,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
service_enter_running(s, f); service_enter_running(s, f);
break; break;
case SERVICE_STOP_SIGABRT:
case SERVICE_STOP_SIGTERM: case SERVICE_STOP_SIGTERM:
case SERVICE_STOP_SIGKILL: case SERVICE_STOP_SIGKILL:
@ -2392,6 +2400,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
service_enter_signal(s, SERVICE_STOP_SIGTERM, f); service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
break; break;
case SERVICE_STOP_SIGABRT:
case SERVICE_STOP_SIGTERM: case SERVICE_STOP_SIGTERM:
case SERVICE_STOP_SIGKILL: case SERVICE_STOP_SIGKILL:
if (main_pid_good(s) <= 0) if (main_pid_good(s) <= 0)
@ -2461,6 +2470,12 @@ static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *us
service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT); service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT);
break; break;
case SERVICE_STOP_SIGABRT:
log_warning_unit(UNIT(s)->id,
"%s stop-sigabrt timed out. Terminating.", UNIT(s)->id);
service_enter_signal(s, SERVICE_STOP_SIGTERM, s->result);
break;
case SERVICE_STOP_SIGTERM: case SERVICE_STOP_SIGTERM:
if (s->kill_context.send_sigkill) { if (s->kill_context.send_sigkill) {
log_warning_unit(UNIT(s)->id, "%s stop-sigterm timed out. Killing.", UNIT(s)->id); log_warning_unit(UNIT(s)->id, "%s stop-sigterm timed out. Killing.", UNIT(s)->id);
@ -2528,7 +2543,7 @@ static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void
log_error_unit(UNIT(s)->id, "%s watchdog timeout (limit %s)!", UNIT(s)->id, log_error_unit(UNIT(s)->id, "%s watchdog timeout (limit %s)!", UNIT(s)->id,
format_timespan(t, sizeof(t), s->watchdog_usec, 1)); format_timespan(t, sizeof(t), s->watchdog_usec, 1));
service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_WATCHDOG); service_enter_signal(s, SERVICE_STOP_SIGABRT, SERVICE_FAILURE_WATCHDOG);
return 0; return 0;
} }

View File

@ -39,6 +39,7 @@ typedef enum ServiceState {
SERVICE_EXITED, /* Nothing is running anymore, but RemainAfterExit is true hence this is OK */ SERVICE_EXITED, /* Nothing is running anymore, but RemainAfterExit is true hence this is OK */
SERVICE_RELOAD, SERVICE_RELOAD,
SERVICE_STOP, /* No STOP_PRE state, instead just register multiple STOP executables */ SERVICE_STOP, /* No STOP_PRE state, instead just register multiple STOP executables */
SERVICE_STOP_SIGABRT, /* Watchdog timeout */
SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGTERM,
SERVICE_STOP_SIGKILL, SERVICE_STOP_SIGKILL,
SERVICE_STOP_POST, SERVICE_STOP_POST,

View File

@ -1578,7 +1578,8 @@ static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) {
r = unit_kill_context( r = unit_kill_context(
UNIT(s), UNIT(s),
&s->kill_context, &s->kill_context,
state != SOCKET_STOP_PRE_SIGTERM && state != SOCKET_FINAL_SIGTERM, (state != SOCKET_STOP_PRE_SIGTERM && state != SOCKET_FINAL_SIGTERM) ?
KILL_KILL : KILL_TERMINATE,
-1, -1,
s->control_pid, s->control_pid,
false); false);

View File

@ -687,7 +687,8 @@ static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) {
r = unit_kill_context( r = unit_kill_context(
UNIT(s), UNIT(s),
&s->kill_context, &s->kill_context,
state != SWAP_ACTIVATING_SIGTERM && state != SWAP_DEACTIVATING_SIGTERM, (state != SWAP_ACTIVATING_SIGTERM && state != SWAP_DEACTIVATING_SIGTERM) ?
KILL_KILL : KILL_TERMINATE,
-1, -1,
s->control_pid, s->control_pid,
false); false);

View File

@ -3313,7 +3313,7 @@ int unit_make_transient(Unit *u) {
int unit_kill_context( int unit_kill_context(
Unit *u, Unit *u,
KillContext *c, KillContext *c,
bool sigkill, KillOperation k,
pid_t main_pid, pid_t main_pid,
pid_t control_pid, pid_t control_pid,
bool main_pid_alien) { bool main_pid_alien) {
@ -3326,7 +3326,19 @@ int unit_kill_context(
if (c->kill_mode == KILL_NONE) if (c->kill_mode == KILL_NONE)
return 0; return 0;
sig = sigkill ? SIGKILL : c->kill_signal; switch (k) {
case KILL_KILL:
sig = SIGKILL;
break;
case KILL_ABORT:
sig = SIGABRT;
break;
case KILL_TERMINATE:
sig = c->kill_signal;
break;
default:
assert_not_reached("KillOperation unknown");
}
if (main_pid > 0) { if (main_pid > 0) {
r = kill_and_sigcont(main_pid, sig); r = kill_and_sigcont(main_pid, sig);
@ -3340,7 +3352,7 @@ int unit_kill_context(
if (!main_pid_alien) if (!main_pid_alien)
wait_for_exit = true; wait_for_exit = true;
if (c->send_sighup && !sigkill) if (c->send_sighup && k != KILL_KILL)
kill(main_pid, SIGHUP); kill(main_pid, SIGHUP);
} }
} }
@ -3356,12 +3368,12 @@ int unit_kill_context(
} else { } else {
wait_for_exit = true; wait_for_exit = true;
if (c->send_sighup && !sigkill) if (c->send_sighup && k != KILL_KILL)
kill(control_pid, SIGHUP); kill(control_pid, SIGHUP);
} }
} }
if ((c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && sigkill)) && u->cgroup_path) { if ((c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL)) && u->cgroup_path) {
_cleanup_set_free_ Set *pid_set = NULL; _cleanup_set_free_ Set *pid_set = NULL;
/* Exclude the main/control pids from being killed via the cgroup */ /* Exclude the main/control pids from being killed via the cgroup */
@ -3385,7 +3397,7 @@ int unit_kill_context(
/* wait_for_exit = true; */ /* wait_for_exit = true; */
if (c->send_sighup && !sigkill) { if (c->send_sighup && k != KILL_KILL) {
set_free(pid_set); set_free(pid_set);
pid_set = unit_pid_set(main_pid, control_pid); pid_set = unit_pid_set(main_pid, control_pid);

View File

@ -54,6 +54,12 @@ enum UnitActiveState {
_UNIT_ACTIVE_STATE_INVALID = -1 _UNIT_ACTIVE_STATE_INVALID = -1
}; };
/* Selects which signal unit_kill_context() delivers to a unit's processes. */
typedef enum KillOperation {
/* Send the kill context's configured kill_signal. */
KILL_TERMINATE,
/* Send SIGKILL. */
KILL_KILL,
/* Send SIGABRT; service_enter_signal() picks this for SERVICE_STOP_SIGABRT,
 * the state entered on a watchdog timeout.
 * NOTE(review): unit_kill_context() gates the follow-up SIGHUP on
 * k != KILL_KILL, so KILL_ABORT also gets SIGHUP when send_sighup is set;
 * confirm that is intended. */
KILL_ABORT,
} KillOperation;
static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) { static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) {
return t == UNIT_ACTIVE || t == UNIT_RELOADING; return t == UNIT_ACTIVE || t == UNIT_RELOADING;
} }
@ -576,7 +582,7 @@ int unit_write_drop_in_private_format(Unit *u, UnitSetPropertiesMode mode, const
int unit_remove_drop_in(Unit *u, UnitSetPropertiesMode mode, const char *name); int unit_remove_drop_in(Unit *u, UnitSetPropertiesMode mode, const char *name);
int unit_kill_context(Unit *u, KillContext *c, bool sigkill, pid_t main_pid, pid_t control_pid, bool main_pid_alien); int unit_kill_context(Unit *u, KillContext *c, KillOperation k, pid_t main_pid, pid_t control_pid, bool main_pid_alien);
int unit_make_transient(Unit *u); int unit_make_transient(Unit *u);