service: handle abort stops with dedicated timeout

When shooting down a service with SIGABRT the user might want to have a
much longer stop timeout than on regular stops/shutdowns. Especially in
the face of short stop timeouts the time might not be sufficient to
write huge core dumps before the service is killed.

This commit adds a dedicated (Default)TimeoutAbortSec= timer that is
used when stopping a service via SIGABRT. In all other cases the
existing TimeoutStopSec= is used. The timer value is unset by default
to skip the special handling and use TimeoutStopSec= for state
'stop-watchdog' to keep the old behaviour.

If the service is in state 'stop-watchdog' and the service should be
stopped explicitly we still go to 'stop-sigterm' and re-apply the usual
TimeoutStopSec= timeout.
This commit is contained in:
Jan Klötzke 2017-11-29 07:43:44 +01:00 committed by Lennart Poettering
parent 1ace223ca7
commit dc653bf487
16 changed files with 190 additions and 9 deletions

View File

@ -286,6 +286,7 @@ Most service unit settings are available for transient units.
✓ RestartSec=
✓ TimeoutStartSec=
✓ TimeoutStopSec=
✓ TimeoutAbortSec=
✓ TimeoutSec=
✓ RuntimeMaxSec=
✓ WatchdogSec=

View File

@ -239,13 +239,15 @@
<varlistentry>
<term><varname>DefaultTimeoutStartSec=</varname></term>
<term><varname>DefaultTimeoutStopSec=</varname></term>
<term><varname>DefaultTimeoutAbortSec=</varname></term>
<term><varname>DefaultRestartSec=</varname></term>
<listitem><para>Configures the default timeouts for starting
and stopping of units, as well as the default time to sleep
<listitem><para>Configures the default timeouts for starting,
stopping and aborting of units, as well as the default time to sleep
between automatic restarts of units, as configured per-unit in
<varname>TimeoutStartSec=</varname>,
<varname>TimeoutStopSec=</varname> and
<varname>TimeoutStopSec=</varname>,
<varname>TimeoutAbortSec=</varname> and
<varname>RestartSec=</varname> (for services, see
<citerefentry><refentrytitle>systemd.service</refentrytitle><manvolnum>5</manvolnum></citerefentry>
for details on the per-unit settings). Disabled by default, when
@ -255,7 +257,9 @@
<varname>TimeoutSec=</varname>
value. <varname>DefaultTimeoutStartSec=</varname> and
<varname>DefaultTimeoutStopSec=</varname> default to
90s. <varname>DefaultRestartSec=</varname> defaults to
90s. <varname>DefaultTimeoutAbortSec=</varname> is not set by default
so that all units fall back to <varname>TimeoutStopSec=</varname>.
<varname>DefaultRestartSec=</varname> defaults to
100ms.</para></listitem>
</varlistentry>

View File

@ -573,6 +573,35 @@
</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>TimeoutAbortSec=</varname></term>
<listitem><para>This option configures the time to wait for the service to terminate when it was aborted due to a
watchdog timeout (see <varname>WatchdogSec=</varname>). If the service has a short <varname>TimeoutStopSec=</varname>
this option can be used to give the system more time to write a core dump of the service. Upon expiration the service
will be forcibly terminated by <constant>SIGKILL</constant> (see <varname>KillMode=</varname> in
<citerefentry><refentrytitle>systemd.kill</refentrytitle><manvolnum>5</manvolnum></citerefentry>). The core file will
be truncated in this case. Use <varname>TimeoutAbortSec=</varname> to set a sensible timeout for the core dumping per
service that is large enough to write all expected data while also being short enough to handle the service failure
in due time.
</para>
<para>Takes a unit-less value in seconds, or a time span value such as "5min 20s". Pass an empty value to skip
the dedicated watchdog abort timeout handling and fall back to <varname>TimeoutStopSec=</varname>. Pass
<literal>infinity</literal> to disable the timeout logic. Defaults to <varname>DefaultTimeoutAbortSec=</varname> from
the manager configuration file (see
<citerefentry><refentrytitle>systemd-system.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>).
</para>
<para>If a service of <varname>Type=notify</varname> handles <constant>SIGABRT</constant> itself (instead of relying
on the kernel to write a core dump) it can send <literal>EXTEND_TIMEOUT_USEC=…</literal> to
extend the abort time beyond <varname>TimeoutAbortSec=</varname>. The first receipt of this message
must occur before <varname>TimeoutAbortSec=</varname> is exceeded, and once the abort time has extended beyond
<varname>TimeoutAbortSec=</varname>, the service manager will allow the service to continue to abort, provided
the service repeats <literal>EXTEND_TIMEOUT_USEC=…</literal> within the interval specified, or terminates itself
(see <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>).
</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>TimeoutSec=</varname></term>
<listitem><para>A shorthand for configuring both

View File

@ -287,6 +287,27 @@ static int property_set_runtime_watchdog(
return watchdog_set_timeout(t);
}
/* D-Bus property getter backing "DefaultTimeoutAbortUSec" on the manager object.
 *
 * userdata is the Manager. The value appended to the reply (signature "t") is whatever
 * manager_default_timeout_abort_usec() reports for that manager, i.e. the effective
 * default rather than the raw struct field. Returns the result of sd_bus_message_append(). */
static int property_get_default_timeout_abort_usec(
                sd_bus *bus,
                const char *path,
                const char *interface,
                const char *property,
                sd_bus_message *reply,
                void *userdata,
                sd_bus_error *error) {

        Manager *m = userdata;

        assert(bus);
        assert(reply);
        assert(m);

        /* Go through the accessor instead of reading the field, so the "unset" case is resolved. */
        return sd_bus_message_append(reply, "t", manager_default_timeout_abort_usec(m));
}
static int bus_get_unit_by_name(Manager *m, sd_bus_message *message, const char *name, Unit **ret_unit, sd_bus_error *error) {
Unit *u;
int r;
@ -2410,6 +2431,7 @@ const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_PROPERTY("DefaultTimerAccuracyUSec", "t", bus_property_get_usec, offsetof(Manager, default_timer_accuracy_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultTimeoutStartUSec", "t", bus_property_get_usec, offsetof(Manager, default_timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultTimeoutStopUSec", "t", bus_property_get_usec, offsetof(Manager, default_timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultTimeoutAbortUSec", "t", property_get_default_timeout_abort_usec, 0, 0),
SD_BUS_PROPERTY("DefaultRestartUSec", "t", bus_property_get_usec, offsetof(Manager, default_restart_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultStartLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Manager, default_start_limit_interval), SD_BUS_VTABLE_PROPERTY_CONST),
/* The following two items are obsolete alias */

View File

@ -29,6 +29,27 @@ static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_restart, service_restart, Servi
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_notify_access, notify_access, NotifyAccess);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_emergency_action, emergency_action, EmergencyAction);
/* D-Bus property getter backing "TimeoutAbortUSec" on service units.
 *
 * userdata is the Service. The value appended to the reply (signature "t") is the one
 * computed by service_timeout_abort_usec(), i.e. the effective abort timeout rather than
 * the raw struct field. Returns the result of sd_bus_message_append(). */
static int property_get_timeout_abort_usec(
                sd_bus *bus,
                const char *path,
                const char *interface,
                const char *property,
                sd_bus_message *reply,
                void *userdata,
                sd_bus_error *error) {

        Service *s = userdata;

        assert(bus);
        assert(reply);
        assert(s);

        /* Go through the accessor instead of reading the field, so the "unset" case is resolved. */
        return sd_bus_message_append(reply, "t", service_timeout_abort_usec(s));
}
static int property_get_exit_status_set(
sd_bus *bus,
const char *path,
@ -103,6 +124,7 @@ const sd_bus_vtable bus_service_vtable[] = {
SD_BUS_PROPERTY("RestartUSec", "t", bus_property_get_usec, offsetof(Service, restart_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("TimeoutStartUSec", "t", bus_property_get_usec, offsetof(Service, timeout_start_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Service, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("TimeoutAbortUSec", "t", property_get_timeout_abort_usec, 0, 0),
SD_BUS_PROPERTY("RuntimeMaxUSec", "t", bus_property_get_usec, offsetof(Service, runtime_max_usec), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("WatchdogUSec", "t", bus_property_get_usec, offsetof(Service, watchdog_usec), SD_BUS_VTABLE_PROPERTY_CONST),
BUS_PROPERTY_DUAL_TIMESTAMP("WatchdogTimestamp", offsetof(Service, watchdog_timestamp), 0),

View File

@ -308,6 +308,7 @@ Service.RestartSec, config_parse_sec, 0,
Service.TimeoutSec, config_parse_service_timeout, 0, 0
Service.TimeoutStartSec, config_parse_service_timeout, 0, 0
Service.TimeoutStopSec, config_parse_sec_fix_0, 0, offsetof(Service, timeout_stop_usec)
Service.TimeoutAbortSec, config_parse_service_timeout_abort, 0, 0
Service.RuntimeMaxSec, config_parse_sec, 0, offsetof(Service, runtime_max_usec)
Service.WatchdogSec, config_parse_sec, 0, offsetof(Service, watchdog_usec)
m4_dnl The following five only exist for compatibility, they moved into Unit, see above

View File

@ -1894,6 +1894,42 @@ int config_parse_service_timeout(
return 0;
}
/* Config parser for the per-service TimeoutAbortSec= setting.
 *
 * userdata is the Service being configured. Leading whitespace in rvalue is skipped first:
 *   - an empty remainder clears s->timeout_abort_set;
 *   - otherwise the remainder is handed to parse_sec(), which writes into
 *     s->timeout_abort_usec; on success s->timeout_abort_set is raised, on failure a
 *     syntax error is logged and the flag is left as it was.
 *
 * Always returns 0, so a bad value never aborts loading of the unit file. */
int config_parse_service_timeout_abort(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        Service *s = userdata;
        const char *p;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);
        assert(s);

        /* Position just past any leading whitespace; this is what gets parsed and logged. */
        p = rvalue + strspn(rvalue, WHITESPACE);

        if (!isempty(p)) {
                r = parse_sec(p, &s->timeout_abort_usec);
                if (r < 0) {
                        log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse TimeoutAbortSec= setting, ignoring: %s", p);
                        return 0;
                }

                s->timeout_abort_set = true;
        } else
                /* Empty assignment: drop the dedicated abort timeout. */
                s->timeout_abort_set = false;

        return 0;
}
int config_parse_sec_fix_0(
const char *unit,
const char *filename,

View File

@ -24,6 +24,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_exec_nice);
CONFIG_PARSER_PROTOTYPE(config_parse_exec_oom_score_adjust);
CONFIG_PARSER_PROTOTYPE(config_parse_exec);
CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout);
CONFIG_PARSER_PROTOTYPE(config_parse_service_timeout_abort);
CONFIG_PARSER_PROTOTYPE(config_parse_service_type);
CONFIG_PARSER_PROTOTYPE(config_parse_service_restart);
CONFIG_PARSER_PROTOTYPE(config_parse_socket_bindtodevice);

View File

@ -112,6 +112,8 @@ static ExecOutput arg_default_std_error = EXEC_OUTPUT_INHERIT;
static usec_t arg_default_restart_usec = DEFAULT_RESTART_USEC;
static usec_t arg_default_timeout_start_usec = DEFAULT_TIMEOUT_USEC;
static usec_t arg_default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC;
static usec_t arg_default_timeout_abort_usec = DEFAULT_TIMEOUT_USEC;
static bool arg_default_timeout_abort_set = false;
static usec_t arg_default_start_limit_interval = DEFAULT_START_LIMIT_INTERVAL;
static unsigned arg_default_start_limit_burst = DEFAULT_START_LIMIT_BURST;
static usec_t arg_runtime_watchdog = 0;
@ -668,6 +670,40 @@ static int config_parse_crash_chvt(
return 0;
}
/* Config parser for the manager-wide DefaultTimeoutAbortSec= setting.
 *
 * Operates on the file-scope variables arg_default_timeout_abort_usec and
 * arg_default_timeout_abort_set; the data/userdata pointers are not used. Leading
 * whitespace in rvalue is skipped first:
 *   - an empty remainder clears arg_default_timeout_abort_set;
 *   - otherwise the remainder is handed to parse_sec(), which writes into
 *     arg_default_timeout_abort_usec; on success the flag is raised, on failure a
 *     syntax error is logged and the flag is left as it was.
 *
 * Always returns 0. */
static int config_parse_timeout_abort(
                const char* unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        const char *p;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        /* Position just past any leading whitespace; this is what gets parsed and logged. */
        p = rvalue + strspn(rvalue, WHITESPACE);

        if (!isempty(p)) {
                r = parse_sec(p, &arg_default_timeout_abort_usec);
                if (r < 0) {
                        log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse DefaultTimeoutAbortSec= setting, ignoring: %s", p);
                        return 0;
                }

                arg_default_timeout_abort_set = true;
        } else
                /* Empty assignment: drop the dedicated default abort timeout. */
                arg_default_timeout_abort_set = false;

        return 0;
}
static int parse_config_file(void) {
const ConfigTableItem items[] = {
@ -697,6 +733,7 @@ static int parse_config_file(void) {
{ "Manager", "DefaultStandardError", config_parse_output_restricted,0, &arg_default_std_error },
{ "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_default_timeout_start_usec },
{ "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_default_timeout_stop_usec },
{ "Manager", "DefaultTimeoutAbortSec", config_parse_timeout_abort, 0, NULL },
{ "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_default_restart_usec },
{ "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_default_start_limit_interval }, /* obsolete alias */
{ "Manager", "DefaultStartLimitIntervalSec",config_parse_sec, 0, &arg_default_start_limit_interval },
@ -765,6 +802,8 @@ static void set_manager_defaults(Manager *m) {
m->default_std_error = arg_default_std_error;
m->default_timeout_start_usec = arg_default_timeout_start_usec;
m->default_timeout_stop_usec = arg_default_timeout_stop_usec;
m->default_timeout_abort_usec = arg_default_timeout_abort_usec;
m->default_timeout_abort_set = arg_default_timeout_abort_set;
m->default_restart_usec = arg_default_restart_usec;
m->default_start_limit_interval = arg_default_start_limit_interval;
m->default_start_limit_burst = arg_default_start_limit_burst;

View File

@ -330,6 +330,8 @@ struct Manager {
ExecOutput default_std_output, default_std_error;
usec_t default_restart_usec, default_timeout_start_usec, default_timeout_stop_usec;
usec_t default_timeout_abort_usec;
bool default_timeout_abort_set;
usec_t default_start_limit_interval;
unsigned default_start_limit_burst;
@ -417,6 +419,10 @@ struct Manager {
bool honor_device_enumeration;
};
/* Returns the manager's effective default abort timeout: the dedicated
 * default_timeout_abort_usec when default_timeout_abort_set is true, otherwise
 * default_timeout_stop_usec. */
static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
        if (m->default_timeout_abort_set)
                return m->default_timeout_abort_usec;

        /* No dedicated abort timeout configured: reuse the stop timeout. */
        return m->default_timeout_stop_usec;
}
#define MANAGER_IS_SYSTEM(m) ((m)->unit_file_scope == UNIT_FILE_SYSTEM)
#define MANAGER_IS_USER(m) ((m)->unit_file_scope != UNIT_FILE_SYSTEM)

View File

@ -99,6 +99,8 @@ static void service_init(Unit *u) {
s->timeout_start_usec = u->manager->default_timeout_start_usec;
s->timeout_stop_usec = u->manager->default_timeout_stop_usec;
s->timeout_abort_usec = u->manager->default_timeout_abort_usec;
s->timeout_abort_set = u->manager->default_timeout_abort_set;
s->restart_usec = u->manager->default_restart_usec;
s->runtime_max_usec = USEC_INFINITY;
s->type = _SERVICE_TYPE_INVALID;
@ -789,7 +791,7 @@ static int service_load(Unit *u) {
static void service_dump(Unit *u, FILE *f, const char *prefix) {
char buf_restart[FORMAT_TIMESPAN_MAX], buf_start[FORMAT_TIMESPAN_MAX], buf_stop[FORMAT_TIMESPAN_MAX];
char buf_runtime[FORMAT_TIMESPAN_MAX], buf_watchdog[FORMAT_TIMESPAN_MAX];
char buf_runtime[FORMAT_TIMESPAN_MAX], buf_watchdog[FORMAT_TIMESPAN_MAX], buf_abort[FORMAT_TIMESPAN_MAX];
ServiceExecCommand c;
Service *s = SERVICE(u);
const char *prefix2;
@ -860,11 +862,15 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) {
"%sRestartSec: %s\n"
"%sTimeoutStartSec: %s\n"
"%sTimeoutStopSec: %s\n"
"%sTimeoutAbortSec: %s\n"
"%sRuntimeMaxSec: %s\n"
"%sWatchdogSec: %s\n",
prefix, format_timespan(buf_restart, sizeof(buf_restart), s->restart_usec, USEC_PER_SEC),
prefix, format_timespan(buf_start, sizeof(buf_start), s->timeout_start_usec, USEC_PER_SEC),
prefix, format_timespan(buf_stop, sizeof(buf_stop), s->timeout_stop_usec, USEC_PER_SEC),
prefix, s->timeout_abort_set
? format_timespan(buf_abort, sizeof(buf_abort), s->timeout_abort_usec, USEC_PER_SEC)
: "",
prefix, format_timespan(buf_runtime, sizeof(buf_runtime), s->runtime_max_usec, USEC_PER_SEC),
prefix, format_timespan(buf_watchdog, sizeof(buf_watchdog), s->watchdog_usec, USEC_PER_SEC));
@ -1132,7 +1138,6 @@ static usec_t service_coldplug_timeout(Service *s) {
return usec_add(UNIT(s)->active_enter_timestamp.monotonic, s->runtime_max_usec);
case SERVICE_STOP:
case SERVICE_STOP_WATCHDOG:
case SERVICE_STOP_SIGTERM:
case SERVICE_STOP_SIGKILL:
case SERVICE_STOP_POST:
@ -1140,6 +1145,9 @@ static usec_t service_coldplug_timeout(Service *s) {
case SERVICE_FINAL_SIGKILL:
return usec_add(UNIT(s)->state_change_timestamp.monotonic, s->timeout_stop_usec);
case SERVICE_STOP_WATCHDOG:
return usec_add(UNIT(s)->state_change_timestamp.monotonic, service_timeout_abort_usec(s));
case SERVICE_AUTO_RESTART:
return usec_add(UNIT(s)->inactive_enter_timestamp.monotonic, s->restart_usec);
@ -1857,7 +1865,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
goto fail;
if (r > 0) {
r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec));
r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC),
state == SERVICE_STOP_WATCHDOG ? service_timeout_abort_usec(s) : s->timeout_stop_usec));
if (r < 0)
goto fail;
@ -2428,7 +2437,7 @@ static int service_stop(Unit *u) {
/* Already on it */
if (IN_SET(s->state,
SERVICE_STOP, SERVICE_STOP_WATCHDOG, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))
return 0;
@ -2440,7 +2449,7 @@ static int service_stop(Unit *u) {
/* If there's already something running we go directly into
* kill mode. */
if (IN_SET(s->state, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RELOAD)) {
if (IN_SET(s->state, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RELOAD, SERVICE_STOP_WATCHDOG)) {
service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_SUCCESS);
return 0;
}

View File

@ -97,6 +97,8 @@ struct Service {
usec_t restart_usec;
usec_t timeout_start_usec;
usec_t timeout_stop_usec;
usec_t timeout_abort_usec;
usec_t timeout_abort_set;
usec_t runtime_max_usec;
dual_timestamp watchdog_timestamp;
@ -189,6 +191,10 @@ struct Service {
OOMPolicy oom_policy;
};
/* Returns the service's effective abort timeout: the dedicated timeout_abort_usec when
 * timeout_abort_set is non-zero, otherwise timeout_stop_usec. */
static inline usec_t service_timeout_abort_usec(Service *s) {
        if (s->timeout_abort_set)
                return s->timeout_abort_usec;

        /* No dedicated abort timeout configured: reuse the stop timeout. */
        return s->timeout_stop_usec;
}
extern const UnitVTable service_vtable;
int service_set_socket_fd(Service *s, int fd, struct Socket *socket, bool selinux_context_net);

View File

@ -35,6 +35,7 @@
#DefaultStandardError=inherit
#DefaultTimeoutStartSec=90s
#DefaultTimeoutStopSec=90s
#DefaultTimeoutAbortSec=
#DefaultRestartSec=100ms
#DefaultStartLimitIntervalSec=10s
#DefaultStartLimitBurst=5

View File

@ -22,6 +22,7 @@
#DefaultStandardError=inherit
#DefaultTimeoutStartSec=90s
#DefaultTimeoutStopSec=90s
#DefaultTimeoutAbortSec=
#DefaultRestartSec=100ms
#DefaultStartLimitIntervalSec=10s
#DefaultStartLimitBurst=5

View File

@ -128,6 +128,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_path);
CONFIG_PARSER_PROTOTYPE(config_parse_strv);
CONFIG_PARSER_PROTOTYPE(config_parse_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_sec_def_infinity);
CONFIG_PARSER_PROTOTYPE(config_parse_sec_def_unset);
CONFIG_PARSER_PROTOTYPE(config_parse_nsec);
CONFIG_PARSER_PROTOTYPE(config_parse_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_warn_compat);

View File

@ -227,6 +227,7 @@ TimeoutIdleSec=
TimeoutSec=
TimeoutStartSec=
TimeoutStopSec=
TimeoutAbortSec=
Transparent=
TriggerLimitBurst=
TriggerLimitIntervalSec=
@ -686,6 +687,7 @@ DefaultTasksAccounting=
DefaultTasksMax=
DefaultTimeoutStartSec=
DefaultTimeoutStopSec=
DefaultTimeoutAbortSec=
DefaultTimerAccuracySec=
DumpCore=
HibernateMode=