Systemd/src/core/scope.c

621 lines
18 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: LGPL-2.1+ */
#include <errno.h>
#include <unistd.h>
#include "alloc-util.h"
#include "dbus-scope.h"
#include "load-dropin.h"
#include "log.h"
#include "scope.h"
#include "serialize.h"
#include "special.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
#include "unit-name.h"
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
#include "unit.h"
static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
[SCOPE_DEAD] = UNIT_INACTIVE,
[SCOPE_RUNNING] = UNIT_ACTIVE,
[SCOPE_ABANDONED] = UNIT_ACTIVE,
[SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING,
[SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING,
[SCOPE_FAILED] = UNIT_FAILED
};
static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
static void scope_init(Unit *u) {
Scope *s = SCOPE(u);
assert(u);
assert(u->load_state == UNIT_STUB);
s->timeout_stop_usec = u->manager->default_timeout_stop_usec;
u->ignore_on_isolate = true;
}
static void scope_done(Unit *u) {
Scope *s = SCOPE(u);
assert(u);
s->controller = mfree(s->controller);
s->controller_track = sd_bus_track_unref(s->controller_track);
s->timer_event_source = sd_event_source_unref(s->timer_event_source);
}
static int scope_arm_timer(Scope *s, usec_t usec) {
int r;
assert(s);
if (s->timer_event_source) {
r = sd_event_source_set_time(s->timer_event_source, usec);
if (r < 0)
return r;
return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT);
}
if (usec == USEC_INFINITY)
return 0;
r = sd_event_add_time(
UNIT(s)->manager->event,
&s->timer_event_source,
CLOCK_MONOTONIC,
usec, 0,
scope_dispatch_timer, s);
2015-04-29 16:05:32 +02:00
if (r < 0)
return r;
(void) sd_event_source_set_description(s->timer_event_source, "scope-timer");
return 0;
}
static void scope_set_state(Scope *s, ScopeState state) {
ScopeState old_state;
assert(s);
old_state = s->state;
s->state = state;
if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
s->timer_event_source = sd_event_source_unref(s->timer_event_source);
core: rework how we track service and scope PIDs This reworks how systemd tracks processes on cgroupv1 systems where cgroup notification is not reliable. Previously, whenever we had reason to believe that new processes showed up or got removed we'd scan the cgroup of the scope or service unit for new processes, and would tidy up the list of PIDs previously watched. This scanning is relatively slow, and does not scale well. With this change behaviour is changed: instead of scanning for new/removed processes right away we do this work in a per-unit deferred event loop job. This event source is scheduled at a very low priority, so that it is executed when we have time but does not starve other event sources. This has two benefits: this expensive work is coalesced, if events happen in quick succession, and we won't delay SIGCHLD handling for too long. This patch basically replaces all direct invocation of unit_watch_all_pids() in scope.c and service.c with invocations of the new unit_enqueue_rewatch_pids() call which just enqueues a request of watching/tidying up the PID sets (with one exception: in scope_enter_signal() and service_enter_signal() we'll still do unit_watch_all_pids() synchronously first, since we really want to know all processes we are about to kill so that we can track them properly. Moreover, all direct invocations of unit_tidy_watch_pids() and unit_synthesize_cgroup_empty_event() are removed too, when the unit_enqueue_rewatch_pids() call is invoked, as the queued job will run those operations too. All of this is done on cgroupsv1 systems only, and is disabled on cgroupsv2 systems as cgroup-empty notifications are reliable there, and we do not need SIGCHLD events to track processes there. Fixes: #9138
2018-05-31 15:41:59 +02:00
if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) {
unit_unwatch_all_pids(UNIT(s));
core: rework how we track service and scope PIDs This reworks how systemd tracks processes on cgroupv1 systems where cgroup notification is not reliable. Previously, whenever we had reason to believe that new processes showed up or got removed we'd scan the cgroup of the scope or service unit for new processes, and would tidy up the list of PIDs previously watched. This scanning is relatively slow, and does not scale well. With this change behaviour is changed: instead of scanning for new/removed processes right away we do this work in a per-unit deferred event loop job. This event source is scheduled at a very low priority, so that it is executed when we have time but does not starve other event sources. This has two benefits: this expensive work is coalesced, if events happen in quick succession, and we won't delay SIGCHLD handling for too long. This patch basically replaces all direct invocation of unit_watch_all_pids() in scope.c and service.c with invocations of the new unit_enqueue_rewatch_pids() call which just enqueues a request of watching/tidying up the PID sets (with one exception: in scope_enter_signal() and service_enter_signal() we'll still do unit_watch_all_pids() synchronously first, since we really want to know all processes we are about to kill so that we can track them properly. Moreover, all direct invocations of unit_tidy_watch_pids() and unit_synthesize_cgroup_empty_event() are removed too, when the unit_enqueue_rewatch_pids() call is invoked, as the queued job will run those operations too. All of this is done on cgroupsv1 systems only, and is disabled on cgroupsv2 systems as cgroup-empty notifications are reliable there, and we do not need SIGCHLD events to track processes there. Fixes: #9138
2018-05-31 15:41:59 +02:00
unit_dequeue_rewatch_pids(UNIT(s));
}
if (state != old_state)
log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state));
unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], 0);
}
static int scope_add_default_dependencies(Scope *s) {
int r;
assert(s);
if (!UNIT(s)->default_dependencies)
return 0;
/* Make sure scopes are unloaded on shutdown */
r = unit_add_two_dependencies_by_name(
UNIT(s),
UNIT_BEFORE, UNIT_CONFLICTS,
SPECIAL_SHUTDOWN_TARGET, true,
UNIT_DEPENDENCY_DEFAULT);
if (r < 0)
return r;
return 0;
}
static int scope_verify(Scope *s) {
assert(s);
if (UNIT(s)->load_state != UNIT_LOADED)
return 0;
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
if (set_isempty(UNIT(s)->pids) &&
!MANAGER_IS_RELOADING(UNIT(s)->manager) &&
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
!unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) {
core,network: major per-object logging rework This changes log_unit_info() (and friends) to take a real Unit* object insted of just a unit name as parameter. The call will now prefix all logged messages with the unit name, thus allowing the unit name to be dropped from the various passed romat strings, simplifying invocations drastically, and unifying log output across messages. Also, UNIT= vs. USER_UNIT= is now derived from the Manager object attached to the Unit object, instead of getpid(). This has the benefit of correcting the field for --test runs. Also contains a couple of other logging improvements: - Drops a couple of strerror() invocations in favour of using %m. - Not only .mount units now warn if a symlinks exist for the mount point already, .automount units do that too, now. - A few invocations of log_struct() that didn't actually pass any additional structured data have been replaced by simpler invocations of log_unit_info() and friends. - For structured data a new LOG_UNIT_MESSAGE() macro has been added, that works like LOG_MESSAGE() but prefixes the message with the unit name. Similar, there's now LOG_LINK_MESSAGE() and LOG_NETDEV_MESSAGE(). - For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(), LOG_NETDEV_INTERFACE() macros have been added that generate the necessary per object fields. The old log_unit_struct() call has been removed in favour of these new macros used in raw log_struct() invocations. In addition to removing one more function call this allows generated structured log messages that contain two object fields, as necessary for example for network interfaces that are joined into another network interface, and whose messages shall be indexed by both. - The LOG_ERRNO() macro has been removed, in favour of log_struct_errno(). The latter has the benefit of ensuring that %m in format strings is properly resolved to the specified error number. - A number of logging messages have been converted to use log_unit_info() instead of log_info() - The client code in sysv-generator no longer #includes core code from src/core/. - log_unit_full_errno() has been removed, log_unit_full() instead takes an errno now, too. - log_unit_info(), log_link_info(), log_netdev_info() and friends, now avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
log_unit_error(UNIT(s), "Scope has no PIDs. Refusing.");
return -ENOENT;
}
return 0;
}
static int scope_load_init_scope(Unit *u) {
assert(u);
if (!unit_has_name(u, SPECIAL_INIT_SCOPE))
return 0;
u->transient = true;
u->perpetual = true;
/* init.scope is a bit special, as it has to stick around forever. Because of its special semantics we
* synthesize it here, instead of relying on the unit file on disk. */
u->default_dependencies = false;
/* Prettify things, if we can. */
if (!u->description)
u->description = strdup("System and Service Manager");
if (!u->documentation)
(void) strv_extend(&u->documentation, "man:systemd(1)");
return 1;
}
static int scope_load(Unit *u) {
Scope *s = SCOPE(u);
int r;
assert(s);
assert(u->load_state == UNIT_STUB);
if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
/* Refuse to load non-transient scope units, but allow them while reloading. */
return -ENOENT;
r = scope_load_init_scope(u);
if (r < 0)
return r;
r = unit_load_fragment_and_dropin_optional(u);
if (r < 0)
return r;
if (u->load_state == UNIT_LOADED) {
r = unit_patch_contexts(u);
if (r < 0)
return r;
r = unit_set_default_slice(u);
if (r < 0)
return r;
r = scope_add_default_dependencies(s);
if (r < 0)
return r;
}
return scope_verify(s);
}
static int scope_coldplug(Unit *u) {
Scope *s = SCOPE(u);
int r;
assert(s);
assert(s->state == SCOPE_DEAD);
if (s->deserialized_state == s->state)
return 0;
if (IN_SET(s->deserialized_state, SCOPE_STOP_SIGKILL, SCOPE_STOP_SIGTERM)) {
r = scope_arm_timer(s, usec_add(u->state_change_timestamp.monotonic, s->timeout_stop_usec));
if (r < 0)
return r;
}
if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED))
core: rework how we track service and scope PIDs This reworks how systemd tracks processes on cgroupv1 systems where cgroup notification is not reliable. Previously, whenever we had reason to believe that new processes showed up or got removed we'd scan the cgroup of the scope or service unit for new processes, and would tidy up the list of PIDs previously watched. This scanning is relatively slow, and does not scale well. With this change behaviour is changed: instead of scanning for new/removed processes right away we do this work in a per-unit deferred event loop job. This event source is scheduled at a very low priority, so that it is executed when we have time but does not starve other event sources. This has two benefits: this expensive work is coalesced, if events happen in quick succession, and we won't delay SIGCHLD handling for too long. This patch basically replaces all direct invocation of unit_watch_all_pids() in scope.c and service.c with invocations of the new unit_enqueue_rewatch_pids() call which just enqueues a request of watching/tidying up the PID sets (with one exception: in scope_enter_signal() and service_enter_signal() we'll still do unit_watch_all_pids() synchronously first, since we really want to know all processes we are about to kill so that we can track them properly. Moreover, all direct invocations of unit_tidy_watch_pids() and unit_synthesize_cgroup_empty_event() are removed too, when the unit_enqueue_rewatch_pids() call is invoked, as the queued job will run those operations too. All of this is done on cgroupsv1 systems only, and is disabled on cgroupsv2 systems as cgroup-empty notifications are reliable there, and we do not need SIGCHLD events to track processes there. Fixes: #9138
2018-05-31 15:41:59 +02:00
(void) unit_enqueue_rewatch_pids(u);
bus_scope_track_controller(s);
scope_set_state(s, s->deserialized_state);
return 0;
}
static void scope_dump(Unit *u, FILE *f, const char *prefix) {
Scope *s = SCOPE(u);
assert(s);
assert(f);
fprintf(f,
"%sScope State: %s\n"
"%sResult: %s\n",
prefix, scope_state_to_string(s->state),
prefix, scope_result_to_string(s->result));
cgroup_context_dump(&s->cgroup_context, f, prefix);
kill_context_dump(&s->kill_context, f, prefix);
}
static void scope_enter_dead(Scope *s, ScopeResult f) {
assert(s);
if (s->result == SCOPE_SUCCESS)
s->result = f;
unit_log_result(UNIT(s), s->result == SCOPE_SUCCESS, scope_result_to_string(s->result));
scope_set_state(s, s->result != SCOPE_SUCCESS ? SCOPE_FAILED : SCOPE_DEAD);
}
static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
bool skip_signal = false;
int r;
assert(s);
if (s->result == SCOPE_SUCCESS)
s->result = f;
core: rework how we track service and scope PIDs This reworks how systemd tracks processes on cgroupv1 systems where cgroup notification is not reliable. Previously, whenever we had reason to believe that new processes showed up or got removed we'd scan the cgroup of the scope or service unit for new processes, and would tidy up the list of PIDs previously watched. This scanning is relatively slow, and does not scale well. With this change behaviour is changed: instead of scanning for new/removed processes right away we do this work in a per-unit deferred event loop job. This event source is scheduled at a very low priority, so that it is executed when we have time but does not starve other event sources. This has two benefits: this expensive work is coalesced, if events happen in quick succession, and we won't delay SIGCHLD handling for too long. This patch basically replaces all direct invocation of unit_watch_all_pids() in scope.c and service.c with invocations of the new unit_enqueue_rewatch_pids() call which just enqueues a request of watching/tidying up the PID sets (with one exception: in scope_enter_signal() and service_enter_signal() we'll still do unit_watch_all_pids() synchronously first, since we really want to know all processes we are about to kill so that we can track them properly. Moreover, all direct invocations of unit_tidy_watch_pids() and unit_synthesize_cgroup_empty_event() are removed too, when the unit_enqueue_rewatch_pids() call is invoked, as the queued job will run those operations too. All of this is done on cgroupsv1 systems only, and is disabled on cgroupsv2 systems as cgroup-empty notifications are reliable there, and we do not need SIGCHLD events to track processes there. Fixes: #9138
2018-05-31 15:41:59 +02:00
/* Before sending any signal, make sure we track all members of this cgroup */
(void) unit_watch_all_pids(UNIT(s));
/* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have
* died now */
(void) unit_enqueue_rewatch_pids(UNIT(s));
/* If we have a controller set let's ask the controller nicely to terminate the scope, instead of us going
* directly into SIGTERM berserk mode */
if (state == SCOPE_STOP_SIGTERM)
skip_signal = bus_scope_send_request_stop(s) > 0;
if (skip_signal)
r = 1; /* wait */
else {
r = unit_kill_context(
UNIT(s),
&s->kill_context,
state != SCOPE_STOP_SIGTERM ? KILL_KILL :
s->was_abandoned ? KILL_TERMINATE_AND_LOG :
KILL_TERMINATE,
-1, -1, false);
if (r < 0)
goto fail;
}
if (r > 0) {
r = scope_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec));
if (r < 0)
goto fail;
scope_set_state(s, state);
} else if (state == SCOPE_STOP_SIGTERM)
scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_SUCCESS);
else
scope_enter_dead(s, SCOPE_SUCCESS);
return;
fail:
core,network: major per-object logging rework This changes log_unit_info() (and friends) to take a real Unit* object insted of just a unit name as parameter. The call will now prefix all logged messages with the unit name, thus allowing the unit name to be dropped from the various passed romat strings, simplifying invocations drastically, and unifying log output across messages. Also, UNIT= vs. USER_UNIT= is now derived from the Manager object attached to the Unit object, instead of getpid(). This has the benefit of correcting the field for --test runs. Also contains a couple of other logging improvements: - Drops a couple of strerror() invocations in favour of using %m. - Not only .mount units now warn if a symlinks exist for the mount point already, .automount units do that too, now. - A few invocations of log_struct() that didn't actually pass any additional structured data have been replaced by simpler invocations of log_unit_info() and friends. - For structured data a new LOG_UNIT_MESSAGE() macro has been added, that works like LOG_MESSAGE() but prefixes the message with the unit name. Similar, there's now LOG_LINK_MESSAGE() and LOG_NETDEV_MESSAGE(). - For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(), LOG_NETDEV_INTERFACE() macros have been added that generate the necessary per object fields. The old log_unit_struct() call has been removed in favour of these new macros used in raw log_struct() invocations. In addition to removing one more function call this allows generated structured log messages that contain two object fields, as necessary for example for network interfaces that are joined into another network interface, and whose messages shall be indexed by both. - The LOG_ERRNO() macro has been removed, in favour of log_struct_errno(). The latter has the benefit of ensuring that %m in format strings is properly resolved to the specified error number. - A number of logging messages have been converted to use log_unit_info() instead of log_info() - The client code in sysv-generator no longer #includes core code from src/core/. - log_unit_full_errno() has been removed, log_unit_full() instead takes an errno now, too. - log_unit_info(), log_link_info(), log_netdev_info() and friends, now avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
}
static int scope_start(Unit *u) {
Scope *s = SCOPE(u);
int r;
assert(s);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
if (unit_has_name(u, SPECIAL_INIT_SCOPE))
return -EPERM;
if (s->state == SCOPE_FAILED)
return -EPERM;
/* We can't fulfill this right now, please try again later */
if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
return -EAGAIN;
assert(s->state == SCOPE_DEAD);
if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
return -ENOENT;
(void) bus_scope_track_controller(s);
core: add "invocation ID" concept to service manager This adds a new invocation ID concept to the service manager. The invocation ID identifies each runtime cycle of a unit uniquely. A new randomized 128bit ID is generated each time a unit moves from and inactive to an activating or active state. The primary usecase for this concept is to connect the runtime data PID 1 maintains about a service with the offline data the journal stores about it. Previously we'd use the unit name plus start/stop times, which however is highly racy since the journal will generally process log data after the service already ended. The "invocation ID" kinda matches the "boot ID" concept of the Linux kernel, except that it applies to an individual unit instead of the whole system. The invocation ID is passed to the activated processes as environment variable. It is additionally stored as extended attribute on the cgroup of the unit. The latter is used by journald to automatically retrieve it for each log logged message and attach it to the log entry. The environment variable is very easily accessible, even for unprivileged services. OTOH the extended attribute is only accessible to privileged processes (this is because cgroupfs only supports the "trusted." xattr namespace, not "user."). The environment variable may be altered by services, the extended attribute may not be, hence is the better choice for the journal. Note that reading the invocation ID off the extended attribute from journald is racy, similar to the way reading the unit name for a logging process is. This patch adds APIs to read the invocation ID to sd-id128: sd_id128_get_invocation() may be used in a similar fashion to sd_id128_get_boot(). PID1's own logging is updated to always include the invocation ID when it logs information about a unit. A new bus call GetUnitByInvocationID() is added that allows retrieving a bus path to a unit by its invocation ID. The bus path is built using the invocation ID, thus providing a path for referring to a unit that is valid only for the current runtime cycleof it. Outlook for the future: should the kernel eventually allow passing of cgroup information along AF_UNIX/SOCK_DGRAM messages via a unique cgroup id, then we can alter the invocation ID to be generated as hash from that rather than entirely randomly. This way we can derive the invocation race-freely from the messages.
2016-08-30 23:18:46 +02:00
r = unit_acquire_invocation_id(u);
if (r < 0)
return r;
(void) unit_realize_cgroup(u);
(void) unit_reset_cpu_accounting(u);
(void) unit_reset_ip_accounting(u);
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald And let's make use of it to implement two new unit settings with it: 1. LogLevelMax= is a new per-unit setting that may be used to configure log priority filtering: set it to LogLevelMax=notice and only messages of level "notice" and lower (i.e. more important) will be processed, all others are dropped. 2. LogExtraFields= is a new per-unit setting for configuring per-unit journal fields, that are implicitly included in every log record generated by the unit's processes. It takes field/value pairs in the form of FOO=BAR. Also, related to this, one exisiting unit setting is ported to this new facility: 3. The invocation ID is now pulled from /run/systemd/units/ instead of cgroupfs xattrs. This substantially relaxes requirements of systemd on the kernel version and the privileges it runs with (specifically, cgroupfs xattrs are not available in containers, since they are stored in kernel memory, and hence are unsafe to permit to lesser privileged code). /run/systemd/units/ is a new directory, which contains a number of files and symlinks encoding the above information. PID 1 creates and manages these files, and journald reads them from there. Note that this is supposed to be a direct path between PID 1 and the journal only, due to the special runtime environment the journal runs in. Normally, today we shouldn't introduce new interfaces that (mis-)use a file system as IPC framework, and instead just an IPC system, but this is very hard to do between the journal and PID 1, as long as the IPC system is a subject PID 1 manages, and itself a client to the journal. This patch cleans up a couple of types used in journal code: specifically we switch to size_t for a couple of memory-sizing values, as size_t is the right choice for everything that is memory. Fixes: #4089 Fixes: #3041 Fixes: #4441
2017-11-02 19:43:32 +01:00
unit_export_state_files(UNIT(s));
r = unit_attach_pids_to_cgroup(u, UNIT(s)->pids, NULL);
if (r < 0) {
core,network: major per-object logging rework This changes log_unit_info() (and friends) to take a real Unit* object insted of just a unit name as parameter. The call will now prefix all logged messages with the unit name, thus allowing the unit name to be dropped from the various passed romat strings, simplifying invocations drastically, and unifying log output across messages. Also, UNIT= vs. USER_UNIT= is now derived from the Manager object attached to the Unit object, instead of getpid(). This has the benefit of correcting the field for --test runs. Also contains a couple of other logging improvements: - Drops a couple of strerror() invocations in favour of using %m. - Not only .mount units now warn if a symlinks exist for the mount point already, .automount units do that too, now. - A few invocations of log_struct() that didn't actually pass any additional structured data have been replaced by simpler invocations of log_unit_info() and friends. - For structured data a new LOG_UNIT_MESSAGE() macro has been added, that works like LOG_MESSAGE() but prefixes the message with the unit name. Similar, there's now LOG_LINK_MESSAGE() and LOG_NETDEV_MESSAGE(). - For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(), LOG_NETDEV_INTERFACE() macros have been added that generate the necessary per object fields. The old log_unit_struct() call has been removed in favour of these new macros used in raw log_struct() invocations. In addition to removing one more function call this allows generated structured log messages that contain two object fields, as necessary for example for network interfaces that are joined into another network interface, and whose messages shall be indexed by both. - The LOG_ERRNO() macro has been removed, in favour of log_struct_errno(). The latter has the benefit of ensuring that %m in format strings is properly resolved to the specified error number. - A number of logging messages have been converted to use log_unit_info() instead of log_info() - The client code in sysv-generator no longer #includes core code from src/core/. - log_unit_full_errno() has been removed, log_unit_full() instead takes an errno now, too. - log_unit_info(), log_link_info(), log_netdev_info() and friends, now avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
log_unit_warning_errno(UNIT(s), r, "Failed to add PIDs to scope's control group: %m");
2015-04-28 19:02:49 +02:00
scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
return r;
}
s->result = SCOPE_SUCCESS;
scope_set_state(s, SCOPE_RUNNING);
/* Start watching the PIDs currently in the scope */
(void) unit_enqueue_rewatch_pids(UNIT(s));
return 1;
}
static int scope_stop(Unit *u) {
Scope *s = SCOPE(u);
assert(s);
if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
return 0;
assert(IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED));
scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS);
return 1;
}
2013-07-02 01:35:35 +02:00
static void scope_reset_failed(Unit *u) {
Scope *s = SCOPE(u);
assert(s);
if (s->state == SCOPE_FAILED)
scope_set_state(s, SCOPE_DEAD);
s->result = SCOPE_SUCCESS;
}
static int scope_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) {
return unit_kill_common(u, who, signo, -1, -1, error);
}
static int scope_get_timeout(Unit *u, usec_t *timeout) {
Scope *s = SCOPE(u);
usec_t t;
int r;
if (!s->timer_event_source)
return 0;
r = sd_event_source_get_time(s->timer_event_source, &t);
if (r < 0)
return r;
if (t == USEC_INFINITY)
return 0;
*timeout = t;
return 1;
}
static int scope_serialize(Unit *u, FILE *f, FDSet *fds) {
Scope *s = SCOPE(u);
assert(s);
assert(f);
assert(fds);
(void) serialize_item(f, "state", scope_state_to_string(s->state));
(void) serialize_bool(f, "was-abandoned", s->was_abandoned);
if (s->controller)
(void) serialize_item(f, "controller", s->controller);
return 0;
}
static int scope_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
Scope *s = SCOPE(u);
int r;
assert(u);
assert(key);
assert(value);
assert(fds);
if (streq(key, "state")) {
ScopeState state;
state = scope_state_from_string(value);
if (state < 0)
core,network: major per-object logging rework This changes log_unit_info() (and friends) to take a real Unit* object insted of just a unit name as parameter. The call will now prefix all logged messages with the unit name, thus allowing the unit name to be dropped from the various passed romat strings, simplifying invocations drastically, and unifying log output across messages. Also, UNIT= vs. USER_UNIT= is now derived from the Manager object attached to the Unit object, instead of getpid(). This has the benefit of correcting the field for --test runs. Also contains a couple of other logging improvements: - Drops a couple of strerror() invocations in favour of using %m. - Not only .mount units now warn if a symlinks exist for the mount point already, .automount units do that too, now. - A few invocations of log_struct() that didn't actually pass any additional structured data have been replaced by simpler invocations of log_unit_info() and friends. - For structured data a new LOG_UNIT_MESSAGE() macro has been added, that works like LOG_MESSAGE() but prefixes the message with the unit name. Similar, there's now LOG_LINK_MESSAGE() and LOG_NETDEV_MESSAGE(). - For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(), LOG_NETDEV_INTERFACE() macros have been added that generate the necessary per object fields. The old log_unit_struct() call has been removed in favour of these new macros used in raw log_struct() invocations. In addition to removing one more function call this allows generated structured log messages that contain two object fields, as necessary for example for network interfaces that are joined into another network interface, and whose messages shall be indexed by both. - The LOG_ERRNO() macro has been removed, in favour of log_struct_errno(). The latter has the benefit of ensuring that %m in format strings is properly resolved to the specified error number. - A number of logging messages have been converted to use log_unit_info() instead of log_info() - The client code in sysv-generator no longer #includes core code from src/core/. - log_unit_full_errno() has been removed, log_unit_full() instead takes an errno now, too. - log_unit_info(), log_link_info(), log_netdev_info() and friends, now avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
log_unit_debug(u, "Failed to parse state value: %s", value);
else
s->deserialized_state = state;
} else if (streq(key, "was-abandoned")) {
int k;
k = parse_boolean(value);
if (k < 0)
log_unit_debug(u, "Failed to parse boolean value: %s", value);
else
s->was_abandoned = k;
} else if (streq(key, "controller")) {
r = free_and_strdup(&s->controller, value);
if (r < 0)
return log_oom();
} else
core,network: major per-object logging rework This changes log_unit_info() (and friends) to take a real Unit* object insted of just a unit name as parameter. The call will now prefix all logged messages with the unit name, thus allowing the unit name to be dropped from the various passed romat strings, simplifying invocations drastically, and unifying log output across messages. Also, UNIT= vs. USER_UNIT= is now derived from the Manager object attached to the Unit object, instead of getpid(). This has the benefit of correcting the field for --test runs. Also contains a couple of other logging improvements: - Drops a couple of strerror() invocations in favour of using %m. - Not only .mount units now warn if a symlinks exist for the mount point already, .automount units do that too, now. - A few invocations of log_struct() that didn't actually pass any additional structured data have been replaced by simpler invocations of log_unit_info() and friends. - For structured data a new LOG_UNIT_MESSAGE() macro has been added, that works like LOG_MESSAGE() but prefixes the message with the unit name. Similar, there's now LOG_LINK_MESSAGE() and LOG_NETDEV_MESSAGE(). - For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(), LOG_NETDEV_INTERFACE() macros have been added that generate the necessary per object fields. The old log_unit_struct() call has been removed in favour of these new macros used in raw log_struct() invocations. In addition to removing one more function call this allows generated structured log messages that contain two object fields, as necessary for example for network interfaces that are joined into another network interface, and whose messages shall be indexed by both. - The LOG_ERRNO() macro has been removed, in favour of log_struct_errno(). The latter has the benefit of ensuring that %m in format strings is properly resolved to the specified error number. - A number of logging messages have been converted to use log_unit_info() instead of log_info() - The client code in sysv-generator no longer #includes core code from src/core/. - log_unit_full_errno() has been removed, log_unit_full() instead takes an errno now, too. - log_unit_info(), log_link_info(), log_netdev_info() and friends, now avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
log_unit_debug(u, "Unknown serialization key: %s", key);
return 0;
}
static void scope_notify_cgroup_empty_event(Unit *u) {
Scope *s = SCOPE(u);
assert(u);
core,network: major per-object logging rework This changes log_unit_info() (and friends) to take a real Unit* object insted of just a unit name as parameter. The call will now prefix all logged messages with the unit name, thus allowing the unit name to be dropped from the various passed romat strings, simplifying invocations drastically, and unifying log output across messages. Also, UNIT= vs. USER_UNIT= is now derived from the Manager object attached to the Unit object, instead of getpid(). This has the benefit of correcting the field for --test runs. Also contains a couple of other logging improvements: - Drops a couple of strerror() invocations in favour of using %m. - Not only .mount units now warn if a symlinks exist for the mount point already, .automount units do that too, now. - A few invocations of log_struct() that didn't actually pass any additional structured data have been replaced by simpler invocations of log_unit_info() and friends. - For structured data a new LOG_UNIT_MESSAGE() macro has been added, that works like LOG_MESSAGE() but prefixes the message with the unit name. Similar, there's now LOG_LINK_MESSAGE() and LOG_NETDEV_MESSAGE(). - For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(), LOG_NETDEV_INTERFACE() macros have been added that generate the necessary per object fields. The old log_unit_struct() call has been removed in favour of these new macros used in raw log_struct() invocations. In addition to removing one more function call this allows generated structured log messages that contain two object fields, as necessary for example for network interfaces that are joined into another network interface, and whose messages shall be indexed by both. - The LOG_ERRNO() macro has been removed, in favour of log_struct_errno(). The latter has the benefit of ensuring that %m in format strings is properly resolved to the specified error number. - A number of logging messages have been converted to use log_unit_info() instead of log_info() - The client code in sysv-generator no longer #includes core code from src/core/. - log_unit_full_errno() has been removed, log_unit_full() instead takes an errno now, too. - log_unit_info(), log_link_info(), log_netdev_info() and friends, now avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
log_unit_debug(u, "cgroup is empty");
if (IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
scope_enter_dead(s, SCOPE_SUCCESS);
}
static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
assert(u);
/* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to
* watch, under the assumption that we'll sooner or later get a SIGCHLD for them, as the original
* process we watched was probably the parent of them, and they are hence now our children. */
core: rework how we track service and scope PIDs This reworks how systemd tracks processes on cgroupv1 systems where cgroup notification is not reliable. Previously, whenever we had reason to believe that new processes showed up or got removed we'd scan the cgroup of the scope or service unit for new processes, and would tidy up the list of PIDs previously watched. This scanning is relatively slow, and does not scale well. With this change behaviour is changed: instead of scanning for new/removed processes right away we do this work in a per-unit deferred event loop job. This event source is scheduled at a very low priority, so that it is executed when we have time but does not starve other event sources. This has two benefits: this expensive work is coalesced, if events happen in quick succession, and we won't delay SIGCHLD handling for too long. This patch basically replaces all direct invocation of unit_watch_all_pids() in scope.c and service.c with invocations of the new unit_enqueue_rewatch_pids() call which just enqueues a request of watching/tidying up the PID sets (with one exception: in scope_enter_signal() and service_enter_signal() we'll still do unit_watch_all_pids() synchronously first, since we really want to know all processes we are about to kill so that we can track them properly. Moreover, all direct invocations of unit_tidy_watch_pids() and unit_synthesize_cgroup_empty_event() are removed too, when the unit_enqueue_rewatch_pids() call is invoked, as the queued job will run those operations too. All of this is done on cgroupsv1 systems only, and is disabled on cgroupsv2 systems as cgroup-empty notifications are reliable there, and we do not need SIGCHLD events to track processes there. Fixes: #9138
2018-05-31 15:41:59 +02:00
(void) unit_enqueue_rewatch_pids(u);
}
static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
Scope *s = SCOPE(userdata);
assert(s);
assert(s->timer_event_source == source);
switch (s->state) {
case SCOPE_STOP_SIGTERM:
if (s->kill_context.send_sigkill) {
core,network: major per-object logging rework This changes log_unit_info() (and friends) to take a real Unit* object insted of just a unit name as parameter. The call will now prefix all logged messages with the unit name, thus allowing the unit name to be dropped from the various passed romat strings, simplifying invocations drastically, and unifying log output across messages. Also, UNIT= vs. USER_UNIT= is now derived from the Manager object attached to the Unit object, instead of getpid(). This has the benefit of correcting the field for --test runs. Also contains a couple of other logging improvements: - Drops a couple of strerror() invocations in favour of using %m. - Not only .mount units now warn if a symlinks exist for the mount point already, .automount units do that too, now. - A few invocations of log_struct() that didn't actually pass any additional structured data have been replaced by simpler invocations of log_unit_info() and friends. - For structured data a new LOG_UNIT_MESSAGE() macro has been added, that works like LOG_MESSAGE() but prefixes the message with the unit name. Similar, there's now LOG_LINK_MESSAGE() and LOG_NETDEV_MESSAGE(). - For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(), LOG_NETDEV_INTERFACE() macros have been added that generate the necessary per object fields. The old log_unit_struct() call has been removed in favour of these new macros used in raw log_struct() invocations. In addition to removing one more function call this allows generated structured log messages that contain two object fields, as necessary for example for network interfaces that are joined into another network interface, and whose messages shall be indexed by both. - The LOG_ERRNO() macro has been removed, in favour of log_struct_errno(). The latter has the benefit of ensuring that %m in format strings is properly resolved to the specified error number. - A number of logging messages have been converted to use log_unit_info() instead of log_info() - The client code in sysv-generator no longer #includes core code from src/core/. - log_unit_full_errno() has been removed, log_unit_full() instead takes an errno now, too. - log_unit_info(), log_link_info(), log_netdev_info() and friends, now avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
log_unit_warning(UNIT(s), "Stopping timed out. Killing.");
scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_TIMEOUT);
} else {
core,network: major per-object logging rework This changes log_unit_info() (and friends) to take a real Unit* object insted of just a unit name as parameter. The call will now prefix all logged messages with the unit name, thus allowing the unit name to be dropped from the various passed romat strings, simplifying invocations drastically, and unifying log output across messages. Also, UNIT= vs. USER_UNIT= is now derived from the Manager object attached to the Unit object, instead of getpid(). This has the benefit of correcting the field for --test runs. Also contains a couple of other logging improvements: - Drops a couple of strerror() invocations in favour of using %m. - Not only .mount units now warn if a symlinks exist for the mount point already, .automount units do that too, now. - A few invocations of log_struct() that didn't actually pass any additional structured data have been replaced by simpler invocations of log_unit_info() and friends. - For structured data a new LOG_UNIT_MESSAGE() macro has been added, that works like LOG_MESSAGE() but prefixes the message with the unit name. Similar, there's now LOG_LINK_MESSAGE() and LOG_NETDEV_MESSAGE(). - For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(), LOG_NETDEV_INTERFACE() macros have been added that generate the necessary per object fields. The old log_unit_struct() call has been removed in favour of these new macros used in raw log_struct() invocations. In addition to removing one more function call this allows generated structured log messages that contain two object fields, as necessary for example for network interfaces that are joined into another network interface, and whose messages shall be indexed by both. - The LOG_ERRNO() macro has been removed, in favour of log_struct_errno(). The latter has the benefit of ensuring that %m in format strings is properly resolved to the specified error number. - A number of logging messages have been converted to use log_unit_info() instead of log_info() - The client code in sysv-generator no longer #includes core code from src/core/. - log_unit_full_errno() has been removed, log_unit_full() instead takes an errno now, too. - log_unit_info(), log_link_info(), log_netdev_info() and friends, now avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL.");
scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
}
break;
case SCOPE_STOP_SIGKILL:
core,network: major per-object logging rework This changes log_unit_info() (and friends) to take a real Unit* object insted of just a unit name as parameter. The call will now prefix all logged messages with the unit name, thus allowing the unit name to be dropped from the various passed romat strings, simplifying invocations drastically, and unifying log output across messages. Also, UNIT= vs. USER_UNIT= is now derived from the Manager object attached to the Unit object, instead of getpid(). This has the benefit of correcting the field for --test runs. Also contains a couple of other logging improvements: - Drops a couple of strerror() invocations in favour of using %m. - Not only .mount units now warn if a symlinks exist for the mount point already, .automount units do that too, now. - A few invocations of log_struct() that didn't actually pass any additional structured data have been replaced by simpler invocations of log_unit_info() and friends. - For structured data a new LOG_UNIT_MESSAGE() macro has been added, that works like LOG_MESSAGE() but prefixes the message with the unit name. Similar, there's now LOG_LINK_MESSAGE() and LOG_NETDEV_MESSAGE(). - For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(), LOG_NETDEV_INTERFACE() macros have been added that generate the necessary per object fields. The old log_unit_struct() call has been removed in favour of these new macros used in raw log_struct() invocations. In addition to removing one more function call this allows generated structured log messages that contain two object fields, as necessary for example for network interfaces that are joined into another network interface, and whose messages shall be indexed by both. - The LOG_ERRNO() macro has been removed, in favour of log_struct_errno(). The latter has the benefit of ensuring that %m in format strings is properly resolved to the specified error number. - A number of logging messages have been converted to use log_unit_info() instead of log_info() - The client code in sysv-generator no longer #includes core code from src/core/. - log_unit_full_errno() has been removed, log_unit_full() instead takes an errno now, too. - log_unit_info(), log_link_info(), log_netdev_info() and friends, now avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
log_unit_warning(UNIT(s), "Still around after SIGKILL. Ignoring.");
scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
break;
default:
assert_not_reached("Timeout at wrong time.");
}
return 0;
}
int scope_abandon(Scope *s) {
assert(s);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
if (unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE))
return -EPERM;
if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED))
return -ESTALE;
s->was_abandoned = true;
s->controller = mfree(s->controller);
s->controller_track = sd_bus_track_unref(s->controller_track);
scope_set_state(s, SCOPE_ABANDONED);
core: rework how we track service and scope PIDs This reworks how systemd tracks processes on cgroupv1 systems where cgroup notification is not reliable. Previously, whenever we had reason to believe that new processes showed up or got removed we'd scan the cgroup of the scope or service unit for new processes, and would tidy up the list of PIDs previously watched. This scanning is relatively slow, and does not scale well. With this change behaviour is changed: instead of scanning for new/removed processes right away we do this work in a per-unit deferred event loop job. This event source is scheduled at a very low priority, so that it is executed when we have time but does not starve other event sources. This has two benefits: this expensive work is coalesced, if events happen in quick succession, and we won't delay SIGCHLD handling for too long. This patch basically replaces all direct invocation of unit_watch_all_pids() in scope.c and service.c with invocations of the new unit_enqueue_rewatch_pids() call which just enqueues a request of watching/tidying up the PID sets (with one exception: in scope_enter_signal() and service_enter_signal() we'll still do unit_watch_all_pids() synchronously first, since we really want to know all processes we are about to kill so that we can track them properly. Moreover, all direct invocations of unit_tidy_watch_pids() and unit_synthesize_cgroup_empty_event() are removed too, when the unit_enqueue_rewatch_pids() call is invoked, as the queued job will run those operations too. All of this is done on cgroupsv1 systems only, and is disabled on cgroupsv2 systems as cgroup-empty notifications are reliable there, and we do not need SIGCHLD events to track processes there. Fixes: #9138
2018-05-31 15:41:59 +02:00
/* The client is no longer watching the remaining processes, so let's step in here, under the assumption that
* the remaining processes will be sooner or later reassigned to us as parent. */
(void) unit_enqueue_rewatch_pids(UNIT(s));
return 0;
}
_pure_ static UnitActiveState scope_active_state(Unit *u) {
assert(u);
return state_translation_table[SCOPE(u)->state];
}
_pure_ static const char *scope_sub_state_to_string(Unit *u) {
assert(u);
return scope_state_to_string(SCOPE(u)->state);
}
static void scope_enumerate_perpetual(Manager *m) {
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
Unit *u;
int r;
assert(m);
/* Let's unconditionally add the "init.scope" special unit
* that encapsulates PID 1. Note that PID 1 already is in the
* cgroup for this, we hence just need to allocate the object
* for it and that's it. */
u = manager_get_unit(m, SPECIAL_INIT_SCOPE);
if (!u) {
r = unit_new_for_name(m, sizeof(Scope), SPECIAL_INIT_SCOPE, &u);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
if (r < 0) {
log_error_errno(r, "Failed to allocate the special " SPECIAL_INIT_SCOPE " unit: %m");
return;
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
}
}
u->transient = true;
u->perpetual = true;
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
SCOPE(u)->deserialized_state = SCOPE_RUNNING;
unit_add_to_load_queue(u);
unit_add_to_dbus_queue(u);
}
static const char* const scope_result_table[_SCOPE_RESULT_MAX] = {
[SCOPE_SUCCESS] = "success",
[SCOPE_FAILURE_RESOURCES] = "resources",
[SCOPE_FAILURE_TIMEOUT] = "timeout",
};
DEFINE_STRING_TABLE_LOOKUP(scope_result, ScopeResult);
const UnitVTable scope_vtable = {
.object_size = sizeof(Scope),
.cgroup_context_offset = offsetof(Scope, cgroup_context),
.kill_context_offset = offsetof(Scope, kill_context),
.sections =
"Unit\0"
"Scope\0"
"Install\0",
.private_section = "Scope",
.can_transient = true,
.can_delegate = true,
.once_only = true,
.init = scope_init,
.load = scope_load,
.done = scope_done,
.coldplug = scope_coldplug,
.dump = scope_dump,
.start = scope_start,
.stop = scope_stop,
.kill = scope_kill,
.get_timeout = scope_get_timeout,
.serialize = scope_serialize,
.deserialize_item = scope_deserialize_item,
.active_state = scope_active_state,
.sub_state_to_string = scope_sub_state_to_string,
.sigchld_event = scope_sigchld_event,
2013-07-02 01:35:35 +02:00
.reset_failed = scope_reset_failed,
.notify_cgroup_empty = scope_notify_cgroup_empty_event,
.bus_vtable = bus_scope_vtable,
.bus_set_property = bus_scope_set_property,
.bus_commit_properties = bus_scope_commit_properties,
.enumerate_perpetual = scope_enumerate_perpetual,
};