2017-11-18 17:09:20 +01:00
|
|
|
/* SPDX-License-Identifier: LGPL-2.1+ */
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
#include <errno.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
|
2015-10-27 03:01:06 +01:00
|
|
|
#include "alloc-util.h"
|
2015-10-24 22:58:24 +02:00
|
|
|
#include "dbus-scope.h"
|
|
|
|
#include "load-dropin.h"
|
2013-07-01 00:03:57 +02:00
|
|
|
#include "log.h"
|
2015-10-27 03:01:06 +01:00
|
|
|
#include "scope.h"
|
2018-10-17 20:40:09 +02:00
|
|
|
#include "serialize.h"
|
2013-07-01 00:03:57 +02:00
|
|
|
#include "special.h"
|
2015-10-26 22:31:05 +01:00
|
|
|
#include "string-table.h"
|
2015-10-24 22:58:24 +02:00
|
|
|
#include "string-util.h"
|
|
|
|
#include "strv.h"
|
2013-07-01 00:03:57 +02:00
|
|
|
#include "unit-name.h"
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
#include "unit.h"
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
|
|
|
|
[SCOPE_DEAD] = UNIT_INACTIVE,
|
|
|
|
[SCOPE_RUNNING] = UNIT_ACTIVE,
|
2014-02-06 17:17:51 +01:00
|
|
|
[SCOPE_ABANDONED] = UNIT_ACTIVE,
|
2013-07-01 00:03:57 +02:00
|
|
|
[SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING,
|
|
|
|
[SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING,
|
|
|
|
[SCOPE_FAILED] = UNIT_FAILED
|
|
|
|
};
|
|
|
|
|
2013-11-19 21:12:59 +01:00
|
|
|
static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata);
|
|
|
|
|
2013-07-01 00:03:57 +02:00
|
|
|
static void scope_init(Unit *u) {
|
|
|
|
Scope *s = SCOPE(u);
|
|
|
|
|
|
|
|
assert(u);
|
|
|
|
assert(u->load_state == UNIT_STUB);
|
|
|
|
|
2013-11-04 17:47:43 +01:00
|
|
|
s->timeout_stop_usec = u->manager->default_timeout_stop_usec;
|
2016-02-18 22:51:23 +01:00
|
|
|
u->ignore_on_isolate = true;
|
2013-07-01 00:03:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void scope_done(Unit *u) {
|
|
|
|
Scope *s = SCOPE(u);
|
|
|
|
|
|
|
|
assert(u);
|
|
|
|
|
2017-11-23 20:15:48 +01:00
|
|
|
s->controller = mfree(s->controller);
|
|
|
|
s->controller_track = sd_bus_track_unref(s->controller_track);
|
2014-01-31 17:45:13 +01:00
|
|
|
|
2013-11-19 21:12:59 +01:00
|
|
|
s->timer_event_source = sd_event_source_unref(s->timer_event_source);
|
|
|
|
}
|
|
|
|
|
core: rework unit timeout handling, and add new setting RuntimeMaxSec=
This clean-ups timeout handling in PID 1. Specifically, instead of storing 0 in internal timeout variables as
indication for a disabled timeout, use USEC_INFINITY which is in-line with how we do this in the rest of our code
(following the logic that 0 means "no", and USEC_INFINITY means "never").
This also replace all usec_t additions with invocations to usec_add(), so that USEC_INFINITY is properly propagated,
and sd-event considers it has indication for turning off the event source.
This also alters the deserialization of the units to restart timeouts from the time they were originally started from.
Before this patch timeouts would be restarted beginning with the time of the deserialization, which could lead to
artificially prolonged timeouts if a daemon reload took place.
Finally, a new RuntimeMaxSec= setting is introduced for service units, that specifies a maximum runtime after which a
specific service is forcibly terminated. This is useful to put time limits on time-intensive processing jobs.
This also simplifies the various xyz_spawn() calls of the various types in that explicit distruction of the timers is
removed, as that is done anyway by the state change handlers, and a state change is always done when the xyz_spawn()
calls fail.
Fixes: #2249
2016-02-01 21:48:10 +01:00
|
|
|
static int scope_arm_timer(Scope *s, usec_t usec) {
|
2013-11-19 21:12:59 +01:00
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(s);
|
|
|
|
|
|
|
|
if (s->timer_event_source) {
|
core: rework unit timeout handling, and add new setting RuntimeMaxSec=
This clean-ups timeout handling in PID 1. Specifically, instead of storing 0 in internal timeout variables as
indication for a disabled timeout, use USEC_INFINITY which is in-line with how we do this in the rest of our code
(following the logic that 0 means "no", and USEC_INFINITY means "never").
This also replace all usec_t additions with invocations to usec_add(), so that USEC_INFINITY is properly propagated,
and sd-event considers it has indication for turning off the event source.
This also alters the deserialization of the units to restart timeouts from the time they were originally started from.
Before this patch timeouts would be restarted beginning with the time of the deserialization, which could lead to
artificially prolonged timeouts if a daemon reload took place.
Finally, a new RuntimeMaxSec= setting is introduced for service units, that specifies a maximum runtime after which a
specific service is forcibly terminated. This is useful to put time limits on time-intensive processing jobs.
This also simplifies the various xyz_spawn() calls of the various types in that explicit distruction of the timers is
removed, as that is done anyway by the state change handlers, and a state change is always done when the xyz_spawn()
calls fail.
Fixes: #2249
2016-02-01 21:48:10 +01:00
|
|
|
r = sd_event_source_set_time(s->timer_event_source, usec);
|
2013-11-19 21:12:59 +01:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
return sd_event_source_set_enabled(s->timer_event_source, SD_EVENT_ONESHOT);
|
|
|
|
}
|
|
|
|
|
core: rework unit timeout handling, and add new setting RuntimeMaxSec=
This clean-ups timeout handling in PID 1. Specifically, instead of storing 0 in internal timeout variables as
indication for a disabled timeout, use USEC_INFINITY which is in-line with how we do this in the rest of our code
(following the logic that 0 means "no", and USEC_INFINITY means "never").
This also replace all usec_t additions with invocations to usec_add(), so that USEC_INFINITY is properly propagated,
and sd-event considers it has indication for turning off the event source.
This also alters the deserialization of the units to restart timeouts from the time they were originally started from.
Before this patch timeouts would be restarted beginning with the time of the deserialization, which could lead to
artificially prolonged timeouts if a daemon reload took place.
Finally, a new RuntimeMaxSec= setting is introduced for service units, that specifies a maximum runtime after which a
specific service is forcibly terminated. This is useful to put time limits on time-intensive processing jobs.
This also simplifies the various xyz_spawn() calls of the various types in that explicit distruction of the timers is
removed, as that is done anyway by the state change handlers, and a state change is always done when the xyz_spawn()
calls fail.
Fixes: #2249
2016-02-01 21:48:10 +01:00
|
|
|
if (usec == USEC_INFINITY)
|
|
|
|
return 0;
|
|
|
|
|
2015-05-13 18:30:14 +02:00
|
|
|
r = sd_event_add_time(
|
2014-03-24 02:49:09 +01:00
|
|
|
UNIT(s)->manager->event,
|
|
|
|
&s->timer_event_source,
|
|
|
|
CLOCK_MONOTONIC,
|
core: rework unit timeout handling, and add new setting RuntimeMaxSec=
This clean-ups timeout handling in PID 1. Specifically, instead of storing 0 in internal timeout variables as
indication for a disabled timeout, use USEC_INFINITY which is in-line with how we do this in the rest of our code
(following the logic that 0 means "no", and USEC_INFINITY means "never").
This also replace all usec_t additions with invocations to usec_add(), so that USEC_INFINITY is properly propagated,
and sd-event considers it has indication for turning off the event source.
This also alters the deserialization of the units to restart timeouts from the time they were originally started from.
Before this patch timeouts would be restarted beginning with the time of the deserialization, which could lead to
artificially prolonged timeouts if a daemon reload took place.
Finally, a new RuntimeMaxSec= setting is introduced for service units, that specifies a maximum runtime after which a
specific service is forcibly terminated. This is useful to put time limits on time-intensive processing jobs.
This also simplifies the various xyz_spawn() calls of the various types in that explicit distruction of the timers is
removed, as that is done anyway by the state change handlers, and a state change is always done when the xyz_spawn()
calls fail.
Fixes: #2249
2016-02-01 21:48:10 +01:00
|
|
|
usec, 0,
|
2014-03-24 02:49:09 +01:00
|
|
|
scope_dispatch_timer, s);
|
2015-04-29 16:05:32 +02:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
(void) sd_event_source_set_description(s->timer_event_source, "scope-timer");
|
|
|
|
|
|
|
|
return 0;
|
2013-07-01 00:03:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void scope_set_state(Scope *s, ScopeState state) {
|
|
|
|
ScopeState old_state;
|
|
|
|
assert(s);
|
|
|
|
|
|
|
|
old_state = s->state;
|
|
|
|
s->state = state;
|
|
|
|
|
2014-02-06 17:17:51 +01:00
|
|
|
if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
|
2013-11-19 21:12:59 +01:00
|
|
|
s->timer_event_source = sd_event_source_unref(s->timer_event_source);
|
2013-07-01 00:03:57 +02:00
|
|
|
|
core: rework how we track service and scope PIDs
This reworks how systemd tracks processes on cgroupv1 systems where
cgroup notification is not reliable. Previously, whenever we had reason
to believe that new processes showed up or got removed we'd scan the
cgroup of the scope or service unit for new processes, and would tidy up
the list of PIDs previously watched. This scanning is relatively slow,
and does not scale well. With this change behaviour is changed: instead
of scanning for new/removed processes right away we do this work in a
per-unit deferred event loop job. This event source is scheduled at a
very low priority, so that it is executed when we have time but does not
starve other event sources. This has two benefits: this expensive work is
coalesced, if events happen in quick succession, and we won't delay
SIGCHLD handling for too long.
This patch basically replaces all direct invocation of
unit_watch_all_pids() in scope.c and service.c with invocations of the
new unit_enqueue_rewatch_pids() call which just enqueues a request of
watching/tidying up the PID sets (with one exception: in
scope_enter_signal() and service_enter_signal() we'll still do
unit_watch_all_pids() synchronously first, since we really want to know
all processes we are about to kill so that we can track them properly.
Moreover, all direct invocations of unit_tidy_watch_pids() and
unit_synthesize_cgroup_empty_event() are removed too, when the
unit_enqueue_rewatch_pids() call is invoked, as the queued job will run
those operations too.
All of this is done on cgroupsv1 systems only, and is disabled on
cgroupsv2 systems as cgroup-empty notifications are reliable there, and
we do not need SIGCHLD events to track processes there.
Fixes: #9138
2018-05-31 15:41:59 +02:00
|
|
|
if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) {
|
2014-02-06 17:17:51 +01:00
|
|
|
unit_unwatch_all_pids(UNIT(s));
|
core: rework how we track service and scope PIDs
This reworks how systemd tracks processes on cgroupv1 systems where
cgroup notification is not reliable. Previously, whenever we had reason
to believe that new processes showed up or got removed we'd scan the
cgroup of the scope or service unit for new processes, and would tidy up
the list of PIDs previously watched. This scanning is relatively slow,
and does not scale well. With this change behaviour is changed: instead
of scanning for new/removed processes right away we do this work in a
per-unit deferred event loop job. This event source is scheduled at a
very low priority, so that it is executed when we have time but does not
starve other event sources. This has two benefits: this expensive work is
coalesced, if events happen in quick succession, and we won't delay
SIGCHLD handling for too long.
This patch basically replaces all direct invocation of
unit_watch_all_pids() in scope.c and service.c with invocations of the
new unit_enqueue_rewatch_pids() call which just enqueues a request of
watching/tidying up the PID sets (with one exception: in
scope_enter_signal() and service_enter_signal() we'll still do
unit_watch_all_pids() synchronously first, since we really want to know
all processes we are about to kill so that we can track them properly.
Moreover, all direct invocations of unit_tidy_watch_pids() and
unit_synthesize_cgroup_empty_event() are removed too, when the
unit_enqueue_rewatch_pids() call is invoked, as the queued job will run
those operations too.
All of this is done on cgroupsv1 systems only, and is disabled on
cgroupsv2 systems as cgroup-empty notifications are reliable there, and
we do not need SIGCHLD events to track processes there.
Fixes: #9138
2018-05-31 15:41:59 +02:00
|
|
|
unit_dequeue_rewatch_pids(UNIT(s));
|
|
|
|
}
|
2014-02-06 17:17:51 +01:00
|
|
|
|
2013-07-01 00:03:57 +02:00
|
|
|
if (state != old_state)
|
2014-02-06 17:17:51 +01:00
|
|
|
log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state));
|
2013-07-01 00:03:57 +02:00
|
|
|
|
2018-06-01 19:06:19 +02:00
|
|
|
unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], 0);
|
2013-07-01 00:03:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static int scope_add_default_dependencies(Scope *s) {
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(s);
|
|
|
|
|
2015-11-11 20:42:39 +01:00
|
|
|
if (!UNIT(s)->default_dependencies)
|
|
|
|
return 0;
|
|
|
|
|
2013-07-01 00:03:57 +02:00
|
|
|
/* Make sure scopes are unloaded on shutdown */
|
|
|
|
r = unit_add_two_dependencies_by_name(
|
|
|
|
UNIT(s),
|
|
|
|
UNIT_BEFORE, UNIT_CONFLICTS,
|
2018-09-15 20:02:00 +02:00
|
|
|
SPECIAL_SHUTDOWN_TARGET, true,
|
core: track why unit dependencies came to be
This replaces the dependencies Set* objects by Hashmap* objects, where
the key is the depending Unit, and the value is a bitmask encoding why
the specific dependency was created.
The bitmask contains a number of different, defined bits, that indicate
why dependencies exist, for example whether they are created due to
explicitly configured deps in files, by udev rules or implicitly.
Note that memory usage is not increased by this change, even though we
store more information, as we manage to encode the bit mask inside the
value pointer each Hashmap entry contains.
Why this all? When we know how a dependency came to be, we can update
dependencies correctly when a configuration source changes but others
are left unaltered. Specifically:
1. We can fix UDEV_WANTS dependency generation: so far we kept adding
dependencies configured that way, but if a device lost such a
dependency we couldn't them again as there was no scheme for removing
of dependencies in place.
2. We can implement "pin-pointed" reload of unit files. If we know what
dependencies were created as result of configuration in a unit file,
then we know what to flush out when we want to reload it.
3. It's useful for debugging: "systemd-analyze dump" now shows
this information, helping substantially with understanding how
systemd's dependency tree came to be the way it came to be.
2017-10-25 20:46:01 +02:00
|
|
|
UNIT_DEPENDENCY_DEFAULT);
|
2013-07-01 00:03:57 +02:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int scope_verify(Scope *s) {
|
|
|
|
assert(s);
|
|
|
|
|
|
|
|
if (UNIT(s)->load_state != UNIT_LOADED)
|
|
|
|
return 0;
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (set_isempty(UNIT(s)->pids) &&
|
2016-02-24 21:36:09 +01:00
|
|
|
!MANAGER_IS_RELOADING(UNIT(s)->manager) &&
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
!unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) {
|
core,network: major per-object logging rework
This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.
Also contains a couple of other logging improvements:
- Drops a couple of strerror() invocations in favour of using %m.
- Not only .mount units now warn if a symlinks exist for the mount
point already, .automount units do that too, now.
- A few invocations of log_struct() that didn't actually pass any
additional structured data have been replaced by simpler invocations
of log_unit_info() and friends.
- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
that works like LOG_MESSAGE() but prefixes the message with the unit
name. Similar, there's now LOG_LINK_MESSAGE() and
LOG_NETDEV_MESSAGE().
- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
LOG_NETDEV_INTERFACE() macros have been added that generate the
necessary per object fields. The old log_unit_struct() call has been
removed in favour of these new macros used in raw log_struct()
invocations. In addition to removing one more function call this
allows generated structured log messages that contain two object
fields, as necessary for example for network interfaces that are
joined into another network interface, and whose messages shall be
indexed by both.
- The LOG_ERRNO() macro has been removed, in favour of
log_struct_errno(). The latter has the benefit of ensuring that %m in
format strings is properly resolved to the specified error number.
- A number of logging messages have been converted to use
log_unit_info() instead of log_info()
- The client code in sysv-generator no longer #includes core code from
src/core/.
- log_unit_full_errno() has been removed, log_unit_full() instead takes
an errno now, too.
- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
|
|
|
log_unit_error(UNIT(s), "Scope has no PIDs. Refusing.");
|
2018-06-01 18:06:54 +02:00
|
|
|
return -ENOENT;
|
2013-07-01 00:03:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-10-24 20:37:54 +02:00
|
|
|
static int scope_load_init_scope(Unit *u) {
|
|
|
|
assert(u);
|
|
|
|
|
|
|
|
if (!unit_has_name(u, SPECIAL_INIT_SCOPE))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
u->transient = true;
|
2016-10-24 21:41:54 +02:00
|
|
|
u->perpetual = true;
|
2016-10-24 20:37:54 +02:00
|
|
|
|
|
|
|
/* init.scope is a bit special, as it has to stick around forever. Because of its special semantics we
|
|
|
|
* synthesize it here, instead of relying on the unit file on disk. */
|
|
|
|
|
|
|
|
u->default_dependencies = false;
|
|
|
|
|
|
|
|
/* Prettify things, if we can. */
|
|
|
|
if (!u->description)
|
|
|
|
u->description = strdup("System and Service Manager");
|
|
|
|
if (!u->documentation)
|
|
|
|
(void) strv_extend(&u->documentation, "man:systemd(1)");
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2013-07-01 00:03:57 +02:00
|
|
|
static int scope_load(Unit *u) {
|
|
|
|
Scope *s = SCOPE(u);
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(s);
|
|
|
|
assert(u->load_state == UNIT_STUB);
|
|
|
|
|
2016-02-24 21:36:09 +01:00
|
|
|
if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
|
2016-04-07 15:43:59 +02:00
|
|
|
/* Refuse to load non-transient scope units, but allow them while reloading. */
|
2013-07-01 00:03:57 +02:00
|
|
|
return -ENOENT;
|
|
|
|
|
2016-10-24 20:37:54 +02:00
|
|
|
r = scope_load_init_scope(u);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
2016-04-07 15:43:59 +02:00
|
|
|
r = unit_load_fragment_and_dropin_optional(u);
|
2013-07-01 00:03:57 +02:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
2016-04-07 15:43:59 +02:00
|
|
|
if (u->load_state == UNIT_LOADED) {
|
|
|
|
r = unit_patch_contexts(u);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
2014-03-19 20:40:05 +01:00
|
|
|
|
2016-04-07 15:43:59 +02:00
|
|
|
r = unit_set_default_slice(u);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
2013-07-01 00:03:57 +02:00
|
|
|
|
2016-04-07 15:43:59 +02:00
|
|
|
r = scope_add_default_dependencies(s);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
}
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
return scope_verify(s);
|
|
|
|
}
|
|
|
|
|
2015-04-24 15:27:19 +02:00
|
|
|
static int scope_coldplug(Unit *u) {
|
2013-07-01 00:03:57 +02:00
|
|
|
Scope *s = SCOPE(u);
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(s);
|
|
|
|
assert(s->state == SCOPE_DEAD);
|
|
|
|
|
core: rework unit timeout handling, and add new setting RuntimeMaxSec=
This clean-ups timeout handling in PID 1. Specifically, instead of storing 0 in internal timeout variables as
indication for a disabled timeout, use USEC_INFINITY which is in-line with how we do this in the rest of our code
(following the logic that 0 means "no", and USEC_INFINITY means "never").
This also replace all usec_t additions with invocations to usec_add(), so that USEC_INFINITY is properly propagated,
and sd-event considers it has indication for turning off the event source.
This also alters the deserialization of the units to restart timeouts from the time they were originally started from.
Before this patch timeouts would be restarted beginning with the time of the deserialization, which could lead to
artificially prolonged timeouts if a daemon reload took place.
Finally, a new RuntimeMaxSec= setting is introduced for service units, that specifies a maximum runtime after which a
specific service is forcibly terminated. This is useful to put time limits on time-intensive processing jobs.
This also simplifies the various xyz_spawn() calls of the various types in that explicit distruction of the timers is
removed, as that is done anyway by the state change handlers, and a state change is always done when the xyz_spawn()
calls fail.
Fixes: #2249
2016-02-01 21:48:10 +01:00
|
|
|
if (s->deserialized_state == s->state)
|
|
|
|
return 0;
|
2014-02-06 17:17:51 +01:00
|
|
|
|
core: rework unit timeout handling, and add new setting RuntimeMaxSec=
This clean-ups timeout handling in PID 1. Specifically, instead of storing 0 in internal timeout variables as
indication for a disabled timeout, use USEC_INFINITY which is in-line with how we do this in the rest of our code
(following the logic that 0 means "no", and USEC_INFINITY means "never").
This also replace all usec_t additions with invocations to usec_add(), so that USEC_INFINITY is properly propagated,
and sd-event considers it has indication for turning off the event source.
This also alters the deserialization of the units to restart timeouts from the time they were originally started from.
Before this patch timeouts would be restarted beginning with the time of the deserialization, which could lead to
artificially prolonged timeouts if a daemon reload took place.
Finally, a new RuntimeMaxSec= setting is introduced for service units, that specifies a maximum runtime after which a
specific service is forcibly terminated. This is useful to put time limits on time-intensive processing jobs.
This also simplifies the various xyz_spawn() calls of the various types in that explicit distruction of the timers is
removed, as that is done anyway by the state change handlers, and a state change is always done when the xyz_spawn()
calls fail.
Fixes: #2249
2016-02-01 21:48:10 +01:00
|
|
|
if (IN_SET(s->deserialized_state, SCOPE_STOP_SIGKILL, SCOPE_STOP_SIGTERM)) {
|
|
|
|
r = scope_arm_timer(s, usec_add(u->state_change_timestamp.monotonic, s->timeout_stop_usec));
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
2013-07-01 00:03:57 +02:00
|
|
|
}
|
|
|
|
|
core: rework unit timeout handling, and add new setting RuntimeMaxSec=
This clean-ups timeout handling in PID 1. Specifically, instead of storing 0 in internal timeout variables as
indication for a disabled timeout, use USEC_INFINITY which is in-line with how we do this in the rest of our code
(following the logic that 0 means "no", and USEC_INFINITY means "never").
This also replace all usec_t additions with invocations to usec_add(), so that USEC_INFINITY is properly propagated,
and sd-event considers it has indication for turning off the event source.
This also alters the deserialization of the units to restart timeouts from the time they were originally started from.
Before this patch timeouts would be restarted beginning with the time of the deserialization, which could lead to
artificially prolonged timeouts if a daemon reload took place.
Finally, a new RuntimeMaxSec= setting is introduced for service units, that specifies a maximum runtime after which a
specific service is forcibly terminated. This is useful to put time limits on time-intensive processing jobs.
This also simplifies the various xyz_spawn() calls of the various types in that explicit distruction of the timers is
removed, as that is done anyway by the state change handlers, and a state change is always done when the xyz_spawn()
calls fail.
Fixes: #2249
2016-02-01 21:48:10 +01:00
|
|
|
if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED))
|
core: rework how we track service and scope PIDs
This reworks how systemd tracks processes on cgroupv1 systems where
cgroup notification is not reliable. Previously, whenever we had reason
to believe that new processes showed up or got removed we'd scan the
cgroup of the scope or service unit for new processes, and would tidy up
the list of PIDs previously watched. This scanning is relatively slow,
and does not scale well. With this change behaviour is changed: instead
of scanning for new/removed processes right away we do this work in a
per-unit deferred event loop job. This event source is scheduled at a
very low priority, so that it is executed when we have time but does not
starve other event sources. This has two benefits: this expensive work is
coalesced, if events happen in quick succession, and we won't delay
SIGCHLD handling for too long.
This patch basically replaces all direct invocation of
unit_watch_all_pids() in scope.c and service.c with invocations of the
new unit_enqueue_rewatch_pids() call which just enqueues a request of
watching/tidying up the PID sets (with one exception: in
scope_enter_signal() and service_enter_signal() we'll still do
unit_watch_all_pids() synchronously first, since we really want to know
all processes we are about to kill so that we can track them properly.
Moreover, all direct invocations of unit_tidy_watch_pids() and
unit_synthesize_cgroup_empty_event() are removed too, when the
unit_enqueue_rewatch_pids() call is invoked, as the queued job will run
those operations too.
All of this is done on cgroupsv1 systems only, and is disabled on
cgroupsv2 systems as cgroup-empty notifications are reliable there, and
we do not need SIGCHLD events to track processes there.
Fixes: #9138
2018-05-31 15:41:59 +02:00
|
|
|
(void) unit_enqueue_rewatch_pids(u);
|
core: rework unit timeout handling, and add new setting RuntimeMaxSec=
This clean-ups timeout handling in PID 1. Specifically, instead of storing 0 in internal timeout variables as
indication for a disabled timeout, use USEC_INFINITY which is in-line with how we do this in the rest of our code
(following the logic that 0 means "no", and USEC_INFINITY means "never").
This also replace all usec_t additions with invocations to usec_add(), so that USEC_INFINITY is properly propagated,
and sd-event considers it has indication for turning off the event source.
This also alters the deserialization of the units to restart timeouts from the time they were originally started from.
Before this patch timeouts would be restarted beginning with the time of the deserialization, which could lead to
artificially prolonged timeouts if a daemon reload took place.
Finally, a new RuntimeMaxSec= setting is introduced for service units, that specifies a maximum runtime after which a
specific service is forcibly terminated. This is useful to put time limits on time-intensive processing jobs.
This also simplifies the various xyz_spawn() calls of the various types in that explicit distruction of the timers is
removed, as that is done anyway by the state change handlers, and a state change is always done when the xyz_spawn()
calls fail.
Fixes: #2249
2016-02-01 21:48:10 +01:00
|
|
|
|
2017-11-23 20:15:48 +01:00
|
|
|
bus_scope_track_controller(s);
|
|
|
|
|
core: rework unit timeout handling, and add new setting RuntimeMaxSec=
This clean-ups timeout handling in PID 1. Specifically, instead of storing 0 in internal timeout variables as
indication for a disabled timeout, use USEC_INFINITY which is in-line with how we do this in the rest of our code
(following the logic that 0 means "no", and USEC_INFINITY means "never").
This also replace all usec_t additions with invocations to usec_add(), so that USEC_INFINITY is properly propagated,
and sd-event considers it has indication for turning off the event source.
This also alters the deserialization of the units to restart timeouts from the time they were originally started from.
Before this patch timeouts would be restarted beginning with the time of the deserialization, which could lead to
artificially prolonged timeouts if a daemon reload took place.
Finally, a new RuntimeMaxSec= setting is introduced for service units, that specifies a maximum runtime after which a
specific service is forcibly terminated. This is useful to put time limits on time-intensive processing jobs.
This also simplifies the various xyz_spawn() calls of the various types in that explicit distruction of the timers is
removed, as that is done anyway by the state change handlers, and a state change is always done when the xyz_spawn()
calls fail.
Fixes: #2249
2016-02-01 21:48:10 +01:00
|
|
|
scope_set_state(s, s->deserialized_state);
|
2013-07-01 00:03:57 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void scope_dump(Unit *u, FILE *f, const char *prefix) {
|
|
|
|
Scope *s = SCOPE(u);
|
|
|
|
|
|
|
|
assert(s);
|
|
|
|
assert(f);
|
|
|
|
|
|
|
|
fprintf(f,
|
|
|
|
"%sScope State: %s\n"
|
|
|
|
"%sResult: %s\n",
|
|
|
|
prefix, scope_state_to_string(s->state),
|
|
|
|
prefix, scope_result_to_string(s->result));
|
|
|
|
|
|
|
|
cgroup_context_dump(&s->cgroup_context, f, prefix);
|
|
|
|
kill_context_dump(&s->kill_context, f, prefix);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void scope_enter_dead(Scope *s, ScopeResult f) {
|
|
|
|
assert(s);
|
|
|
|
|
core: remember first unit failure, not last unit failure
Previously, the result value of a unit was overriden with each failure that
took place, so that the result always reported the last failure that took
place.
With this commit this is changed, so that the first failure taking place is
stored instead. This should normally not matter much as multiple failures are
sufficiently uncommon. However, it improves one behaviour: if we send SIGABRT
to a service due to a watchdog timeout, then this currently would be reported
as "coredump" failure, rather than the "watchodg" failure it really is. Hence,
in order to report information about the type of the failure, and not about
the effect of it, let's change this from all unit type to store the first, not
the last failure.
This addresses the issue pointed out here:
https://github.com/systemd/systemd/pull/3818#discussion_r73433520
2016-08-04 21:14:27 +02:00
|
|
|
if (s->result == SCOPE_SUCCESS)
|
2013-07-01 00:03:57 +02:00
|
|
|
s->result = f;
|
|
|
|
|
2018-11-16 10:09:45 +01:00
|
|
|
unit_log_result(UNIT(s), s->result == SCOPE_SUCCESS, scope_result_to_string(s->result));
|
2013-07-01 00:03:57 +02:00
|
|
|
scope_set_state(s, s->result != SCOPE_SUCCESS ? SCOPE_FAILED : SCOPE_DEAD);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
|
2014-01-31 17:45:13 +01:00
|
|
|
bool skip_signal = false;
|
2013-07-01 00:03:57 +02:00
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(s);
|
|
|
|
|
core: remember first unit failure, not last unit failure
Previously, the result value of a unit was overriden with each failure that
took place, so that the result always reported the last failure that took
place.
With this commit this is changed, so that the first failure taking place is
stored instead. This should normally not matter much as multiple failures are
sufficiently uncommon. However, it improves one behaviour: if we send SIGABRT
to a service due to a watchdog timeout, then this currently would be reported
as "coredump" failure, rather than the "watchodg" failure it really is. Hence,
in order to report information about the type of the failure, and not about
the effect of it, let's change this from all unit type to store the first, not
the last failure.
This addresses the issue pointed out here:
https://github.com/systemd/systemd/pull/3818#discussion_r73433520
2016-08-04 21:14:27 +02:00
|
|
|
if (s->result == SCOPE_SUCCESS)
|
2013-07-01 00:03:57 +02:00
|
|
|
s->result = f;
|
|
|
|
|
core: rework how we track service and scope PIDs
This reworks how systemd tracks processes on cgroupv1 systems where
cgroup notification is not reliable. Previously, whenever we had reason
to believe that new processes showed up or got removed we'd scan the
cgroup of the scope or service unit for new processes, and would tidy up
the list of PIDs previously watched. This scanning is relatively slow,
and does not scale well. With this change behaviour is changed: instead
of scanning for new/removed processes right away we do this work in a
per-unit deferred event loop job. This event source is scheduled at a
very low priority, so that it is executed when we have time but does not
starve other event sources. This has two benefits: this expensive work is
coalesced, if events happen in quick succession, and we won't delay
SIGCHLD handling for too long.
This patch basically replaces all direct invocation of
unit_watch_all_pids() in scope.c and service.c with invocations of the
new unit_enqueue_rewatch_pids() call which just enqueues a request of
watching/tidying up the PID sets (with one exception: in
scope_enter_signal() and service_enter_signal() we'll still do
unit_watch_all_pids() synchronously first, since we really want to know
all processes we are about to kill so that we can track them properly.
Moreover, all direct invocations of unit_tidy_watch_pids() and
unit_synthesize_cgroup_empty_event() are removed too, when the
unit_enqueue_rewatch_pids() call is invoked, as the queued job will run
those operations too.
All of this is done on cgroupsv1 systems only, and is disabled on
cgroupsv2 systems as cgroup-empty notifications are reliable there, and
we do not need SIGCHLD events to track processes there.
Fixes: #9138
2018-05-31 15:41:59 +02:00
|
|
|
/* Before sending any signal, make sure we track all members of this cgroup */
|
|
|
|
(void) unit_watch_all_pids(UNIT(s));
|
|
|
|
|
|
|
|
/* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have
|
|
|
|
* died now */
|
|
|
|
(void) unit_enqueue_rewatch_pids(UNIT(s));
|
2014-02-06 17:17:51 +01:00
|
|
|
|
2017-11-23 20:15:48 +01:00
|
|
|
/* If we have a controller set let's ask the controller nicely to terminate the scope, instead of us going
|
|
|
|
* directly into SIGTERM berserk mode */
|
2014-01-31 17:45:13 +01:00
|
|
|
if (state == SCOPE_STOP_SIGTERM)
|
|
|
|
skip_signal = bus_scope_send_request_stop(s) > 0;
|
|
|
|
|
2016-11-15 21:01:40 +01:00
|
|
|
if (skip_signal)
|
|
|
|
r = 1; /* wait */
|
|
|
|
else {
|
2014-01-31 17:45:13 +01:00
|
|
|
r = unit_kill_context(
|
|
|
|
UNIT(s),
|
|
|
|
&s->kill_context,
|
2016-07-20 13:41:32 +02:00
|
|
|
state != SCOPE_STOP_SIGTERM ? KILL_KILL :
|
|
|
|
s->was_abandoned ? KILL_TERMINATE_AND_LOG :
|
|
|
|
KILL_TERMINATE,
|
2014-01-31 17:45:13 +01:00
|
|
|
-1, -1, false);
|
|
|
|
if (r < 0)
|
|
|
|
goto fail;
|
2016-11-15 21:01:40 +01:00
|
|
|
}
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
if (r > 0) {
|
core: rework unit timeout handling, and add new setting RuntimeMaxSec=
This clean-ups timeout handling in PID 1. Specifically, instead of storing 0 in internal timeout variables as
indication for a disabled timeout, use USEC_INFINITY which is in-line with how we do this in the rest of our code
(following the logic that 0 means "no", and USEC_INFINITY means "never").
This also replace all usec_t additions with invocations to usec_add(), so that USEC_INFINITY is properly propagated,
and sd-event considers it has indication for turning off the event source.
This also alters the deserialization of the units to restart timeouts from the time they were originally started from.
Before this patch timeouts would be restarted beginning with the time of the deserialization, which could lead to
artificially prolonged timeouts if a daemon reload took place.
Finally, a new RuntimeMaxSec= setting is introduced for service units, that specifies a maximum runtime after which a
specific service is forcibly terminated. This is useful to put time limits on time-intensive processing jobs.
This also simplifies the various xyz_spawn() calls of the various types in that explicit distruction of the timers is
removed, as that is done anyway by the state change handlers, and a state change is always done when the xyz_spawn()
calls fail.
Fixes: #2249
2016-02-01 21:48:10 +01:00
|
|
|
r = scope_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec));
|
2013-11-19 21:12:59 +01:00
|
|
|
if (r < 0)
|
|
|
|
goto fail;
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
scope_set_state(s, state);
|
2014-01-29 14:58:04 +01:00
|
|
|
} else if (state == SCOPE_STOP_SIGTERM)
|
|
|
|
scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_SUCCESS);
|
|
|
|
else
|
2013-07-01 00:03:57 +02:00
|
|
|
scope_enter_dead(s, SCOPE_SUCCESS);
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail:
|
core,network: major per-object logging rework
This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.
Also contains a couple of other logging improvements:
- Drops a couple of strerror() invocations in favour of using %m.
- Not only .mount units now warn if a symlinks exist for the mount
point already, .automount units do that too, now.
- A few invocations of log_struct() that didn't actually pass any
additional structured data have been replaced by simpler invocations
of log_unit_info() and friends.
- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
that works like LOG_MESSAGE() but prefixes the message with the unit
name. Similar, there's now LOG_LINK_MESSAGE() and
LOG_NETDEV_MESSAGE().
- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
LOG_NETDEV_INTERFACE() macros have been added that generate the
necessary per object fields. The old log_unit_struct() call has been
removed in favour of these new macros used in raw log_struct()
invocations. In addition to removing one more function call this
allows generated structured log messages that contain two object
fields, as necessary for example for network interfaces that are
joined into another network interface, and whose messages shall be
indexed by both.
- The LOG_ERRNO() macro has been removed, in favour of
log_struct_errno(). The latter has the benefit of ensuring that %m in
format strings is properly resolved to the specified error number.
- A number of logging messages have been converted to use
log_unit_info() instead of log_info()
- The client code in sysv-generator no longer #includes core code from
src/core/.
- log_unit_full_errno() has been removed, log_unit_full() instead takes
an errno now, too.
- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
|
|
|
log_unit_warning_errno(UNIT(s), r, "Failed to kill processes: %m");
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int scope_start(Unit *u) {
|
|
|
|
Scope *s = SCOPE(u);
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(s);
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (unit_has_name(u, SPECIAL_INIT_SCOPE))
|
|
|
|
return -EPERM;
|
|
|
|
|
2013-09-11 19:26:47 +02:00
|
|
|
if (s->state == SCOPE_FAILED)
|
|
|
|
return -EPERM;
|
|
|
|
|
2015-04-28 12:20:29 +02:00
|
|
|
/* We can't fulfill this right now, please try again later */
|
2017-09-29 00:37:23 +02:00
|
|
|
if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
|
2013-07-01 00:03:57 +02:00
|
|
|
return -EAGAIN;
|
|
|
|
|
|
|
|
assert(s->state == SCOPE_DEAD);
|
|
|
|
|
2016-02-24 21:36:09 +01:00
|
|
|
if (!u->transient && !MANAGER_IS_RELOADING(u->manager))
|
2013-07-01 00:03:57 +02:00
|
|
|
return -ENOENT;
|
|
|
|
|
2017-11-23 20:15:48 +01:00
|
|
|
(void) bus_scope_track_controller(s);
|
|
|
|
|
2016-08-30 23:18:46 +02:00
|
|
|
r = unit_acquire_invocation_id(u);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
2015-03-01 16:24:19 +01:00
|
|
|
(void) unit_realize_cgroup(u);
|
2017-09-05 19:27:53 +02:00
|
|
|
(void) unit_reset_cpu_accounting(u);
|
|
|
|
(void) unit_reset_ip_accounting(u);
|
2015-03-01 16:24:19 +01:00
|
|
|
|
core: implement /run/systemd/units/-based path for passing unit info from PID 1 to journald
And let's make use of it to implement two new unit settings with it:
1. LogLevelMax= is a new per-unit setting that may be used to configure
log priority filtering: set it to LogLevelMax=notice and only
messages of level "notice" and lower (i.e. more important) will be
processed, all others are dropped.
2. LogExtraFields= is a new per-unit setting for configuring per-unit
journal fields, that are implicitly included in every log record
generated by the unit's processes. It takes field/value pairs in the
form of FOO=BAR.
Also, related to this, one exisiting unit setting is ported to this new
facility:
3. The invocation ID is now pulled from /run/systemd/units/ instead of
cgroupfs xattrs. This substantially relaxes requirements of systemd
on the kernel version and the privileges it runs with (specifically,
cgroupfs xattrs are not available in containers, since they are
stored in kernel memory, and hence are unsafe to permit to lesser
privileged code).
/run/systemd/units/ is a new directory, which contains a number of files
and symlinks encoding the above information. PID 1 creates and manages
these files, and journald reads them from there.
Note that this is supposed to be a direct path between PID 1 and the
journal only, due to the special runtime environment the journal runs
in. Normally, today we shouldn't introduce new interfaces that (mis-)use
a file system as IPC framework, and instead just an IPC system, but this
is very hard to do between the journal and PID 1, as long as the IPC
system is a subject PID 1 manages, and itself a client to the journal.
This patch cleans up a couple of types used in journal code:
specifically we switch to size_t for a couple of memory-sizing values,
as size_t is the right choice for everything that is memory.
Fixes: #4089
Fixes: #3041
Fixes: #4441
2017-11-02 19:43:32 +01:00
|
|
|
unit_export_state_files(UNIT(s));
|
|
|
|
|
2018-02-07 22:52:52 +01:00
|
|
|
r = unit_attach_pids_to_cgroup(u, UNIT(s)->pids, NULL);
|
2015-04-28 12:20:29 +02:00
|
|
|
if (r < 0) {
|
core,network: major per-object logging rework
This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.
Also contains a couple of other logging improvements:
- Drops a couple of strerror() invocations in favour of using %m.
- Not only .mount units now warn if a symlinks exist for the mount
point already, .automount units do that too, now.
- A few invocations of log_struct() that didn't actually pass any
additional structured data have been replaced by simpler invocations
of log_unit_info() and friends.
- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
that works like LOG_MESSAGE() but prefixes the message with the unit
name. Similar, there's now LOG_LINK_MESSAGE() and
LOG_NETDEV_MESSAGE().
- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
LOG_NETDEV_INTERFACE() macros have been added that generate the
necessary per object fields. The old log_unit_struct() call has been
removed in favour of these new macros used in raw log_struct()
invocations. In addition to removing one more function call this
allows generated structured log messages that contain two object
fields, as necessary for example for network interfaces that are
joined into another network interface, and whose messages shall be
indexed by both.
- The LOG_ERRNO() macro has been removed, in favour of
log_struct_errno(). The latter has the benefit of ensuring that %m in
format strings is properly resolved to the specified error number.
- A number of logging messages have been converted to use
log_unit_info() instead of log_info()
- The client code in sysv-generator no longer #includes core code from
src/core/.
- log_unit_full_errno() has been removed, log_unit_full() instead takes
an errno now, too.
- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
|
|
|
log_unit_warning_errno(UNIT(s), r, "Failed to add PIDs to scope's control group: %m");
|
2015-04-28 19:02:49 +02:00
|
|
|
scope_enter_dead(s, SCOPE_FAILURE_RESOURCES);
|
2013-07-01 00:03:57 +02:00
|
|
|
return r;
|
2015-04-28 12:20:29 +02:00
|
|
|
}
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
s->result = SCOPE_SUCCESS;
|
|
|
|
|
|
|
|
scope_set_state(s, SCOPE_RUNNING);
|
2018-05-31 15:50:46 +02:00
|
|
|
|
|
|
|
/* Start watching the PIDs currently in the scope */
|
|
|
|
(void) unit_enqueue_rewatch_pids(UNIT(s));
|
2015-01-28 15:07:13 +01:00
|
|
|
return 1;
|
2013-07-01 00:03:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static int scope_stop(Unit *u) {
|
|
|
|
Scope *s = SCOPE(u);
|
|
|
|
|
|
|
|
assert(s);
|
|
|
|
|
2017-09-29 00:37:23 +02:00
|
|
|
if (IN_SET(s->state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
|
2013-07-01 00:03:57 +02:00
|
|
|
return 0;
|
|
|
|
|
2017-09-29 00:37:23 +02:00
|
|
|
assert(IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED));
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS);
|
2015-01-28 15:07:13 +01:00
|
|
|
return 1;
|
2013-07-01 00:03:57 +02:00
|
|
|
}
|
|
|
|
|
2013-07-02 01:35:35 +02:00
|
|
|
static void scope_reset_failed(Unit *u) {
|
|
|
|
Scope *s = SCOPE(u);
|
|
|
|
|
|
|
|
assert(s);
|
|
|
|
|
|
|
|
if (s->state == SCOPE_FAILED)
|
|
|
|
scope_set_state(s, SCOPE_DEAD);
|
|
|
|
|
|
|
|
s->result = SCOPE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2013-11-19 21:12:59 +01:00
|
|
|
static int scope_kill(Unit *u, KillWho who, int signo, sd_bus_error *error) {
|
2013-07-01 00:03:57 +02:00
|
|
|
return unit_kill_common(u, who, signo, -1, -1, error);
|
|
|
|
}
|
|
|
|
|
2016-02-04 00:35:43 +01:00
|
|
|
static int scope_get_timeout(Unit *u, usec_t *timeout) {
|
2014-01-27 06:57:34 +01:00
|
|
|
Scope *s = SCOPE(u);
|
2016-02-04 00:35:43 +01:00
|
|
|
usec_t t;
|
2014-01-27 06:57:34 +01:00
|
|
|
int r;
|
|
|
|
|
|
|
|
if (!s->timer_event_source)
|
|
|
|
return 0;
|
|
|
|
|
2016-02-04 00:35:43 +01:00
|
|
|
r = sd_event_source_get_time(s->timer_event_source, &t);
|
2014-01-27 06:57:34 +01:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
2016-02-04 00:35:43 +01:00
|
|
|
if (t == USEC_INFINITY)
|
|
|
|
return 0;
|
2014-01-27 06:57:34 +01:00
|
|
|
|
2016-02-04 00:35:43 +01:00
|
|
|
*timeout = t;
|
2014-01-27 06:57:34 +01:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2013-07-01 00:03:57 +02:00
|
|
|
static int scope_serialize(Unit *u, FILE *f, FDSet *fds) {
|
|
|
|
Scope *s = SCOPE(u);
|
|
|
|
|
|
|
|
assert(s);
|
|
|
|
assert(f);
|
|
|
|
assert(fds);
|
|
|
|
|
2018-10-17 20:40:09 +02:00
|
|
|
(void) serialize_item(f, "state", scope_state_to_string(s->state));
|
|
|
|
(void) serialize_bool(f, "was-abandoned", s->was_abandoned);
|
2017-11-23 12:51:54 +01:00
|
|
|
|
|
|
|
if (s->controller)
|
2018-10-17 20:40:09 +02:00
|
|
|
(void) serialize_item(f, "controller", s->controller);
|
2017-11-23 12:51:54 +01:00
|
|
|
|
2013-07-01 00:03:57 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int scope_deserialize_item(Unit *u, const char *key, const char *value, FDSet *fds) {
|
|
|
|
Scope *s = SCOPE(u);
|
2017-11-23 12:51:54 +01:00
|
|
|
int r;
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
assert(u);
|
|
|
|
assert(key);
|
|
|
|
assert(value);
|
|
|
|
assert(fds);
|
|
|
|
|
|
|
|
if (streq(key, "state")) {
|
|
|
|
ScopeState state;
|
|
|
|
|
|
|
|
state = scope_state_from_string(value);
|
|
|
|
if (state < 0)
|
core,network: major per-object logging rework
This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.
Also contains a couple of other logging improvements:
- Drops a couple of strerror() invocations in favour of using %m.
- Not only .mount units now warn if a symlinks exist for the mount
point already, .automount units do that too, now.
- A few invocations of log_struct() that didn't actually pass any
additional structured data have been replaced by simpler invocations
of log_unit_info() and friends.
- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
that works like LOG_MESSAGE() but prefixes the message with the unit
name. Similar, there's now LOG_LINK_MESSAGE() and
LOG_NETDEV_MESSAGE().
- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
LOG_NETDEV_INTERFACE() macros have been added that generate the
necessary per object fields. The old log_unit_struct() call has been
removed in favour of these new macros used in raw log_struct()
invocations. In addition to removing one more function call this
allows generated structured log messages that contain two object
fields, as necessary for example for network interfaces that are
joined into another network interface, and whose messages shall be
indexed by both.
- The LOG_ERRNO() macro has been removed, in favour of
log_struct_errno(). The latter has the benefit of ensuring that %m in
format strings is properly resolved to the specified error number.
- A number of logging messages have been converted to use
log_unit_info() instead of log_info()
- The client code in sysv-generator no longer #includes core code from
src/core/.
- log_unit_full_errno() has been removed, log_unit_full() instead takes
an errno now, too.
- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
|
|
|
log_unit_debug(u, "Failed to parse state value: %s", value);
|
2013-07-01 00:03:57 +02:00
|
|
|
else
|
|
|
|
s->deserialized_state = state;
|
|
|
|
|
2016-07-20 13:41:32 +02:00
|
|
|
} else if (streq(key, "was-abandoned")) {
|
|
|
|
int k;
|
|
|
|
|
|
|
|
k = parse_boolean(value);
|
|
|
|
if (k < 0)
|
|
|
|
log_unit_debug(u, "Failed to parse boolean value: %s", value);
|
|
|
|
else
|
|
|
|
s->was_abandoned = k;
|
2017-11-23 12:51:54 +01:00
|
|
|
} else if (streq(key, "controller")) {
|
|
|
|
|
|
|
|
r = free_and_strdup(&s->controller, value);
|
|
|
|
if (r < 0)
|
2018-10-17 20:40:09 +02:00
|
|
|
return log_oom();
|
2017-11-23 12:51:54 +01:00
|
|
|
|
2013-07-01 00:03:57 +02:00
|
|
|
} else
|
core,network: major per-object logging rework
This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.
Also contains a couple of other logging improvements:
- Drops a couple of strerror() invocations in favour of using %m.
- Not only .mount units now warn if a symlinks exist for the mount
point already, .automount units do that too, now.
- A few invocations of log_struct() that didn't actually pass any
additional structured data have been replaced by simpler invocations
of log_unit_info() and friends.
- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
that works like LOG_MESSAGE() but prefixes the message with the unit
name. Similar, there's now LOG_LINK_MESSAGE() and
LOG_NETDEV_MESSAGE().
- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
LOG_NETDEV_INTERFACE() macros have been added that generate the
necessary per object fields. The old log_unit_struct() call has been
removed in favour of these new macros used in raw log_struct()
invocations. In addition to removing one more function call this
allows generated structured log messages that contain two object
fields, as necessary for example for network interfaces that are
joined into another network interface, and whose messages shall be
indexed by both.
- The LOG_ERRNO() macro has been removed, in favour of
log_struct_errno(). The latter has the benefit of ensuring that %m in
format strings is properly resolved to the specified error number.
- A number of logging messages have been converted to use
log_unit_info() instead of log_info()
- The client code in sysv-generator no longer #includes core code from
src/core/.
- log_unit_full_errno() has been removed, log_unit_full() instead takes
an errno now, too.
- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
|
|
|
log_unit_debug(u, "Unknown serialization key: %s", key);
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-02-06 17:17:51 +01:00
|
|
|
static void scope_notify_cgroup_empty_event(Unit *u) {
|
|
|
|
Scope *s = SCOPE(u);
|
|
|
|
assert(u);
|
|
|
|
|
core,network: major per-object logging rework
This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.
Also contains a couple of other logging improvements:
- Drops a couple of strerror() invocations in favour of using %m.
- Not only .mount units now warn if a symlinks exist for the mount
point already, .automount units do that too, now.
- A few invocations of log_struct() that didn't actually pass any
additional structured data have been replaced by simpler invocations
of log_unit_info() and friends.
- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
that works like LOG_MESSAGE() but prefixes the message with the unit
name. Similar, there's now LOG_LINK_MESSAGE() and
LOG_NETDEV_MESSAGE().
- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
LOG_NETDEV_INTERFACE() macros have been added that generate the
necessary per object fields. The old log_unit_struct() call has been
removed in favour of these new macros used in raw log_struct()
invocations. In addition to removing one more function call this
allows generated structured log messages that contain two object
fields, as necessary for example for network interfaces that are
joined into another network interface, and whose messages shall be
indexed by both.
- The LOG_ERRNO() macro has been removed, in favour of
log_struct_errno(). The latter has the benefit of ensuring that %m in
format strings is properly resolved to the specified error number.
- A number of logging messages have been converted to use
log_unit_info() instead of log_info()
- The client code in sysv-generator no longer #includes core code from
src/core/.
- log_unit_full_errno() has been removed, log_unit_full() instead takes
an errno now, too.
- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
|
|
|
log_unit_debug(u, "cgroup is empty");
|
2014-02-06 17:17:51 +01:00
|
|
|
|
|
|
|
if (IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
|
|
|
|
scope_enter_dead(s, SCOPE_SUCCESS);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
|
2018-01-12 13:06:48 +01:00
|
|
|
assert(u);
|
2014-02-06 17:17:51 +01:00
|
|
|
|
2018-01-12 13:06:48 +01:00
|
|
|
/* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to
|
|
|
|
* watch, under the assumption that we'll sooner or later get a SIGCHLD for them, as the original
|
|
|
|
* process we watched was probably the parent of them, and they are hence now our children. */
|
2014-02-06 17:17:51 +01:00
|
|
|
|
core: rework how we track service and scope PIDs
This reworks how systemd tracks processes on cgroupv1 systems where
cgroup notification is not reliable. Previously, whenever we had reason
to believe that new processes showed up or got removed we'd scan the
cgroup of the scope or service unit for new processes, and would tidy up
the list of PIDs previously watched. This scanning is relatively slow,
and does not scale well. With this change behaviour is changed: instead
of scanning for new/removed processes right away we do this work in a
per-unit deferred event loop job. This event source is scheduled at a
very low priority, so that it is executed when we have time but does not
starve other event sources. This has two benefits: this expensive work is
coalesced, if events happen in quick succession, and we won't delay
SIGCHLD handling for too long.
This patch basically replaces all direct invocation of
unit_watch_all_pids() in scope.c and service.c with invocations of the
new unit_enqueue_rewatch_pids() call which just enqueues a request of
watching/tidying up the PID sets (with one exception: in
scope_enter_signal() and service_enter_signal() we'll still do
unit_watch_all_pids() synchronously first, since we really want to know
all processes we are about to kill so that we can track them properly.
Moreover, all direct invocations of unit_tidy_watch_pids() and
unit_synthesize_cgroup_empty_event() are removed too, when the
unit_enqueue_rewatch_pids() call is invoked, as the queued job will run
those operations too.
All of this is done on cgroupsv1 systems only, and is disabled on
cgroupsv2 systems as cgroup-empty notifications are reliable there, and
we do not need SIGCHLD events to track processes there.
Fixes: #9138
2018-05-31 15:41:59 +02:00
|
|
|
(void) unit_enqueue_rewatch_pids(u);
|
2014-02-06 17:17:51 +01:00
|
|
|
}
|
|
|
|
|
2013-11-19 21:12:59 +01:00
|
|
|
static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
|
|
|
|
Scope *s = SCOPE(userdata);
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
assert(s);
|
2013-11-19 21:12:59 +01:00
|
|
|
assert(s->timer_event_source == source);
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
switch (s->state) {
|
|
|
|
|
|
|
|
case SCOPE_STOP_SIGTERM:
|
|
|
|
if (s->kill_context.send_sigkill) {
|
core,network: major per-object logging rework
This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.
Also contains a couple of other logging improvements:
- Drops a couple of strerror() invocations in favour of using %m.
- Not only .mount units now warn if a symlinks exist for the mount
point already, .automount units do that too, now.
- A few invocations of log_struct() that didn't actually pass any
additional structured data have been replaced by simpler invocations
of log_unit_info() and friends.
- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
that works like LOG_MESSAGE() but prefixes the message with the unit
name. Similar, there's now LOG_LINK_MESSAGE() and
LOG_NETDEV_MESSAGE().
- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
LOG_NETDEV_INTERFACE() macros have been added that generate the
necessary per object fields. The old log_unit_struct() call has been
removed in favour of these new macros used in raw log_struct()
invocations. In addition to removing one more function call this
allows generated structured log messages that contain two object
fields, as necessary for example for network interfaces that are
joined into another network interface, and whose messages shall be
indexed by both.
- The LOG_ERRNO() macro has been removed, in favour of
log_struct_errno(). The latter has the benefit of ensuring that %m in
format strings is properly resolved to the specified error number.
- A number of logging messages have been converted to use
log_unit_info() instead of log_info()
- The client code in sysv-generator no longer #includes core code from
src/core/.
- log_unit_full_errno() has been removed, log_unit_full() instead takes
an errno now, too.
- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
|
|
|
log_unit_warning(UNIT(s), "Stopping timed out. Killing.");
|
2013-07-01 00:03:57 +02:00
|
|
|
scope_enter_signal(s, SCOPE_STOP_SIGKILL, SCOPE_FAILURE_TIMEOUT);
|
|
|
|
} else {
|
core,network: major per-object logging rework
This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.
Also contains a couple of other logging improvements:
- Drops a couple of strerror() invocations in favour of using %m.
- Not only .mount units now warn if a symlinks exist for the mount
point already, .automount units do that too, now.
- A few invocations of log_struct() that didn't actually pass any
additional structured data have been replaced by simpler invocations
of log_unit_info() and friends.
- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
that works like LOG_MESSAGE() but prefixes the message with the unit
name. Similar, there's now LOG_LINK_MESSAGE() and
LOG_NETDEV_MESSAGE().
- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
LOG_NETDEV_INTERFACE() macros have been added that generate the
necessary per object fields. The old log_unit_struct() call has been
removed in favour of these new macros used in raw log_struct()
invocations. In addition to removing one more function call this
allows generated structured log messages that contain two object
fields, as necessary for example for network interfaces that are
joined into another network interface, and whose messages shall be
indexed by both.
- The LOG_ERRNO() macro has been removed, in favour of
log_struct_errno(). The latter has the benefit of ensuring that %m in
format strings is properly resolved to the specified error number.
- A number of logging messages have been converted to use
log_unit_info() instead of log_info()
- The client code in sysv-generator no longer #includes core code from
src/core/.
- log_unit_full_errno() has been removed, log_unit_full() instead takes
an errno now, too.
- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
|
|
|
log_unit_warning(UNIT(s), "Stopping timed out. Skipping SIGKILL.");
|
2013-07-01 00:03:57 +02:00
|
|
|
scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
case SCOPE_STOP_SIGKILL:
|
core,network: major per-object logging rework
This changes log_unit_info() (and friends) to take a real Unit* object
insted of just a unit name as parameter. The call will now prefix all
logged messages with the unit name, thus allowing the unit name to be
dropped from the various passed romat strings, simplifying invocations
drastically, and unifying log output across messages. Also, UNIT= vs.
USER_UNIT= is now derived from the Manager object attached to the Unit
object, instead of getpid(). This has the benefit of correcting the
field for --test runs.
Also contains a couple of other logging improvements:
- Drops a couple of strerror() invocations in favour of using %m.
- Not only .mount units now warn if a symlinks exist for the mount
point already, .automount units do that too, now.
- A few invocations of log_struct() that didn't actually pass any
additional structured data have been replaced by simpler invocations
of log_unit_info() and friends.
- For structured data a new LOG_UNIT_MESSAGE() macro has been added,
that works like LOG_MESSAGE() but prefixes the message with the unit
name. Similar, there's now LOG_LINK_MESSAGE() and
LOG_NETDEV_MESSAGE().
- For structured data new LOG_UNIT_ID(), LOG_LINK_INTERFACE(),
LOG_NETDEV_INTERFACE() macros have been added that generate the
necessary per object fields. The old log_unit_struct() call has been
removed in favour of these new macros used in raw log_struct()
invocations. In addition to removing one more function call this
allows generated structured log messages that contain two object
fields, as necessary for example for network interfaces that are
joined into another network interface, and whose messages shall be
indexed by both.
- The LOG_ERRNO() macro has been removed, in favour of
log_struct_errno(). The latter has the benefit of ensuring that %m in
format strings is properly resolved to the specified error number.
- A number of logging messages have been converted to use
log_unit_info() instead of log_info()
- The client code in sysv-generator no longer #includes core code from
src/core/.
- log_unit_full_errno() has been removed, log_unit_full() instead takes
an errno now, too.
- log_unit_info(), log_link_info(), log_netdev_info() and friends, now
avoid double evaluation of their parameters
2015-05-11 20:38:21 +02:00
|
|
|
log_unit_warning(UNIT(s), "Still around after SIGKILL. Ignoring.");
|
2013-07-01 00:03:57 +02:00
|
|
|
scope_enter_dead(s, SCOPE_FAILURE_TIMEOUT);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
assert_not_reached("Timeout at wrong time.");
|
|
|
|
}
|
2013-11-19 21:12:59 +01:00
|
|
|
|
|
|
|
return 0;
|
2013-07-01 00:03:57 +02:00
|
|
|
}
|
|
|
|
|
2014-02-06 17:17:51 +01:00
|
|
|
int scope_abandon(Scope *s) {
|
|
|
|
assert(s);
|
2013-07-01 00:03:57 +02:00
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE))
|
|
|
|
return -EPERM;
|
|
|
|
|
2014-02-06 17:17:51 +01:00
|
|
|
if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED))
|
|
|
|
return -ESTALE;
|
2013-07-01 00:03:57 +02:00
|
|
|
|
2016-07-20 13:41:32 +02:00
|
|
|
s->was_abandoned = true;
|
2017-11-23 20:15:48 +01:00
|
|
|
|
2015-09-08 18:43:11 +02:00
|
|
|
s->controller = mfree(s->controller);
|
2017-11-23 20:15:48 +01:00
|
|
|
s->controller_track = sd_bus_track_unref(s->controller_track);
|
|
|
|
|
2017-10-14 17:09:54 +02:00
|
|
|
scope_set_state(s, SCOPE_ABANDONED);
|
2013-07-01 00:03:57 +02:00
|
|
|
|
core: rework how we track service and scope PIDs
This reworks how systemd tracks processes on cgroupv1 systems where
cgroup notification is not reliable. Previously, whenever we had reason
to believe that new processes showed up or got removed we'd scan the
cgroup of the scope or service unit for new processes, and would tidy up
the list of PIDs previously watched. This scanning is relatively slow,
and does not scale well. With this change behaviour is changed: instead
of scanning for new/removed processes right away we do this work in a
per-unit deferred event loop job. This event source is scheduled at a
very low priority, so that it is executed when we have time but does not
starve other event sources. This has two benefits: this expensive work is
coalesced, if events happen in quick succession, and we won't delay
SIGCHLD handling for too long.
This patch basically replaces all direct invocation of
unit_watch_all_pids() in scope.c and service.c with invocations of the
new unit_enqueue_rewatch_pids() call which just enqueues a request of
watching/tidying up the PID sets (with one exception: in
scope_enter_signal() and service_enter_signal() we'll still do
unit_watch_all_pids() synchronously first, since we really want to know
all processes we are about to kill so that we can track them properly.
Moreover, all direct invocations of unit_tidy_watch_pids() and
unit_synthesize_cgroup_empty_event() are removed too, when the
unit_enqueue_rewatch_pids() call is invoked, as the queued job will run
those operations too.
All of this is done on cgroupsv1 systems only, and is disabled on
cgroupsv2 systems as cgroup-empty notifications are reliable there, and
we do not need SIGCHLD events to track processes there.
Fixes: #9138
2018-05-31 15:41:59 +02:00
|
|
|
/* The client is no longer watching the remaining processes, so let's step in here, under the assumption that
|
|
|
|
* the remaining processes will be sooner or later reassigned to us as parent. */
|
|
|
|
(void) unit_enqueue_rewatch_pids(UNIT(s));
|
2013-07-01 00:03:57 +02:00
|
|
|
|
2014-02-06 17:17:51 +01:00
|
|
|
return 0;
|
2013-07-01 00:03:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
_pure_ static UnitActiveState scope_active_state(Unit *u) {
|
|
|
|
assert(u);
|
|
|
|
|
|
|
|
return state_translation_table[SCOPE(u)->state];
|
|
|
|
}
|
|
|
|
|
|
|
|
_pure_ static const char *scope_sub_state_to_string(Unit *u) {
|
|
|
|
assert(u);
|
|
|
|
|
|
|
|
return scope_state_to_string(SCOPE(u)->state);
|
|
|
|
}
|
|
|
|
|
core: enumerate perpetual units in a separate per-unit-type method
Previously the enumerate() callback defined for each unit type would do
two things:
1. It would create perpetual units (i.e. -.slice, system.slice, -.mount and
init.scope)
2. It would enumerate units from /proc/self/mountinfo, /proc/swaps and
the udev database
With this change these two parts are split into two seperate methods:
enumerate() now only does #2, while enumerate_perpetual() is responsible
for #1. Why make this change? Well, perpetual units should have a
slightly different effect that those found through enumeration: as
perpetual units should be up unconditionally, perpetually and thus never
change state, they should also not pull in deps by their state changing,
not even when the state is first set to active. Thus, their state is
generally initialized through the per-device coldplug() method in
similar fashion to the deserialized state from a previous run would be
put into place. OTOH units found through regular enumeration should
result in state changes (and thus pull in deps due to state changes),
hence their state should be put in effect in the catchup() method
instead. Hence, given this difference, let's also separate the
functions, so that the rule is:
1. What is created in enumerate_perpetual() should be started in
coldplug()
2. What is created in enumerate() should be started in catchup().
2018-06-05 18:26:45 +02:00
|
|
|
static void scope_enumerate_perpetual(Manager *m) {
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
Unit *u;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(m);
|
|
|
|
|
|
|
|
/* Let's unconditionally add the "init.scope" special unit
|
|
|
|
* that encapsulates PID 1. Note that PID 1 already is in the
|
|
|
|
* cgroup for this, we hence just need to allocate the object
|
|
|
|
* for it and that's it. */
|
|
|
|
|
|
|
|
u = manager_get_unit(m, SPECIAL_INIT_SCOPE);
|
|
|
|
if (!u) {
|
2016-10-25 00:29:05 +02:00
|
|
|
r = unit_new_for_name(m, sizeof(Scope), SPECIAL_INIT_SCOPE, &u);
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (r < 0) {
|
2016-10-25 00:29:05 +02:00
|
|
|
log_error_errno(r, "Failed to allocate the special " SPECIAL_INIT_SCOPE " unit: %m");
|
2015-11-10 20:36:37 +01:00
|
|
|
return;
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
u->transient = true;
|
2016-10-24 21:41:54 +02:00
|
|
|
u->perpetual = true;
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
SCOPE(u)->deserialized_state = SCOPE_RUNNING;
|
|
|
|
|
|
|
|
unit_add_to_load_queue(u);
|
|
|
|
unit_add_to_dbus_queue(u);
|
|
|
|
}
|
|
|
|
|
2013-07-01 00:03:57 +02:00
|
|
|
static const char* const scope_result_table[_SCOPE_RESULT_MAX] = {
|
|
|
|
[SCOPE_SUCCESS] = "success",
|
|
|
|
[SCOPE_FAILURE_RESOURCES] = "resources",
|
|
|
|
[SCOPE_FAILURE_TIMEOUT] = "timeout",
|
|
|
|
};
|
|
|
|
|
|
|
|
DEFINE_STRING_TABLE_LOOKUP(scope_result, ScopeResult);
|
|
|
|
|
|
|
|
const UnitVTable scope_vtable = {
|
|
|
|
.object_size = sizeof(Scope),
|
2013-11-19 21:12:59 +01:00
|
|
|
.cgroup_context_offset = offsetof(Scope, cgroup_context),
|
|
|
|
.kill_context_offset = offsetof(Scope, kill_context),
|
|
|
|
|
2013-07-01 00:03:57 +02:00
|
|
|
.sections =
|
|
|
|
"Unit\0"
|
|
|
|
"Scope\0"
|
|
|
|
"Install\0",
|
|
|
|
.private_section = "Scope",
|
|
|
|
|
2015-11-13 18:46:12 +01:00
|
|
|
.can_transient = true,
|
2018-02-06 11:57:35 +01:00
|
|
|
.can_delegate = true,
|
2018-04-27 20:35:10 +02:00
|
|
|
.once_only = true,
|
2013-07-01 00:03:57 +02:00
|
|
|
|
|
|
|
.init = scope_init,
|
|
|
|
.load = scope_load,
|
|
|
|
.done = scope_done,
|
|
|
|
|
|
|
|
.coldplug = scope_coldplug,
|
|
|
|
|
|
|
|
.dump = scope_dump,
|
|
|
|
|
|
|
|
.start = scope_start,
|
|
|
|
.stop = scope_stop,
|
|
|
|
|
|
|
|
.kill = scope_kill,
|
|
|
|
|
2014-01-27 06:57:34 +01:00
|
|
|
.get_timeout = scope_get_timeout,
|
|
|
|
|
2013-07-01 00:03:57 +02:00
|
|
|
.serialize = scope_serialize,
|
|
|
|
.deserialize_item = scope_deserialize_item,
|
|
|
|
|
|
|
|
.active_state = scope_active_state,
|
|
|
|
.sub_state_to_string = scope_sub_state_to_string,
|
|
|
|
|
2014-02-06 17:17:51 +01:00
|
|
|
.sigchld_event = scope_sigchld_event,
|
|
|
|
|
2013-07-02 01:35:35 +02:00
|
|
|
.reset_failed = scope_reset_failed,
|
|
|
|
|
2013-07-01 00:03:57 +02:00
|
|
|
.notify_cgroup_empty = scope_notify_cgroup_empty_event,
|
|
|
|
|
2013-11-19 21:12:59 +01:00
|
|
|
.bus_vtable = bus_scope_vtable,
|
2013-07-01 00:03:57 +02:00
|
|
|
.bus_set_property = bus_scope_set_property,
|
|
|
|
.bus_commit_properties = bus_scope_commit_properties,
|
|
|
|
|
core: enumerate perpetual units in a separate per-unit-type method
Previously the enumerate() callback defined for each unit type would do
two things:
1. It would create perpetual units (i.e. -.slice, system.slice, -.mount and
init.scope)
2. It would enumerate units from /proc/self/mountinfo, /proc/swaps and
the udev database
With this change these two parts are split into two seperate methods:
enumerate() now only does #2, while enumerate_perpetual() is responsible
for #1. Why make this change? Well, perpetual units should have a
slightly different effect that those found through enumeration: as
perpetual units should be up unconditionally, perpetually and thus never
change state, they should also not pull in deps by their state changing,
not even when the state is first set to active. Thus, their state is
generally initialized through the per-device coldplug() method in
similar fashion to the deserialized state from a previous run would be
put into place. OTOH units found through regular enumeration should
result in state changes (and thus pull in deps due to state changes),
hence their state should be put in effect in the catchup() method
instead. Hence, given this difference, let's also separate the
functions, so that the rule is:
1. What is created in enumerate_perpetual() should be started in
coldplug()
2. What is created in enumerate() should be started in catchup().
2018-06-05 18:26:45 +02:00
|
|
|
.enumerate_perpetual = scope_enumerate_perpetual,
|
2013-07-01 00:03:57 +02:00
|
|
|
};
|