2017-11-18 17:09:20 +01:00
|
|
|
/* SPDX-License-Identifier: LGPL-2.1+ */
|
2012-07-18 19:07:51 +02:00
|
|
|
#pragma once
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2015-02-10 12:56:53 +01:00
|
|
|
#include <stdbool.h>
|
|
|
|
|
2016-11-11 19:59:19 +01:00
|
|
|
#include "cgroup-util.h"
|
|
|
|
#include "ip-address-access.h"
|
2013-06-27 04:14:27 +02:00
|
|
|
#include "list.h"
|
2015-02-10 12:56:53 +01:00
|
|
|
#include "time-util.h"
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
typedef struct CGroupContext CGroupContext;
|
|
|
|
typedef struct CGroupDeviceAllow CGroupDeviceAllow;
|
2016-05-05 22:42:55 +02:00
|
|
|
typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
|
|
|
|
typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
|
2018-06-13 23:16:35 +02:00
|
|
|
typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
|
2013-06-27 04:14:27 +02:00
|
|
|
typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
|
|
|
|
typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
/* Device access policy: decides which devices are permitted, relative to the
 * set of explicitly listed devices and the built-in ones. */
typedef enum CGroupDevicePolicy {
        /* When devices are listed, allow those plus the built-in ones;
         * if none are listed, allow everything. */
        CGROUP_AUTO,

        /* Everything forbidden, except built-in ones and listed ones. */
        CGROUP_CLOSED,

        /* Everything forbidden, except for the listed devices. */
        CGROUP_STRICT,

        /* Number of valid policies (one past the last valid value). */
        _CGROUP_DEVICE_POLICY_MAX,

        /* Sentinel outside the valid range. */
        _CGROUP_DEVICE_POLICY_INVALID = -1
} CGroupDevicePolicy;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
struct CGroupDeviceAllow {
|
|
|
|
LIST_FIELDS(CGroupDeviceAllow, device_allow);
|
|
|
|
char *path;
|
|
|
|
bool r:1;
|
|
|
|
bool w:1;
|
|
|
|
bool m:1;
|
|
|
|
};
|
2010-06-21 23:27:18 +02:00
|
|
|
|
2016-05-05 22:42:55 +02:00
|
|
|
struct CGroupIODeviceWeight {
|
|
|
|
LIST_FIELDS(CGroupIODeviceWeight, device_weights);
|
|
|
|
char *path;
|
|
|
|
uint64_t weight;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct CGroupIODeviceLimit {
|
|
|
|
LIST_FIELDS(CGroupIODeviceLimit, device_limits);
|
|
|
|
char *path;
|
2016-05-18 22:50:56 +02:00
|
|
|
uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
|
2016-05-05 22:42:55 +02:00
|
|
|
};
|
|
|
|
|
2018-06-13 23:16:35 +02:00
|
|
|
struct CGroupIODeviceLatency {
|
|
|
|
LIST_FIELDS(CGroupIODeviceLatency, device_latencies);
|
|
|
|
char *path;
|
|
|
|
usec_t target_usec;
|
|
|
|
};
|
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
struct CGroupBlockIODeviceWeight {
|
|
|
|
LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights);
|
|
|
|
char *path;
|
2015-09-11 16:48:24 +02:00
|
|
|
uint64_t weight;
|
2010-03-31 16:29:55 +02:00
|
|
|
};
|
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
struct CGroupBlockIODeviceBandwidth {
|
|
|
|
LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths);
|
|
|
|
char *path;
|
2016-05-18 22:51:46 +02:00
|
|
|
uint64_t rbps;
|
|
|
|
uint64_t wbps;
|
2013-06-27 04:14:27 +02:00
|
|
|
};
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
struct CGroupContext {
|
|
|
|
bool cpu_accounting;
|
2016-05-05 22:42:55 +02:00
|
|
|
bool io_accounting;
|
2013-06-27 04:14:27 +02:00
|
|
|
bool blockio_accounting;
|
|
|
|
bool memory_accounting;
|
2015-09-10 12:32:16 +02:00
|
|
|
bool tasks_accounting;
|
2016-11-11 19:59:19 +01:00
|
|
|
bool ip_accounting;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2019-03-19 19:05:19 +01:00
|
|
|
/* Configures the memory.oom.group attribute (on unified) */
|
|
|
|
bool memory_oom_group;
|
|
|
|
|
2019-03-26 20:11:30 +01:00
|
|
|
bool delegate;
|
|
|
|
CGroupMask delegate_controllers;
|
|
|
|
CGroupMask disable_controllers;
|
|
|
|
|
2016-05-05 22:42:55 +02:00
|
|
|
/* For unified hierarchy */
|
2016-08-07 15:45:39 +02:00
|
|
|
uint64_t cpu_weight;
|
|
|
|
uint64_t startup_cpu_weight;
|
|
|
|
usec_t cpu_quota_per_sec_usec;
|
2018-11-02 17:21:57 +01:00
|
|
|
usec_t cpu_quota_period_usec;
|
2016-08-07 15:45:39 +02:00
|
|
|
|
2016-05-05 22:42:55 +02:00
|
|
|
uint64_t io_weight;
|
|
|
|
uint64_t startup_io_weight;
|
|
|
|
LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
|
|
|
|
LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
|
2018-06-13 23:16:35 +02:00
|
|
|
LIST_HEAD(CGroupIODeviceLatency, io_device_latencies);
|
2016-05-05 22:42:55 +02:00
|
|
|
|
2019-04-16 19:44:05 +02:00
|
|
|
uint64_t default_memory_min;
|
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow
In cgroup v2 we have protection tunables -- currently MemoryLow and
MemoryMin (there will be more in future for other resources, too). The
design of these protection tunables requires not only intermediate
cgroups to propagate protections, but also the units at the leaf of that
resource's operation to accept it (by setting MemoryLow or MemoryMin).
This makes sense from an low-level API design perspective, but it's a
good idea to also have a higher-level abstraction that can, by default,
propagate these resources to children recursively. In this patch, this
happens by having descendants set memory.low to N if their ancestor has
DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow
value.
Any affected unit can opt out of this propagation by manually setting
`MemoryLow` to some value in its unit configuration. A unit can also
stop further propagation by setting `DefaultMemoryLow=` with no
argument. This removes further propagation in the subtree, but has no
effect on the unit itself (for that, use `MemoryLow=0`).
Our use case in production is simplifying the configuration of machines
which heavily rely on memory protection tunables, but currently require
tweaking a huge number of unit files to make that a reality. This
directive makes that significantly less fragile, and decreases the risk
of misconfiguration.
After this patch is merged, I will implement DefaultMemoryMin= using the
same principles.
2019-03-28 13:50:50 +01:00
|
|
|
uint64_t default_memory_low;
|
2018-06-09 02:33:14 +02:00
|
|
|
uint64_t memory_min;
|
2016-05-27 18:10:18 +02:00
|
|
|
uint64_t memory_low;
|
|
|
|
uint64_t memory_high;
|
|
|
|
uint64_t memory_max;
|
2016-07-04 09:03:54 +02:00
|
|
|
uint64_t memory_swap_max;
|
2016-05-27 18:10:18 +02:00
|
|
|
|
2019-04-16 19:44:05 +02:00
|
|
|
bool default_memory_min_set;
|
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow
In cgroup v2 we have protection tunables -- currently MemoryLow and
MemoryMin (there will be more in future for other resources, too). The
design of these protection tunables requires not only intermediate
cgroups to propagate protections, but also the units at the leaf of that
resource's operation to accept it (by setting MemoryLow or MemoryMin).
This makes sense from an low-level API design perspective, but it's a
good idea to also have a higher-level abstraction that can, by default,
propagate these resources to children recursively. In this patch, this
happens by having descendants set memory.low to N if their ancestor has
DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow
value.
Any affected unit can opt out of this propagation by manually setting
`MemoryLow` to some value in its unit configuration. A unit can also
stop further propagation by setting `DefaultMemoryLow=` with no
argument. This removes further propagation in the subtree, but has no
effect on the unit itself (for that, use `MemoryLow=0`).
Our use case in production is simplifying the configuration of machines
which heavily rely on memory protection tunables, but currently require
tweaking a huge number of unit files to make that a reality. This
directive makes that significantly less fragile, and decreases the risk
of misconfiguration.
After this patch is merged, I will implement DefaultMemoryMin= using the
same principles.
2019-03-28 13:50:50 +01:00
|
|
|
bool default_memory_low_set;
|
2019-04-16 19:44:05 +02:00
|
|
|
bool memory_min_set;
|
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow
In cgroup v2 we have protection tunables -- currently MemoryLow and
MemoryMin (there will be more in future for other resources, too). The
design of these protection tunables requires not only intermediate
cgroups to propagate protections, but also the units at the leaf of that
resource's operation to accept it (by setting MemoryLow or MemoryMin).
This makes sense from an low-level API design perspective, but it's a
good idea to also have a higher-level abstraction that can, by default,
propagate these resources to children recursively. In this patch, this
happens by having descendants set memory.low to N if their ancestor has
DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow
value.
Any affected unit can opt out of this propagation by manually setting
`MemoryLow` to some value in its unit configuration. A unit can also
stop further propagation by setting `DefaultMemoryLow=` with no
argument. This removes further propagation in the subtree, but has no
effect on the unit itself (for that, use `MemoryLow=0`).
Our use case in production is simplifying the configuration of machines
which heavily rely on memory protection tunables, but currently require
tweaking a huge number of unit files to make that a reality. This
directive makes that significantly less fragile, and decreases the risk
of misconfiguration.
After this patch is merged, I will implement DefaultMemoryMin= using the
same principles.
2019-03-28 13:50:50 +01:00
|
|
|
bool memory_low_set;
|
|
|
|
|
2016-11-11 19:59:19 +01:00
|
|
|
LIST_HEAD(IPAddressAccessItem, ip_address_allow);
|
|
|
|
LIST_HEAD(IPAddressAccessItem, ip_address_deny);
|
|
|
|
|
2019-04-23 12:14:20 +02:00
|
|
|
char **ip_filters_ingress;
|
|
|
|
char **ip_filters_egress;
|
|
|
|
|
2016-05-05 22:42:55 +02:00
|
|
|
/* For legacy hierarchies */
|
2015-09-11 16:48:24 +02:00
|
|
|
uint64_t cpu_shares;
|
|
|
|
uint64_t startup_cpu_shares;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2015-09-11 16:48:24 +02:00
|
|
|
uint64_t blockio_weight;
|
|
|
|
uint64_t startup_blockio_weight;
|
2013-06-27 04:14:27 +02:00
|
|
|
LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights);
|
|
|
|
LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths);
|
2013-01-12 04:24:12 +01:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
uint64_t memory_limit;
|
2011-06-30 00:11:25 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
CGroupDevicePolicy device_policy;
|
|
|
|
LIST_HEAD(CGroupDeviceAllow, device_allow);
|
2014-11-05 17:57:23 +01:00
|
|
|
|
2016-05-05 22:42:55 +02:00
|
|
|
/* Common */
|
2015-09-10 12:32:16 +02:00
|
|
|
uint64_t tasks_max;
|
2013-06-27 04:14:27 +02:00
|
|
|
};
|
2011-06-30 00:11:25 +02:00
|
|
|
|
2017-09-05 19:27:53 +02:00
|
|
|
/* Used when querying IP accounting data: selects which counter is requested
 * (bytes or packets, for the ingress or egress direction). */
typedef enum CGroupIPAccountingMetric {
        CGROUP_IP_INGRESS_BYTES,
        CGROUP_IP_INGRESS_PACKETS,
        CGROUP_IP_EGRESS_BYTES,
        CGROUP_IP_EGRESS_PACKETS,

        /* Number of valid metrics (one past the last valid value). */
        _CGROUP_IP_ACCOUNTING_METRIC_MAX,

        /* Sentinel outside the valid range. */
        _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -1,
} CGroupIPAccountingMetric;
|
|
|
|
|
2019-03-22 12:16:03 +01:00
|
|
|
/* Used when querying IO accounting data: selects which counter is requested
 * (bytes or operations, for reads or writes). */
typedef enum CGroupIOAccountingMetric {
        CGROUP_IO_READ_BYTES,
        CGROUP_IO_WRITE_BYTES,
        CGROUP_IO_READ_OPERATIONS,
        CGROUP_IO_WRITE_OPERATIONS,

        /* Number of valid metrics (one past the last valid value). */
        _CGROUP_IO_ACCOUNTING_METRIC_MAX,

        /* Sentinel outside the valid range. */
        _CGROUP_IO_ACCOUNTING_METRIC_INVALID = -1,
} CGroupIOAccountingMetric;
|
|
|
|
|
2018-05-15 20:12:22 +02:00
|
|
|
typedef struct Unit Unit;
|
|
|
|
typedef struct Manager Manager;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2018-11-02 17:21:57 +01:00
|
|
|
usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);
|
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
void cgroup_context_init(CGroupContext *c);
|
|
|
|
void cgroup_context_done(CGroupContext *c);
|
|
|
|
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix);
|
2014-02-14 19:11:07 +01:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
|
2016-05-05 22:42:55 +02:00
|
|
|
void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w);
|
|
|
|
void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l);
|
2018-06-13 23:16:35 +02:00
|
|
|
void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l);
|
2013-06-27 04:14:27 +02:00
|
|
|
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
|
|
|
|
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2018-08-06 06:42:14 +02:00
|
|
|
int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode);
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
CGroupMask unit_get_own_mask(Unit *u);
|
2017-11-09 15:29:34 +01:00
|
|
|
CGroupMask unit_get_delegate_mask(Unit *u);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
CGroupMask unit_get_members_mask(Unit *u);
|
2017-11-09 15:29:34 +01:00
|
|
|
CGroupMask unit_get_siblings_mask(Unit *u);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
CGroupMask unit_get_subtree_mask(Unit *u);
|
cgroup: Add DisableControllers= directive to disable controller in subtree
Some controllers (like the CPU controller) have a performance cost that
is non-trivial on certain workloads. While this can be mitigated and
improved to an extent, there will for some controllers always be some
overheads associated with the benefits gained from the controller.
Inside Facebook, the fix applied has been to disable the CPU controller
forcibly with `cgroup_disable=cpu` on the kernel command line.
This presents a problem: to disable or reenable the controller, a reboot
is required, but this is quite cumbersome and slow to do for many
thousands of machines, especially machines where disabling/enabling a
stateful service on a machine is a matter of several minutes.
Currently systemd provides some configuration knobs for these in the
form of `[Default]CPUAccounting`, `[Default]MemoryAccounting`, and the
like. The limitation of these is that Default*Accounting is overrideable
by individual services, of which any one could decide to reenable a
controller within the hierarchy at any point just by using a controller
feature implicitly (eg. `CPUWeight`), even if the use of that CPU
feature could just be opportunistic. Since many services are provided by
the distribution, or by upstream teams at a particular organisation,
it's not a sustainable solution to simply try to find and remove
offending directives from these units.
This commit presents a more direct solution -- a DisableControllers=
directive that forcibly disallows a controller from being enabled within
a subtree.
2018-12-03 15:38:06 +01:00
|
|
|
CGroupMask unit_get_disable_mask(Unit *u);
|
|
|
|
CGroupMask unit_get_ancestor_disable_mask(Unit *u);
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
CGroupMask unit_get_target_mask(Unit *u);
|
|
|
|
CGroupMask unit_get_enable_mask(Unit *u);
|
2014-02-14 19:11:07 +01:00
|
|
|
|
cgroup: drastically simplify caching of cgroups members mask
Previously we tried to be smart: when a new unit appeared and it only
added controllers to the cgroup mask we'd update the cached members mask
in all parents by ORing in the controller flags in their cached values.
Unfortunately this was quite broken, as we missed some conditions when
this cache had to be reset (for example, when a unit got unloaded),
moreover the optimization doesn't work when a controller is removed
anyway (as in that case there's no other way for the parent to iterate
through all children if any other, remaining child unit still needs it).
Hence, let's simplify the logic substantially: instead of updating the
cache on the right events (which we didn't get right), let's simply
invalidate the cache, and generate it lazily when we encounter it later.
This should actually result in better behaviour as we don't have to
calculate the new members mask for a whole subtree whenever we have the
suspicion something changed, but can delay it to the point where we
actually need the members mask.
This allows us to simplify things quite a bit, which is good, since
validating this cache for correctness is hard enough.
Fixes: #9512
2018-11-23 01:07:34 +01:00
|
|
|
void unit_invalidate_cgroup_members_masks(Unit *u);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
cgroup: be more careful with which controllers we can enable/disable on a cgroup
This changes cg_enable_everywhere() to return which controllers are
enabled for the specified cgroup. This information is then used to
correctly track the enablement mask currently in effect for a unit.
Moreover, when we try to turn off a controller, and this works, then
this indicates that the parent unit might successfully turn it off
now, too as our unit might have kept it busy.
So far, when realizing cgroups, i.e. when syncing up the kernel
representation of relevant cgroups with our own idea we would strictly
work from the root to the leaves. This is generally a good approach, as
when controllers are enabled this has to happen in root-to-leaves order.
However, when controllers are disabled this has to happen in the
opposite order: in leaves-to-root order (this is because controllers can
only be enabled in a child if it is already enabled in the parent, and
if it shall be disabled in the parent then it has to be disabled in the
child first, otherwise it is considered busy when it is attempted to
remove it in the parent).
To make things complicated when invalidating a unit's cgroup membership
systemd can actually turn off some controllers previously turned on at
the very same time as it turns on other controllers previously turned
off. In such a case we have to work up leaves-to-root *and*
root-to-leaves right after each other. With this patch this is
implemented: we still generally operate root-to-leaves, but as soon as
we noticed we successfully turned off a controller previously turned on
for a cgroup we'll re-enqueue the cgroup realization for all parents of
a unit, thus implementing leaves-to-root where necessary.
2018-11-22 21:45:33 +01:00
|
|
|
void unit_add_to_cgroup_realize_queue(Unit *u);
|
|
|
|
|
2018-02-07 22:52:52 +01:00
|
|
|
const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
|
2018-12-12 16:45:33 +01:00
|
|
|
char *unit_default_cgroup_path(const Unit *u);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
int unit_set_cgroup_path(Unit *u, const char *path);
|
2017-11-24 22:02:22 +01:00
|
|
|
int unit_pick_cgroup_path(Unit *u);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2013-06-30 23:55:36 +02:00
|
|
|
int unit_realize_cgroup(Unit *u);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
void unit_release_cgroup(Unit *u);
|
|
|
|
void unit_prune_cgroup(Unit *u);
|
|
|
|
int unit_watch_cgroup(Unit *u);
|
2019-03-19 19:05:19 +01:00
|
|
|
int unit_watch_cgroup_memory(Unit *u);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2017-09-26 22:43:08 +02:00
|
|
|
void unit_add_to_cgroup_empty_queue(Unit *u);
|
2019-05-19 15:52:02 +02:00
|
|
|
int unit_check_oom(Unit *u);
|
2017-09-26 22:43:08 +02:00
|
|
|
|
2018-02-07 22:52:52 +01:00
|
|
|
int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
int manager_setup_cgroup(Manager *m);
|
|
|
|
void manager_shutdown_cgroup(Manager *m, bool delete);
|
2010-04-18 03:04:54 +02:00
|
|
|
|
2017-09-26 22:15:02 +02:00
|
|
|
unsigned manager_dispatch_cgroup_realize_queue(Manager *m);
|
2010-10-27 03:16:49 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
|
2015-09-03 14:57:44 +02:00
|
|
|
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid);
|
2013-06-27 04:14:27 +02:00
|
|
|
Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2019-04-16 19:44:05 +02:00
|
|
|
uint64_t unit_get_ancestor_memory_min(Unit *u);
|
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow
In cgroup v2 we have protection tunables -- currently MemoryLow and
MemoryMin (there will be more in future for other resources, too). The
design of these protection tunables requires not only intermediate
cgroups to propagate protections, but also the units at the leaf of that
resource's operation to accept it (by setting MemoryLow or MemoryMin).
This makes sense from a low-level API design perspective, but it's a
good idea to also have a higher-level abstraction that can, by default,
propagate these resources to children recursively. In this patch, this
happens by having descendants set memory.low to N if their ancestor has
DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow
value.
Any affected unit can opt out of this propagation by manually setting
`MemoryLow` to some value in its unit configuration. A unit can also
stop further propagation by setting `DefaultMemoryLow=` with no
argument. This removes further propagation in the subtree, but has no
effect on the unit itself (for that, use `MemoryLow=0`).
Our use case in production is simplifying the configuration of machines
which heavily rely on memory protection tunables, but currently require
tweaking a huge number of unit files to make that a reality. This
directive makes that significantly less fragile, and decreases the risk
of misconfiguration.
After this patch is merged, I will implement DefaultMemoryMin= using the
same principles.
2019-03-28 13:50:50 +01:00
|
|
|
uint64_t unit_get_ancestor_memory_low(Unit *u);
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
int unit_search_main_pid(Unit *u, pid_t *ret);
|
|
|
|
int unit_watch_all_pids(Unit *u);
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2018-01-12 13:06:48 +01:00
|
|
|
int unit_synthesize_cgroup_empty_event(Unit *u);
|
|
|
|
|
2015-03-01 16:24:19 +01:00
|
|
|
int unit_get_memory_current(Unit *u, uint64_t *ret);
|
2015-09-10 12:32:16 +02:00
|
|
|
int unit_get_tasks_current(Unit *u, uint64_t *ret);
|
2015-03-01 16:24:19 +01:00
|
|
|
int unit_get_cpu_usage(Unit *u, nsec_t *ret);
|
2019-03-22 12:16:03 +01:00
|
|
|
int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret);
|
2017-09-05 19:27:53 +02:00
|
|
|
int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret);
|
|
|
|
|
|
|
|
int unit_reset_cpu_accounting(Unit *u);
|
|
|
|
int unit_reset_ip_accounting(Unit *u);
|
2019-03-22 12:16:03 +01:00
|
|
|
int unit_reset_io_accounting(Unit *u);
|
2019-03-22 11:25:49 +01:00
|
|
|
int unit_reset_accounting(Unit *u);
|
2015-03-01 16:24:19 +01:00
|
|
|
|
2017-09-27 17:54:06 +02:00
|
|
|
/* Evaluates to the member `name` of the CGroupContext returned by
 * unit_get_cgroup_context(u), or to false when that call returns NULL.
 * Implemented as a ({ ... }) statement expression (a GNU C extension), so `u`
 * is expanded exactly once. The temporary is called `cc`; an argument
 * expression for `u` that itself mentions a variable named `cc` would refer
 * to this temporary rather than to the caller's variable. */
#define UNIT_CGROUP_BOOL(u, name) \
|
|
|
|
({ \
|
|
|
|
CGroupContext *cc = unit_get_cgroup_context(u); \
|
|
|
|
cc ? cc->name : false; \
|
|
|
|
})
|
2015-09-01 17:25:59 +02:00
|
|
|
|
2018-11-20 22:42:16 +01:00
|
|
|
bool manager_owns_host_root_cgroup(Manager *m);
|
|
|
|
bool unit_has_host_root_cgroup(Unit *u);
|
2018-01-17 18:41:42 +01:00
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
int manager_notify_cgroup_empty(Manager *m, const char *group);
|
|
|
|
|
2015-09-11 18:21:53 +02:00
|
|
|
void unit_invalidate_cgroup(Unit *u, CGroupMask m);
|
2017-09-05 19:27:53 +02:00
|
|
|
void unit_invalidate_cgroup_bpf(Unit *u);
|
2015-09-11 18:21:53 +02:00
|
|
|
|
|
|
|
void manager_invalidate_startup_units(Manager *m);
|
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
|
|
|
|
CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;
|
2018-02-06 11:57:35 +01:00
|
|
|
|
|
|
|
bool unit_cgroup_delegate(Unit *u);
|
2019-05-22 12:12:17 +02:00
|
|
|
|
|
|
|
int compare_job_priority(const void *a, const void *b);
|