Systemd/src/core/cgroup.h

293 lines
9.3 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: LGPL-2.1+ */
#pragma once
2010-03-31 16:29:55 +02:00
#include <stdbool.h>
#include "cgroup-util.h"
#include "cpu-set-util.h"
#include "ip-address-access.h"
#include "list.h"
#include "time-util.h"
2010-03-31 16:29:55 +02:00
typedef struct TasksMax {
/* If scale == 0, just use value; otherwise, value / scale.
* See tasks_max_resolve(). */
uint64_t value;
uint64_t scale;
} TasksMax;
#define TASKS_MAX_UNSET ((TasksMax) { .value = UINT64_MAX, .scale = 0 })
static inline bool tasks_max_isset(const TasksMax *tasks_max) {
return tasks_max->value != UINT64_MAX || tasks_max->scale != 0;
}
uint64_t tasks_max_resolve(const TasksMax *tasks_max);
typedef struct CGroupContext CGroupContext;
typedef struct CGroupDeviceAllow CGroupDeviceAllow;
core: add io controller support on the unified hierarchy On the unified hierarchy, blkio controller is renamed to io and the interface is changed significantly. * blkio.weight and blkio.weight_device are consolidated into io.weight which uses the standardized weight range [1, 10000] with 100 as the default value. * blkio.throttle.{read|write}_{bps|iops}_device are consolidated into io.max. Expansion of throttling features is being worked on to support work-conserving absolute limits (io.low and io.high). * All stats are consolidated into io.stats. This patchset adds support for the new interface. As the interface has been revamped and new features are expected to be added, it seems best to treat it as a separate controller rather than trying to expand the blkio settings although we might add automatic translation if only blkio settings are specified. * io.weight handling is mostly identical to blkio.weight[_device] handling except that the weight range is different. * Both read and write bandwidth settings are consolidated into CGroupIODeviceLimit which describes all limits applicable to the device. This makes it less painful to add new limits. * "max" can be used to specify the maximum limit which is equivalent to no config for max limits and treated as such. If a given CGroupIODeviceLimit doesn't contain any non-default configs, the config struct is discarded once the no limit config is applied to cgroup. * lookup_blkio_device() is renamed to lookup_block_device(). Signed-off-by: Tejun Heo <htejun@fb.com>
2016-05-05 22:42:55 +02:00
typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
typedef struct CGroupIODeviceLimit CGroupIODeviceLimit;
typedef struct CGroupIODeviceLatency CGroupIODeviceLatency;
typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight;
typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth;
2010-03-31 16:29:55 +02:00
typedef enum CGroupDevicePolicy {
/* When devices listed, will allow those, plus built-in ones, if none are listed will allow
* everything. */
CGROUP_DEVICE_POLICY_AUTO,
2010-03-31 16:29:55 +02:00
/* Everything forbidden, except built-in ones and listed ones. */
CGROUP_DEVICE_POLICY_CLOSED,
2019-04-27 02:22:40 +02:00
/* Everything forbidden, except for the listed devices */
CGROUP_DEVICE_POLICY_STRICT,
_CGROUP_DEVICE_POLICY_MAX,
_CGROUP_DEVICE_POLICY_INVALID = -1
} CGroupDevicePolicy;
2010-03-31 16:29:55 +02:00
typedef enum FreezerAction {
FREEZER_FREEZE,
FREEZER_THAW,
_FREEZER_ACTION_MAX,
_FREEZER_ACTION_INVALID = -1,
} FreezerAction;
struct CGroupDeviceAllow {
LIST_FIELDS(CGroupDeviceAllow, device_allow);
char *path;
bool r:1;
bool w:1;
bool m:1;
};
core: add io controller support on the unified hierarchy On the unified hierarchy, blkio controller is renamed to io and the interface is changed significantly. * blkio.weight and blkio.weight_device are consolidated into io.weight which uses the standardized weight range [1, 10000] with 100 as the default value. * blkio.throttle.{read|write}_{bps|iops}_device are consolidated into io.max. Expansion of throttling features is being worked on to support work-conserving absolute limits (io.low and io.high). * All stats are consolidated into io.stats. This patchset adds support for the new interface. As the interface has been revamped and new features are expected to be added, it seems best to treat it as a separate controller rather than trying to expand the blkio settings although we might add automatic translation if only blkio settings are specified. * io.weight handling is mostly identical to blkio.weight[_device] handling except that the weight range is different. * Both read and write bandwidth settings are consolidated into CGroupIODeviceLimit which describes all limits applicable to the device. This makes it less painful to add new limits. * "max" can be used to specify the maximum limit which is equivalent to no config for max limits and treated as such. If a given CGroupIODeviceLimit doesn't contain any non-default configs, the config struct is discarded once the no limit config is applied to cgroup. * lookup_blkio_device() is renamed to lookup_block_device(). Signed-off-by: Tejun Heo <htejun@fb.com>
2016-05-05 22:42:55 +02:00
struct CGroupIODeviceWeight {
LIST_FIELDS(CGroupIODeviceWeight, device_weights);
char *path;
uint64_t weight;
};
struct CGroupIODeviceLimit {
LIST_FIELDS(CGroupIODeviceLimit, device_limits);
char *path;
uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
core: add io controller support on the unified hierarchy On the unified hierarchy, blkio controller is renamed to io and the interface is changed significantly. * blkio.weight and blkio.weight_device are consolidated into io.weight which uses the standardized weight range [1, 10000] with 100 as the default value. * blkio.throttle.{read|write}_{bps|iops}_device are consolidated into io.max. Expansion of throttling features is being worked on to support work-conserving absolute limits (io.low and io.high). * All stats are consolidated into io.stats. This patchset adds support for the new interface. As the interface has been revamped and new features are expected to be added, it seems best to treat it as a separate controller rather than trying to expand the blkio settings although we might add automatic translation if only blkio settings are specified. * io.weight handling is mostly identical to blkio.weight[_device] handling except that the weight range is different. * Both read and write bandwidth settings are consolidated into CGroupIODeviceLimit which describes all limits applicable to the device. This makes it less painful to add new limits. * "max" can be used to specify the maximum limit which is equivalent to no config for max limits and treated as such. If a given CGroupIODeviceLimit doesn't contain any non-default configs, the config struct is discarded once the no limit config is applied to cgroup. * lookup_blkio_device() is renamed to lookup_block_device(). Signed-off-by: Tejun Heo <htejun@fb.com>
2016-05-05 22:42:55 +02:00
};
struct CGroupIODeviceLatency {
LIST_FIELDS(CGroupIODeviceLatency, device_latencies);
char *path;
usec_t target_usec;
};
struct CGroupBlockIODeviceWeight {
LIST_FIELDS(CGroupBlockIODeviceWeight, device_weights);
char *path;
uint64_t weight;
2010-03-31 16:29:55 +02:00
};
struct CGroupBlockIODeviceBandwidth {
LIST_FIELDS(CGroupBlockIODeviceBandwidth, device_bandwidths);
char *path;
uint64_t rbps;
uint64_t wbps;
};
2010-03-31 16:29:55 +02:00
struct CGroupContext {
bool cpu_accounting;
core: add io controller support on the unified hierarchy On the unified hierarchy, blkio controller is renamed to io and the interface is changed significantly. * blkio.weight and blkio.weight_device are consolidated into io.weight which uses the standardized weight range [1, 10000] with 100 as the default value. * blkio.throttle.{read|write}_{bps|iops}_device are consolidated into io.max. Expansion of throttling features is being worked on to support work-conserving absolute limits (io.low and io.high). * All stats are consolidated into io.stats. This patchset adds support for the new interface. As the interface has been revamped and new features are expected to be added, it seems best to treat it as a separate controller rather than trying to expand the blkio settings although we might add automatic translation if only blkio settings are specified. * io.weight handling is mostly identical to blkio.weight[_device] handling except that the weight range is different. * Both read and write bandwidth settings are consolidated into CGroupIODeviceLimit which describes all limits applicable to the device. This makes it less painful to add new limits. * "max" can be used to specify the maximum limit which is equivalent to no config for max limits and treated as such. If a given CGroupIODeviceLimit doesn't contain any non-default configs, the config struct is discarded once the no limit config is applied to cgroup. * lookup_blkio_device() is renamed to lookup_block_device(). Signed-off-by: Tejun Heo <htejun@fb.com>
2016-05-05 22:42:55 +02:00
bool io_accounting;
bool blockio_accounting;
bool memory_accounting;
bool tasks_accounting;
bool ip_accounting;
2010-03-31 16:29:55 +02:00
/* Configures the memory.oom.group attribute (on unified) */
bool memory_oom_group;
bool delegate;
CGroupMask delegate_controllers;
CGroupMask disable_controllers;
core: add io controller support on the unified hierarchy On the unified hierarchy, blkio controller is renamed to io and the interface is changed significantly. * blkio.weight and blkio.weight_device are consolidated into io.weight which uses the standardized weight range [1, 10000] with 100 as the default value. * blkio.throttle.{read|write}_{bps|iops}_device are consolidated into io.max. Expansion of throttling features is being worked on to support work-conserving absolute limits (io.low and io.high). * All stats are consolidated into io.stats. This patchset adds support for the new interface. As the interface has been revamped and new features are expected to be added, it seems best to treat it as a separate controller rather than trying to expand the blkio settings although we might add automatic translation if only blkio settings are specified. * io.weight handling is mostly identical to blkio.weight[_device] handling except that the weight range is different. * Both read and write bandwidth settings are consolidated into CGroupIODeviceLimit which describes all limits applicable to the device. This makes it less painful to add new limits. * "max" can be used to specify the maximum limit which is equivalent to no config for max limits and treated as such. If a given CGroupIODeviceLimit doesn't contain any non-default configs, the config struct is discarded once the no limit config is applied to cgroup. * lookup_blkio_device() is renamed to lookup_block_device(). Signed-off-by: Tejun Heo <htejun@fb.com>
2016-05-05 22:42:55 +02:00
/* For unified hierarchy */
uint64_t cpu_weight;
uint64_t startup_cpu_weight;
usec_t cpu_quota_per_sec_usec;
usec_t cpu_quota_period_usec;
CPUSet cpuset_cpus;
CPUSet cpuset_mems;
core: add io controller support on the unified hierarchy On the unified hierarchy, blkio controller is renamed to io and the interface is changed significantly. * blkio.weight and blkio.weight_device are consolidated into io.weight which uses the standardized weight range [1, 10000] with 100 as the default value. * blkio.throttle.{read|write}_{bps|iops}_device are consolidated into io.max. Expansion of throttling features is being worked on to support work-conserving absolute limits (io.low and io.high). * All stats are consolidated into io.stats. This patchset adds support for the new interface. As the interface has been revamped and new features are expected to be added, it seems best to treat it as a separate controller rather than trying to expand the blkio settings although we might add automatic translation if only blkio settings are specified. * io.weight handling is mostly identical to blkio.weight[_device] handling except that the weight range is different. * Both read and write bandwidth settings are consolidated into CGroupIODeviceLimit which describes all limits applicable to the device. This makes it less painful to add new limits. * "max" can be used to specify the maximum limit which is equivalent to no config for max limits and treated as such. If a given CGroupIODeviceLimit doesn't contain any non-default configs, the config struct is discarded once the no limit config is applied to cgroup. * lookup_blkio_device() is renamed to lookup_block_device(). Signed-off-by: Tejun Heo <htejun@fb.com>
2016-05-05 22:42:55 +02:00
uint64_t io_weight;
uint64_t startup_io_weight;
LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
LIST_HEAD(CGroupIODeviceLatency, io_device_latencies);
core: add io controller support on the unified hierarchy On the unified hierarchy, blkio controller is renamed to io and the interface is changed significantly. * blkio.weight and blkio.weight_device are consolidated into io.weight which uses the standardized weight range [1, 10000] with 100 as the default value. * blkio.throttle.{read|write}_{bps|iops}_device are consolidated into io.max. Expansion of throttling features is being worked on to support work-conserving absolute limits (io.low and io.high). * All stats are consolidated into io.stats. This patchset adds support for the new interface. As the interface has been revamped and new features are expected to be added, it seems best to treat it as a separate controller rather than trying to expand the blkio settings although we might add automatic translation if only blkio settings are specified. * io.weight handling is mostly identical to blkio.weight[_device] handling except that the weight range is different. * Both read and write bandwidth settings are consolidated into CGroupIODeviceLimit which describes all limits applicable to the device. This makes it less painful to add new limits. * "max" can be used to specify the maximum limit which is equivalent to no config for max limits and treated as such. If a given CGroupIODeviceLimit doesn't contain any non-default configs, the config struct is discarded once the no limit config is applied to cgroup. * lookup_blkio_device() is renamed to lookup_block_device(). Signed-off-by: Tejun Heo <htejun@fb.com>
2016-05-05 22:42:55 +02:00
2019-04-16 19:44:05 +02:00
uint64_t default_memory_min;
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow In cgroup v2 we have protection tunables -- currently MemoryLow and MemoryMin (there will be more in future for other resources, too). The design of these protection tunables requires not only intermediate cgroups to propagate protections, but also the units at the leaf of that resource's operation to accept it (by setting MemoryLow or MemoryMin). This makes sense from an low-level API design perspective, but it's a good idea to also have a higher-level abstraction that can, by default, propagate these resources to children recursively. In this patch, this happens by having descendants set memory.low to N if their ancestor has DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow value. Any affected unit can opt out of this propagation by manually setting `MemoryLow` to some value in its unit configuration. A unit can also stop further propagation by setting `DefaultMemoryLow=` with no argument. This removes further propagation in the subtree, but has no effect on the unit itself (for that, use `MemoryLow=0`). Our use case in production is simplifying the configuration of machines which heavily rely on memory protection tunables, but currently require tweaking a huge number of unit files to make that a reality. This directive makes that significantly less fragile, and decreases the risk of misconfiguration. After this patch is merged, I will implement DefaultMemoryMin= using the same principles.
2019-03-28 13:50:50 +01:00
uint64_t default_memory_low;
uint64_t memory_min;
uint64_t memory_low;
uint64_t memory_high;
uint64_t memory_max;
uint64_t memory_swap_max;
2019-04-16 19:44:05 +02:00
bool default_memory_min_set;
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow In cgroup v2 we have protection tunables -- currently MemoryLow and MemoryMin (there will be more in future for other resources, too). The design of these protection tunables requires not only intermediate cgroups to propagate protections, but also the units at the leaf of that resource's operation to accept it (by setting MemoryLow or MemoryMin). This makes sense from an low-level API design perspective, but it's a good idea to also have a higher-level abstraction that can, by default, propagate these resources to children recursively. In this patch, this happens by having descendants set memory.low to N if their ancestor has DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow value. Any affected unit can opt out of this propagation by manually setting `MemoryLow` to some value in its unit configuration. A unit can also stop further propagation by setting `DefaultMemoryLow=` with no argument. This removes further propagation in the subtree, but has no effect on the unit itself (for that, use `MemoryLow=0`). Our use case in production is simplifying the configuration of machines which heavily rely on memory protection tunables, but currently require tweaking a huge number of unit files to make that a reality. This directive makes that significantly less fragile, and decreases the risk of misconfiguration. After this patch is merged, I will implement DefaultMemoryMin= using the same principles.
2019-03-28 13:50:50 +01:00
bool default_memory_low_set;
2019-04-16 19:44:05 +02:00
bool memory_min_set;
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow In cgroup v2 we have protection tunables -- currently MemoryLow and MemoryMin (there will be more in future for other resources, too). The design of these protection tunables requires not only intermediate cgroups to propagate protections, but also the units at the leaf of that resource's operation to accept it (by setting MemoryLow or MemoryMin). This makes sense from an low-level API design perspective, but it's a good idea to also have a higher-level abstraction that can, by default, propagate these resources to children recursively. In this patch, this happens by having descendants set memory.low to N if their ancestor has DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow value. Any affected unit can opt out of this propagation by manually setting `MemoryLow` to some value in its unit configuration. A unit can also stop further propagation by setting `DefaultMemoryLow=` with no argument. This removes further propagation in the subtree, but has no effect on the unit itself (for that, use `MemoryLow=0`). Our use case in production is simplifying the configuration of machines which heavily rely on memory protection tunables, but currently require tweaking a huge number of unit files to make that a reality. This directive makes that significantly less fragile, and decreases the risk of misconfiguration. After this patch is merged, I will implement DefaultMemoryMin= using the same principles.
2019-03-28 13:50:50 +01:00
bool memory_low_set;
LIST_HEAD(IPAddressAccessItem, ip_address_allow);
LIST_HEAD(IPAddressAccessItem, ip_address_deny);
char **ip_filters_ingress;
char **ip_filters_egress;
core: add io controller support on the unified hierarchy On the unified hierarchy, blkio controller is renamed to io and the interface is changed significantly. * blkio.weight and blkio.weight_device are consolidated into io.weight which uses the standardized weight range [1, 10000] with 100 as the default value. * blkio.throttle.{read|write}_{bps|iops}_device are consolidated into io.max. Expansion of throttling features is being worked on to support work-conserving absolute limits (io.low and io.high). * All stats are consolidated into io.stats. This patchset adds support for the new interface. As the interface has been revamped and new features are expected to be added, it seems best to treat it as a separate controller rather than trying to expand the blkio settings although we might add automatic translation if only blkio settings are specified. * io.weight handling is mostly identical to blkio.weight[_device] handling except that the weight range is different. * Both read and write bandwidth settings are consolidated into CGroupIODeviceLimit which describes all limits applicable to the device. This makes it less painful to add new limits. * "max" can be used to specify the maximum limit which is equivalent to no config for max limits and treated as such. If a given CGroupIODeviceLimit doesn't contain any non-default configs, the config struct is discarded once the no limit config is applied to cgroup. * lookup_blkio_device() is renamed to lookup_block_device(). Signed-off-by: Tejun Heo <htejun@fb.com>
2016-05-05 22:42:55 +02:00
/* For legacy hierarchies */
uint64_t cpu_shares;
uint64_t startup_cpu_shares;
2010-03-31 16:29:55 +02:00
uint64_t blockio_weight;
uint64_t startup_blockio_weight;
LIST_HEAD(CGroupBlockIODeviceWeight, blockio_device_weights);
LIST_HEAD(CGroupBlockIODeviceBandwidth, blockio_device_bandwidths);
uint64_t memory_limit;
CGroupDevicePolicy device_policy;
LIST_HEAD(CGroupDeviceAllow, device_allow);
core: add io controller support on the unified hierarchy On the unified hierarchy, blkio controller is renamed to io and the interface is changed significantly. * blkio.weight and blkio.weight_device are consolidated into io.weight which uses the standardized weight range [1, 10000] with 100 as the default value. * blkio.throttle.{read|write}_{bps|iops}_device are consolidated into io.max. Expansion of throttling features is being worked on to support work-conserving absolute limits (io.low and io.high). * All stats are consolidated into io.stats. This patchset adds support for the new interface. As the interface has been revamped and new features are expected to be added, it seems best to treat it as a separate controller rather than trying to expand the blkio settings although we might add automatic translation if only blkio settings are specified. * io.weight handling is mostly identical to blkio.weight[_device] handling except that the weight range is different. * Both read and write bandwidth settings are consolidated into CGroupIODeviceLimit which describes all limits applicable to the device. This makes it less painful to add new limits. * "max" can be used to specify the maximum limit which is equivalent to no config for max limits and treated as such. If a given CGroupIODeviceLimit doesn't contain any non-default configs, the config struct is discarded once the no limit config is applied to cgroup. * lookup_blkio_device() is renamed to lookup_block_device(). Signed-off-by: Tejun Heo <htejun@fb.com>
2016-05-05 22:42:55 +02:00
/* Common */
TasksMax tasks_max;
/* Settings for systemd-oomd */
ManagedOOMMode moom_swap;
ManagedOOMMode moom_mem_pressure;
int moom_mem_pressure_limit;
};
/* Used when querying IP accounting data */
typedef enum CGroupIPAccountingMetric {
CGROUP_IP_INGRESS_BYTES,
CGROUP_IP_INGRESS_PACKETS,
CGROUP_IP_EGRESS_BYTES,
CGROUP_IP_EGRESS_PACKETS,
_CGROUP_IP_ACCOUNTING_METRIC_MAX,
_CGROUP_IP_ACCOUNTING_METRIC_INVALID = -1,
} CGroupIPAccountingMetric;
/* Used when querying IO accounting data */
typedef enum CGroupIOAccountingMetric {
CGROUP_IO_READ_BYTES,
CGROUP_IO_WRITE_BYTES,
CGROUP_IO_READ_OPERATIONS,
CGROUP_IO_WRITE_OPERATIONS,
_CGROUP_IO_ACCOUNTING_METRIC_MAX,
_CGROUP_IO_ACCOUNTING_METRIC_INVALID = -1,
} CGroupIOAccountingMetric;
typedef struct Unit Unit;
typedef struct Manager Manager;
2010-03-31 16:29:55 +02:00
usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);
void cgroup_context_init(CGroupContext *c);
void cgroup_context_done(CGroupContext *c);
void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
core: add io controller support on the unified hierarchy On the unified hierarchy, blkio controller is renamed to io and the interface is changed significantly. * blkio.weight and blkio.weight_device are consolidated into io.weight which uses the standardized weight range [1, 10000] with 100 as the default value. * blkio.throttle.{read|write}_{bps|iops}_device are consolidated into io.max. Expansion of throttling features is being worked on to support work-conserving absolute limits (io.low and io.high). * All stats are consolidated into io.stats. This patchset adds support for the new interface. As the interface has been revamped and new features are expected to be added, it seems best to treat it as a separate controller rather than trying to expand the blkio settings although we might add automatic translation if only blkio settings are specified. * io.weight handling is mostly identical to blkio.weight[_device] handling except that the weight range is different. * Both read and write bandwidth settings are consolidated into CGroupIODeviceLimit which describes all limits applicable to the device. This makes it less painful to add new limits. * "max" can be used to specify the maximum limit which is equivalent to no config for max limits and treated as such. If a given CGroupIODeviceLimit doesn't contain any non-default configs, the config struct is discarded once the no limit config is applied to cgroup. * lookup_blkio_device() is renamed to lookup_block_device(). Signed-off-by: Tejun Heo <htejun@fb.com>
2016-05-05 22:42:55 +02:00
void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w);
void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l);
void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l);
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
2010-03-31 16:29:55 +02:00
int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
CGroupMask unit_get_own_mask(Unit *u);
CGroupMask unit_get_delegate_mask(Unit *u);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
CGroupMask unit_get_members_mask(Unit *u);
CGroupMask unit_get_siblings_mask(Unit *u);
cgroup: Add DisableControllers= directive to disable controller in subtree Some controllers (like the CPU controller) have a performance cost that is non-trivial on certain workloads. While this can be mitigated and improved to an extent, there will for some controllers always be some overheads associated with the benefits gained from the controller. Inside Facebook, the fix applied has been to disable the CPU controller forcibly with `cgroup_disable=cpu` on the kernel command line. This presents a problem: to disable or reenable the controller, a reboot is required, but this is quite cumbersome and slow to do for many thousands of machines, especially machines where disabling/enabling a stateful service on a machine is a matter of several minutes. Currently systemd provides some configuration knobs for these in the form of `[Default]CPUAccounting`, `[Default]MemoryAccounting`, and the like. The limitation of these is that Default*Accounting is overrideable by individual services, of which any one could decide to reenable a controller within the hierarchy at any point just by using a controller feature implicitly (eg. `CPUWeight`), even if the use of that CPU feature could just be opportunistic. Since many services are provided by the distribution, or by upstream teams at a particular organisation, it's not a sustainable solution to simply try to find and remove offending directives from these units. This commit presents a more direct solution -- a DisableControllers= directive that forcibly disallows a controller from being enabled within a subtree.
2018-12-03 15:38:06 +01:00
CGroupMask unit_get_ancestor_disable_mask(Unit *u);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
CGroupMask unit_get_target_mask(Unit *u);
CGroupMask unit_get_enable_mask(Unit *u);
void unit_invalidate_cgroup_members_masks(Unit *u);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
void unit_add_family_to_cgroup_realize_queue(Unit *u);
cgroup: be more careful with which controllers we can enable/disable on a cgroup This changes cg_enable_everywhere() to return which controllers are enabled for the specified cgroup. This information is then used to correctly track the enablement mask currently in effect for a unit. Moreover, when we try to turn off a controller, and this works, then this is indicates that the parent unit might succesfully turn it off now, too as our unit might have kept it busy. So far, when realizing cgroups, i.e. when syncing up the kernel representation of relevant cgroups with our own idea we would strictly work from the root to the leaves. This is generally a good approach, as when controllers are enabled this has to happen in root-to-leaves order. However, when controllers are disabled this has to happen in the opposite order: in leaves-to-root order (this is because controllers can only be enabled in a child if it is already enabled in the parent, and if it shall be disabled in the parent then it has to be disabled in the child first, otherwise it is considered busy when it is attempted to remove it in the parent). To make things complicated when invalidating a unit's cgroup membershup systemd can actually turn off some controllers previously turned on at the very same time as it turns on other controllers previously turned off. In such a case we have to work up leaves-to-root *and* root-to-leaves right after each other. With this patch this is implemented: we still generally operate root-to-leaves, but as soon as we noticed we successfully turned off a controller previously turned on for a cgroup we'll re-enqueue the cgroup realization for all parents of a unit, thus implementing leaves-to-root where necessary.
2018-11-22 21:45:33 +01:00
const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask);
char *unit_default_cgroup_path(const Unit *u);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
int unit_set_cgroup_path(Unit *u, const char *path);
int unit_pick_cgroup_path(Unit *u);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
int unit_realize_cgroup(Unit *u);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
void unit_release_cgroup(Unit *u);
void unit_prune_cgroup(Unit *u);
int unit_watch_cgroup(Unit *u);
int unit_watch_cgroup_memory(Unit *u);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
void unit_add_to_cgroup_empty_queue(Unit *u);
2020-09-09 09:24:23 +02:00
int unit_check_oomd_kill(Unit *u);
int unit_check_oom(Unit *u);
int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
2010-03-31 16:29:55 +02:00
int manager_setup_cgroup(Manager *m);
void manager_shutdown_cgroup(Manager *m, bool delete);
unsigned manager_dispatch_cgroup_realize_queue(Manager *m);
Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid);
Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);
2010-03-31 16:29:55 +02:00
2019-04-16 19:44:05 +02:00
uint64_t unit_get_ancestor_memory_min(Unit *u);
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow In cgroup v2 we have protection tunables -- currently MemoryLow and MemoryMin (there will be more in future for other resources, too). The design of these protection tunables requires not only intermediate cgroups to propagate protections, but also the units at the leaf of that resource's operation to accept it (by setting MemoryLow or MemoryMin). This makes sense from an low-level API design perspective, but it's a good idea to also have a higher-level abstraction that can, by default, propagate these resources to children recursively. In this patch, this happens by having descendants set memory.low to N if their ancestor has DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow value. Any affected unit can opt out of this propagation by manually setting `MemoryLow` to some value in its unit configuration. A unit can also stop further propagation by setting `DefaultMemoryLow=` with no argument. This removes further propagation in the subtree, but has no effect on the unit itself (for that, use `MemoryLow=0`). Our use case in production is simplifying the configuration of machines which heavily rely on memory protection tunables, but currently require tweaking a huge number of unit files to make that a reality. This directive makes that significantly less fragile, and decreases the risk of misconfiguration. After this patch is merged, I will implement DefaultMemoryMin= using the same principles.
2019-03-28 13:50:50 +01:00
uint64_t unit_get_ancestor_memory_low(Unit *u);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
int unit_search_main_pid(Unit *u, pid_t *ret);
int unit_watch_all_pids(Unit *u);
2010-03-31 16:29:55 +02:00
int unit_synthesize_cgroup_empty_event(Unit *u);
int unit_get_memory_current(Unit *u, uint64_t *ret);
int unit_get_tasks_current(Unit *u, uint64_t *ret);
int unit_get_cpu_usage(Unit *u, nsec_t *ret);
int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret);
int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret);
int unit_reset_cpu_accounting(Unit *u);
int unit_reset_ip_accounting(Unit *u);
int unit_reset_io_accounting(Unit *u);
int unit_reset_accounting(Unit *u);
#define UNIT_CGROUP_BOOL(u, name) \
({ \
CGroupContext *cc = unit_get_cgroup_context(u); \
cc ? cc->name : false; \
})
bool manager_owns_host_root_cgroup(Manager *m);
bool unit_has_host_root_cgroup(Unit *u);
core: unified cgroup hierarchy support This patch set adds full support the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possibly to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified heirarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicey, we move PID 1 into the new scope in all cases.) This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefuly makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
2015-09-01 19:22:36 +02:00
int manager_notify_cgroup_empty(Manager *m, const char *group);
void unit_invalidate_cgroup(Unit *u, CGroupMask m);
void unit_invalidate_cgroup_bpf(Unit *u);
void manager_invalidate_startup_units(Manager *m);
const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;
bool unit_cgroup_delegate(Unit *u);
int compare_job_priority(const void *a, const void *b);
int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name);
int unit_cgroup_freezer_action(Unit *u, FreezerAction action);
const char* freezer_action_to_string(FreezerAction a) _const_;
FreezerAction freezer_action_from_string(const char *s) _pure_;