2020-11-09 05:23:58 +01:00
|
|
|
/* SPDX-License-Identifier: LGPL-2.1-or-later */
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2010-07-11 00:50:49 +02:00
|
|
|
#include <fcntl.h>
|
2010-06-21 23:27:18 +02:00
|
|
|
|
2019-03-19 19:05:19 +01:00
|
|
|
#include "sd-messages.h"
|
|
|
|
|
2015-10-27 03:01:06 +01:00
|
|
|
#include "alloc-util.h"
|
2017-12-22 15:22:59 +01:00
|
|
|
#include "blockdev-util.h"
|
2019-03-14 13:14:33 +01:00
|
|
|
#include "bpf-devices.h"
|
2017-09-05 19:27:53 +02:00
|
|
|
#include "bpf-firewall.h"
|
2018-06-11 12:17:32 +02:00
|
|
|
#include "btrfs-util.h"
|
2018-02-07 22:52:52 +01:00
|
|
|
#include "bus-error.h"
|
2019-08-01 13:14:45 +02:00
|
|
|
#include "cgroup-setup.h"
|
2015-09-10 12:32:16 +02:00
|
|
|
#include "cgroup-util.h"
|
2015-10-25 13:14:12 +01:00
|
|
|
#include "cgroup.h"
|
|
|
|
#include "fd-util.h"
|
2015-10-26 18:05:03 +01:00
|
|
|
#include "fileio.h"
|
2015-10-27 14:58:05 +01:00
|
|
|
#include "fs-util.h"
|
2020-04-29 17:53:43 +02:00
|
|
|
#include "io-util.h"
|
2019-11-05 13:50:28 +01:00
|
|
|
#include "limits-util.h"
|
2020-04-29 17:53:43 +02:00
|
|
|
#include "nulstr-util.h"
|
2015-10-26 16:18:16 +01:00
|
|
|
#include "parse-util.h"
|
2012-05-07 21:36:12 +02:00
|
|
|
#include "path-util.h"
|
2015-09-10 12:32:16 +02:00
|
|
|
#include "process-util.h"
|
2018-01-17 15:39:39 +01:00
|
|
|
#include "procfs-util.h"
|
2013-06-20 03:45:08 +02:00
|
|
|
#include "special.h"
|
2018-06-29 12:03:33 +02:00
|
|
|
#include "stat-util.h"
|
2017-09-05 19:27:53 +02:00
|
|
|
#include "stdio-util.h"
|
2015-10-26 22:31:05 +01:00
|
|
|
#include "string-table.h"
|
2015-10-24 22:58:24 +02:00
|
|
|
#include "string-util.h"
|
2018-02-09 19:07:01 +01:00
|
|
|
#include "virt.h"
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2018-11-02 17:21:57 +01:00
|
|
|
/* Default CPU quota period: 100 milliseconds, expressed in microseconds. */
#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

/* Picks the log level for a failed cgroup attribute write. 'r' may be passed as a positive or negative errno.
 * A missing attribute or an access problem (ENOENT, EROFS, EACCES, EPERM) is only worth LOG_DEBUG: container
 * managers and kernels may deliberately hide individual attributes from us, and we want to be nice to them.
 * Every other error is reported at LOG_WARNING. */
#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
|
|
|
|
|
2019-11-05 13:50:28 +01:00
|
|
|
uint64_t tasks_max_resolve(const TasksMax *tasks_max) {
|
|
|
|
if (tasks_max->scale == 0)
|
|
|
|
return tasks_max->value;
|
|
|
|
|
|
|
|
return system_tasks_max_scale(tasks_max->value, tasks_max->scale);
|
|
|
|
}
|
|
|
|
|
2018-11-20 22:42:16 +01:00
|
|
|
/* Tells whether this manager is in charge of the host's root cgroup.
 *
 * Comparing the cgroup root path against "/" alone is not enough: the path looks the same when CLONE_NEWCGROUP
 * is in the mix. Since there appears to be no nice way to detect a CLONE_NEWCGROUP namespace, we simply treat
 * any kind of container virtualization as "not the host". User managers never own the host root either. */
bool manager_owns_host_root_cgroup(Manager *m) {
        assert(m);

        /* Short-circuit order matters: the container check is only done for system managers. */
        if (MANAGER_IS_USER(m) || detect_container() > 0)
                return false;

        return empty_or_root(m->cgroup_root);
}
|
|
|
|
|
2018-11-20 22:42:16 +01:00
|
|
|
/* Tells whether this unit is the one managing the host's root cgroup, i.e. its manager owns the host root
 * cgroup and the unit itself is the root slice. */
bool unit_has_host_root_cgroup(Unit *u) {
        assert(u);

        return manager_owns_host_root_cgroup(u->manager) &&
                unit_has_name(u, SPECIAL_ROOT_SLICE);
}
|
|
|
|
|
2018-11-20 20:19:58 +01:00
|
|
|
/* Writes 'value' to the cgroup attribute 'attribute' of the unit's cgroup in the given controller hierarchy.
 * On failure a message is logged (level chosen by LOG_LEVEL_CGROUP_WRITE()) showing only the first line of
 * the value. Returns the result of cg_set_attribute() unchanged. */
static int set_attribute_and_warn(Unit *u, const char *controller, const char *attribute, const char *value) {
        int r;

        r = cg_set_attribute(controller, u->cgroup_path, attribute, value);
        if (r >= 0)
                return r;

        /* An empty cgroup path is shown as "/"; "%.*s" cuts the value at its first newline character. */
        log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r, "Failed to set '%s' attribute on '%s' to '%.*s': %m",
                            strna(attribute), isempty(u->cgroup_path) ? "/" : u->cgroup_path, (int) strcspn(value, NEWLINE), value);

        return r;
}
|
|
|
|
|
2016-06-24 15:59:24 +02:00
|
|
|
/* Emits the "compatibility translation activated" warning; a function-local static flag makes sure it is
 * logged only on the first call. */
static void cgroup_compat_warn(void) {
        static bool warned = false;

        if (!warned) {
                log_warning("cgroup compatibility translation between legacy and unified hierarchy settings activated. "
                            "See cgroup-compat debug messages for details.");

                warned = true;
        }
}
|
|
|
|
|
|
|
|
/* Logs a "cgroup-compat: "-prefixed debug message for the given unit, after making sure the one-time
 * compatibility warning has been issued. Wrapped in do/while so it behaves like a single statement. */
#define log_cgroup_compat(unit, fmt, ...)                                       \
        do {                                                                    \
                cgroup_compat_warn();                                           \
                log_unit_debug(unit, "cgroup-compat: " fmt, ##__VA_ARGS__);     \
        } while (false)
|
2016-06-02 19:02:53 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
/* Resets a CGroupContext to its initial state. Everything is initialized to the kernel defaults; members not
 * listed below are implicitly zeroed by the compound literal assignment. */
void cgroup_context_init(CGroupContext *c) {
        assert(c);

        *c = (CGroupContext) {
                /* CPU: unified weights, quota, then the legacy shares */
                .cpu_weight = CGROUP_WEIGHT_INVALID,
                .startup_cpu_weight = CGROUP_WEIGHT_INVALID,
                .cpu_quota_per_sec_usec = USEC_INFINITY,
                .cpu_quota_period_usec = USEC_INFINITY,
                .cpu_shares = CGROUP_CPU_SHARES_INVALID,
                .startup_cpu_shares = CGROUP_CPU_SHARES_INVALID,

                /* Memory: all limits start out as "no limit" */
                .memory_high = CGROUP_LIMIT_MAX,
                .memory_max = CGROUP_LIMIT_MAX,
                .memory_swap_max = CGROUP_LIMIT_MAX,
                .memory_limit = CGROUP_LIMIT_MAX,

                /* IO and legacy block IO weights */
                .io_weight = CGROUP_WEIGHT_INVALID,
                .startup_io_weight = CGROUP_WEIGHT_INVALID,
                .blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
                .startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,

                /* Tasks */
                .tasks_max = TASKS_MAX_UNSET,

                /* Managed OOM */
                .moom_swap = MANAGED_OOM_AUTO,
                .moom_mem_pressure = MANAGED_OOM_AUTO,
        };
}
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
/* Drops one entry of the context's device_allow list: the entry is unlinked from the list, and both its path
 * string and the entry itself are freed. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        free(a->path);

        /* Unlink before releasing the node itself. */
        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a);
}
|
|
|
|
|
2016-05-05 22:42:55 +02:00
|
|
|
/* Drops one entry of the context's io_device_weights list: the entry is unlinked from the list, and both its
 * path string and the entry itself are freed. */
void cgroup_context_free_io_device_weight(CGroupContext *c, CGroupIODeviceWeight *w) {
        assert(c);
        assert(w);

        free(w->path);

        /* Unlink before releasing the node itself. */
        LIST_REMOVE(device_weights, c->io_device_weights, w);
        free(w);
}
|
|
|
|
|
2018-06-13 23:16:35 +02:00
|
|
|
/* Drops one entry of the context's io_device_latencies list: the entry is unlinked from the list, and both
 * its path string and the entry itself are freed. */
void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l) {
        assert(c);
        assert(l);

        free(l->path);

        /* Unlink before releasing the node itself. */
        LIST_REMOVE(device_latencies, c->io_device_latencies, l);
        free(l);
}
|
|
|
|
|
2016-05-05 22:42:55 +02:00
|
|
|
/* Drops one entry of the context's io_device_limits list: the entry is unlinked from the list, and both its
 * path string and the entry itself are freed. */
void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit *l) {
        assert(c);
        assert(l);

        free(l->path);

        /* Unlink before releasing the node itself. */
        LIST_REMOVE(device_limits, c->io_device_limits, l);
        free(l);
}
|
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
/* Drops one entry of the context's blockio_device_weights list: the entry is unlinked from the list, and both
 * its path string and the entry itself are freed. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        free(w->path);

        /* Unlink before releasing the node itself. */
        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w);
}
|
|
|
|
|
|
|
|
/* Drops one entry of the context's blockio_device_bandwidths list: the entry is unlinked from the list, and
 * both its path string and the entry itself are freed. */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        free(b->path);

        /* Unlink before releasing the node itself. */
        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b);
}
|
|
|
|
|
|
|
|
/* Releases everything a CGroupContext holds on to: the CPU sets, the IP filter paths and address access
 * lists, and every entry of the per-device lists. The context structure itself is not freed. */
void cgroup_context_done(CGroupContext *c) {
        assert(c);

        /* The individual cleanups below do not depend on each other. */

        cpu_set_reset(&c->cpuset_cpus);
        cpu_set_reset(&c->cpuset_mems);

        c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
        c->ip_filters_egress = strv_free(c->ip_filters_egress);

        c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow);
        c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny);

        /* Each free helper unlinks the entry it is given, so popping the list head until the list is empty
         * drains the whole list. */
        while (c->device_allow)
                cgroup_context_free_device_allow(c, c->device_allow);

        while (c->blockio_device_bandwidths)
                cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);

        while (c->blockio_device_weights)
                cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);

        while (c->io_device_limits)
                cgroup_context_free_io_device_limit(c, c->io_device_limits);

        while (c->io_device_latencies)
                cgroup_context_free_io_device_latency(c, c->io_device_latencies);

        while (c->io_device_weights)
                cgroup_context_free_io_device_weight(c, c->io_device_weights);
}
|
|
|
|
|
cgroup: Allow checking systemd-internal limits against the kernel
We currently don't have any mitigations against another privileged user
on the system messing with the cgroup hierarchy, bringing the system out
of line with what we've set in systemd. We also don't have any real way
to surface this to the user (we do have logs, but you have to know to
look in the first place).
There are a few possible solutions:
1. Maintaining our own cgroup tree with the new fsopen API and having a
read-only copy for everyone else. However, there are some
complications on this front, and this may be infeasible in some
environments. I'd rate this as a longer term effort that's tangential
to this patch.
2. Actively checking for changes with {fa,i}notify and changing them
back afterwards to match our configuration again. This is also
possible, but it's also good to have a way to do passive monitoring
of the situation without taking hard action. Also, currently daemons
like senpai do actually need to modify the tree behind systemd's
back (although hopefully this should be more integrated soon).
This patch implements another option, where one can, on demand, monitor
deviations in cgroup memory configuration from systemd's internal state.
Currently the only consumer is `systemd-analyze dump`, but the interface
is generic enough that it can also be exposed elsewhere later (for
example, over D-Bus).
Currently only memory limit style properties are supported, but later I
also plan to expand this out to other properties that systemd should
have ultimate control over.
2019-09-30 17:13:32 +02:00
|
|
|
/* Reads the memory controller attribute 'file' of the unit's cgroup as a uint64_t into *ret. Fails with
 * -EOWNERDEAD if the unit's cgroup has not been realized; otherwise returns whatever
 * cg_get_attribute_as_uint64() returns. */
static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
        assert(u);

        return u->cgroup_realized
                ? cg_get_attribute_as_uint64("memory", u->cgroup_path, file, ret)
                : -EOWNERDEAD;
}
|
|
|
|
|
|
|
|
/* Checks the kernel's memcg configuration for one memory property against systemd's own idea of it.
 *
 * property_name must be one of "MemoryLow", "MemoryMin", "MemoryHigh", "MemoryMax" or "MemorySwapMax";
 * anything else yields -EINVAL. Not supported on cgroup v1 (-ENODATA).
 *
 * Returns:
 *
 * <0: On error.
 *  0: If the kernel memory setting doesn't match our configuration.
 * >0: If the kernel memory setting matches our configuration.
 *
 * The following values are only guaranteed to be populated on return >=0:
 *
 * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
 * - ret_kernel_value will contain the actual value presented by the kernel. */
static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
        uint64_t expected;
        const char *attr;
        CGroupContext *ctx;
        CGroupMask mask;
        int r;

        assert(u);

        r = cg_all_unified();
        if (r < 0)
                return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m");

        /* Two cases in which there is nothing to compare:
         *
         * - cgroup v1 (r == 0), which is unsupported. We deliberately don't return ENOENT for it, since that
         *   could mask a genuine problem where somebody else has silently masked the controller.
         * - the root slice, which doesn't have any controller files. */
        if (r == 0 || unit_has_name(u, SPECIAL_ROOT_SLICE))
                return -ENODATA;

        /* MemoryFoo may be set even though systemd does not want the memory controller enabled, e.g. due to
         * DisableControllers= or cgroup_disable on the kernel command line. Avoid specious errors in these
         * scenarios by checking that we expect the memory controller to be enabled at all. */
        mask = unit_get_target_mask(u);
        if (!FLAGS_SET(mask, CGROUP_MASK_MEMORY))
                return -ENODATA;

        ctx = unit_get_cgroup_context(u);
        assert(ctx);

        /* Map the property to the value we expect and the kernel attribute that carries it. */
        if (streq(property_name, "MemoryLow")) {
                attr = "memory.low";
                expected = unit_get_ancestor_memory_low(u);
        } else if (streq(property_name, "MemoryMin")) {
                attr = "memory.min";
                expected = unit_get_ancestor_memory_min(u);
        } else if (streq(property_name, "MemoryHigh")) {
                attr = "memory.high";
                expected = ctx->memory_high;
        } else if (streq(property_name, "MemoryMax")) {
                attr = "memory.max";
                expected = ctx->memory_max;
        } else if (streq(property_name, "MemorySwapMax")) {
                attr = "memory.swap.max";
                expected = ctx->memory_swap_max;
        } else
                return -EINVAL;

        r = unit_get_kernel_memory_limit(u, attr, ret_kernel_value);
        if (r < 0)
                return log_unit_debug_errno(u, r, "Failed to parse %s: %m", attr);

        /* Future kernels are expected to expose exactly what userspace wrote instead of a page-rounded
         * value, so an exact match is accepted as-is. Current kernels report the value page-aligned down
         * (limits are stored in pages internally), hence on a mismatch we align our own value down before
         * the final comparison. "No limit" is never aligned. */
        if (*ret_kernel_value != expected && expected != CGROUP_LIMIT_MAX)
                expected = PAGE_ALIGN_DOWN(expected);

        *ret_unit_value = expected;

        return *ret_kernel_value == *ret_unit_value;
}
|
|
|
|
|
2019-10-03 14:21:29 +02:00
|
|
|
#define FORMAT_CGROUP_DIFF_MAX 128
|
|
|
|
|
|
|
|
static char *format_cgroup_memory_limit_comparison(char *buf, size_t l, Unit *u, const char *property_name) {
|
|
|
|
uint64_t kval, sval;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(u);
|
|
|
|
assert(buf);
|
|
|
|
assert(l > 0);
|
|
|
|
|
|
|
|
r = unit_compare_memory_limit(u, property_name, &sval, &kval);
|
|
|
|
|
|
|
|
/* memory.swap.max is special in that it relies on CONFIG_MEMCG_SWAP (and the default swapaccount=1).
|
|
|
|
* In the absence of reliably being able to detect whether memcg swap support is available or not,
|
|
|
|
* only complain if the error is not ENOENT. */
|
|
|
|
if (r > 0 || IN_SET(r, -ENODATA, -EOWNERDEAD) ||
|
|
|
|
(r == -ENOENT && streq(property_name, "MemorySwapMax"))) {
|
|
|
|
buf[0] = 0;
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (r < 0) {
|
|
|
|
snprintf(buf, l, " (error getting kernel value: %s)", strerror_safe(r));
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(buf, l, " (different value in kernel: %" PRIu64 ")", kval);
|
|
|
|
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Writes a human-readable dump of the unit's cgroup settings to 'f', one "Name: value" entry per line, each
 * line starting with 'prefix' (NULL is treated as the empty string).
 *
 * The five memory limit lines are suffixed with the annotation produced by
 * format_cgroup_memory_limit_comparison(), which points out deviations between our configuration and what
 * the kernel reports.
 *
 * The unit must have a cgroup context (asserted). */
void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
        _cleanup_free_ char *disable_controllers_str = NULL, *cpuset_cpus = NULL, *cpuset_mems = NULL;
        CGroupIODeviceLimit *il;
        CGroupIODeviceWeight *iw;
        CGroupIODeviceLatency *l;
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;
        CGroupContext *c;
        IPAddressAccessItem *iaai;
        char **path;
        char q[FORMAT_TIMESPAN_MAX];
        char v[FORMAT_TIMESPAN_MAX];

        /* One buffer per memory property: all five annotations are consumed by the same fprintf() call and
         * hence must stay valid at the same time. */
        char cda[FORMAT_CGROUP_DIFF_MAX];
        char cdb[FORMAT_CGROUP_DIFF_MAX];
        char cdc[FORMAT_CGROUP_DIFF_MAX];
        char cdd[FORMAT_CGROUP_DIFF_MAX];
        char cde[FORMAT_CGROUP_DIFF_MAX];

        assert(u);
        assert(f);

        c = unit_get_cgroup_context(u);
        assert(c);

        prefix = strempty(prefix);

        /* Best effort: if any of these conversions fails the string stays NULL and is printed as empty. */
        (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);

        cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
        cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);

        fprintf(f,
                "%sCPUAccounting: %s\n"
                "%sIOAccounting: %s\n"
                "%sBlockIOAccounting: %s\n"
                "%sMemoryAccounting: %s\n"
                "%sTasksAccounting: %s\n"
                "%sIPAccounting: %s\n"
                "%sCPUWeight: %" PRIu64 "\n"
                "%sStartupCPUWeight: %" PRIu64 "\n"
                "%sCPUShares: %" PRIu64 "\n"
                "%sStartupCPUShares: %" PRIu64 "\n"
                "%sCPUQuotaPerSecSec: %s\n"
                "%sCPUQuotaPeriodSec: %s\n"
                "%sAllowedCPUs: %s\n"
                "%sAllowedMemoryNodes: %s\n"
                "%sIOWeight: %" PRIu64 "\n"
                "%sStartupIOWeight: %" PRIu64 "\n"
                "%sBlockIOWeight: %" PRIu64 "\n"
                "%sStartupBlockIOWeight: %" PRIu64 "\n"
                "%sDefaultMemoryMin: %" PRIu64 "\n"
                "%sDefaultMemoryLow: %" PRIu64 "\n"
                "%sMemoryMin: %" PRIu64 "%s\n"
                "%sMemoryLow: %" PRIu64 "%s\n"
                "%sMemoryHigh: %" PRIu64 "%s\n"
                "%sMemoryMax: %" PRIu64 "%s\n"
                "%sMemorySwapMax: %" PRIu64 "%s\n"
                "%sMemoryLimit: %" PRIu64 "\n"
                "%sTasksMax: %" PRIu64 "\n"
                "%sDevicePolicy: %s\n"
                "%sDisableControllers: %s\n"
                "%sDelegate: %s\n"
                "%sManagedOOMSwap: %s\n"
                "%sManagedOOMMemoryPressure: %s\n"
                "%sManagedOOMMemoryPressureLimitPercent: %d%%\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->io_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, yes_no(c->tasks_accounting),
                prefix, yes_no(c->ip_accounting),
                prefix, c->cpu_weight,
                prefix, c->startup_cpu_weight,
                prefix, c->cpu_shares,
                prefix, c->startup_cpu_shares,
                prefix, format_timespan(q, sizeof(q), c->cpu_quota_per_sec_usec, 1),
                prefix, format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1),
                prefix, strempty(cpuset_cpus),
                prefix, strempty(cpuset_mems),
                prefix, c->io_weight,
                prefix, c->startup_io_weight,
                prefix, c->blockio_weight,
                prefix, c->startup_blockio_weight,
                prefix, c->default_memory_min,
                prefix, c->default_memory_low,
                prefix, c->memory_min, format_cgroup_memory_limit_comparison(cda, sizeof(cda), u, "MemoryMin"),
                prefix, c->memory_low, format_cgroup_memory_limit_comparison(cdb, sizeof(cdb), u, "MemoryLow"),
                prefix, c->memory_high, format_cgroup_memory_limit_comparison(cdc, sizeof(cdc), u, "MemoryHigh"),
                prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryMax"),
                prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "MemorySwapMax"),
                prefix, c->memory_limit,
                prefix, tasks_max_resolve(&c->tasks_max),
                prefix, cgroup_device_policy_to_string(c->device_policy),
                prefix, strempty(disable_controllers_str),
                prefix, yes_no(c->delegate),
                prefix, managed_oom_mode_to_string(c->moom_swap),
                prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
                prefix, c->moom_mem_pressure_limit);

        /* The list of delegated controllers is only shown when delegation is on. */
        if (c->delegate) {
                _cleanup_free_ char *t = NULL;

                (void) cg_mask_to_string(c->delegate_controllers, &t);

                fprintf(f, "%sDelegateControllers: %s\n",
                        prefix,
                        strempty(t));
        }

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow: %s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        LIST_FOREACH(device_weights, iw, c->io_device_weights)
                fprintf(f,
                        "%sIODeviceWeight: %s %" PRIu64 "\n",
                        prefix,
                        iw->path,
                        iw->weight);

        LIST_FOREACH(device_latencies, l, c->io_device_latencies)
                fprintf(f,
                        "%sIODeviceLatencyTargetSec: %s %s\n",
                        prefix,
                        l->path,
                        format_timespan(q, sizeof(q), l->target_usec, 1));

        LIST_FOREACH(device_limits, il, c->io_device_limits) {
                char buf[FORMAT_BYTES_MAX];
                CGroupIOLimitType type;

                /* Only limits that differ from their default are listed. */
                for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                        if (il->limits[type] != cgroup_io_limit_defaults[type])
                                fprintf(f,
                                        "%s%s: %s %s\n",
                                        prefix,
                                        cgroup_io_limit_type_to_string(type),
                                        il->path,
                                        format_bytes(buf, sizeof(buf), il->limits[type]));
        }

        /* Every entry must end in a newline: without the terminating "\n" here, consecutive
         * BlockIODeviceWeight entries and whatever is printed next would be glued onto a single line. */
        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIODeviceWeight: %s %" PRIu64 "\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                /* Unlimited directions are omitted. */
                if (b->rbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOReadBandwidth: %s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->rbps));
                if (b->wbps != CGROUP_LIMIT_MAX)
                        fprintf(f,
                                "%sBlockIOWriteBandwidth: %s %s\n",
                                prefix,
                                b->path,
                                format_bytes(buf, sizeof(buf), b->wbps));
        }

        LIST_FOREACH(items, iaai, c->ip_address_allow) {
                _cleanup_free_ char *k = NULL;

                /* If the address cannot be formatted, k stays NULL and strnull() substitutes a placeholder. */
                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressAllow: %s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        LIST_FOREACH(items, iaai, c->ip_address_deny) {
                _cleanup_free_ char *k = NULL;

                (void) in_addr_to_string(iaai->family, &iaai->address, &k);
                fprintf(f, "%sIPAddressDeny: %s/%u\n", prefix, strnull(k), iaai->prefixlen);
        }

        STRV_FOREACH(path, c->ip_filters_ingress)
                fprintf(f, "%sIPIngressFilterPath: %s\n", prefix, *path);

        STRV_FOREACH(path, c->ip_filters_egress)
                fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path);
}
|
|
|
|
|
2018-08-06 06:42:14 +02:00
|
|
|
/* Prepends a new entry for device 'dev' to the context's device_allow list.
 *
 * 'mode' may only consist of the characters "rwm" (asserted); each character present enables the matching
 * flag of the entry. An empty or NULL mode enables all three flags.
 *
 * Returns 0 on success, -ENOMEM if memory is exhausted (in which case the list is left untouched). */
int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) {
        _cleanup_free_ CGroupDeviceAllow *entry = NULL;
        _cleanup_free_ char *path_copy = NULL;
        bool everything;

        assert(c);
        assert(dev);
        assert(isempty(mode) || in_charset(mode, "rwm"));

        entry = new(CGroupDeviceAllow, 1);
        if (!entry)
                return -ENOMEM;

        path_copy = strdup(dev);
        if (!path_copy)
                return -ENOMEM;

        /* No mode given: grant all access types. strchr() is only reached for a non-empty mode. */
        everything = isempty(mode);

        *entry = (CGroupDeviceAllow) {
                .path = TAKE_PTR(path_copy),
                .r = everything || strchr(mode, 'r'),
                .w = everything || strchr(mode, 'w'),
                .m = everything || strchr(mode, 'm'),
        };

        /* The list owns the entry from here on, hence disarm the cleanup handler. */
        LIST_PREPEND(device_allow, c->device_allow, entry);
        TAKE_PTR(entry);

        return 0;
}
|
|
|
|
|
2019-04-16 19:14:09 +02:00
|
|
|
#define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry) \
|
|
|
|
uint64_t unit_get_ancestor_##entry(Unit *u) { \
|
|
|
|
CGroupContext *c; \
|
|
|
|
\
|
|
|
|
/* 1. Is entry set in this unit? If so, use that. \
|
|
|
|
* 2. Is the default for this entry set in any \
|
|
|
|
* ancestor? If so, use that. \
|
|
|
|
* 3. Otherwise, return CGROUP_LIMIT_MIN. */ \
|
|
|
|
\
|
|
|
|
assert(u); \
|
|
|
|
\
|
|
|
|
c = unit_get_cgroup_context(u); \
|
2019-04-24 10:54:44 +02:00
|
|
|
if (c && c->entry##_set) \
|
2019-04-16 19:14:09 +02:00
|
|
|
return c->entry; \
|
|
|
|
\
|
2019-04-24 10:54:44 +02:00
|
|
|
while ((u = UNIT_DEREF(u->slice))) { \
|
2019-04-16 19:14:09 +02:00
|
|
|
c = unit_get_cgroup_context(u); \
|
2019-04-24 10:54:44 +02:00
|
|
|
if (c && c->default_##entry##_set) \
|
2019-04-16 19:14:09 +02:00
|
|
|
return c->default_##entry; \
|
|
|
|
} \
|
|
|
|
\
|
|
|
|
/* We've reached the root, but nobody had default for \
|
|
|
|
* this entry set, so set it to the kernel default. */ \
|
|
|
|
return CGROUP_LIMIT_MIN; \
|
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow
In cgroup v2 we have protection tunables -- currently MemoryLow and
MemoryMin (there will be more in future for other resources, too). The
design of these protection tunables requires not only intermediate
cgroups to propagate protections, but also the units at the leaf of that
resource's operation to accept it (by setting MemoryLow or MemoryMin).
This makes sense from an low-level API design perspective, but it's a
good idea to also have a higher-level abstraction that can, by default,
propagate these resources to children recursively. In this patch, this
happens by having descendants set memory.low to N if their ancestor has
DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow
value.
Any affected unit can opt out of this propagation by manually setting
`MemoryLow` to some value in its unit configuration. A unit can also
stop further propagation by setting `DefaultMemoryLow=` with no
argument. This removes further propagation in the subtree, but has no
effect on the unit itself (for that, use `MemoryLow=0`).
Our use case in production is simplifying the configuration of machines
which heavily rely on memory protection tunables, but currently require
tweaking a huge number of unit files to make that a reality. This
directive makes that significantly less fragile, and decreases the risk
of misconfiguration.
After this patch is merged, I will implement DefaultMemoryMin= using the
same principles.
2019-03-28 13:50:50 +01:00
|
|
|
}
|
|
|
|
|
2019-04-16 19:14:09 +02:00
|
|
|
UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
|
2019-04-16 19:44:05 +02:00
|
|
|
UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
|
2019-04-16 19:14:09 +02:00
|
|
|
|
2018-11-27 16:16:35 +01:00
|
|
|
/* Mirrors selected unit state into extended attributes on the unit's control group: the invocation ID
 * (if one is set) is stored in "trusted.invocation_id", and "trusted.delegate" is set or removed depending
 * on unit_cgroup_delegate(). Nothing is done unless we are the system manager. Failures are logged at
 * debug level and otherwise ignored. */
static void cgroup_xattr_apply(Unit *u) {
        char ids[SD_ID128_STRING_MAX];
        int r;

        assert(u);

        if (!MANAGER_IS_SYSTEM(u->manager))
                return;

        if (!sd_id128_is_null(u->invocation_id)) {
                /* The ID is stored in its formatted string form, 32 characters without trailing NUL. */
                r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                                 "trusted.invocation_id",
                                 sd_id128_to_string(u->invocation_id, ids), 32,
                                 0);
                if (r < 0)
                        log_unit_debug_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path);
        }

        if (unit_cgroup_delegate(u)) {
                r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path,
                                 "trusted.delegate",
                                 "1", 1,
                                 0);
                if (r < 0)
                        log_unit_debug_errno(u, r, "Failed to set delegate flag on control group %s, ignoring: %m", u->cgroup_path);
        } else {
                r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "trusted.delegate");
                /* Only report real failures: a successful removal is not an error, and -ENODATA is
                 * treated as "nothing to remove". */
                if (r < 0 && r != -ENODATA)
                        log_unit_debug_errno(u, r, "Failed to remove delegate flag on control group %s, ignoring: %m", u->cgroup_path);
        }
}
|
|
|
|
|
2018-06-11 12:17:32 +02:00
|
|
|
/* Resolves path 'p' to the device number of a block device and stores it in *ret.
 *
 * The path is first handed to device_path_parse_major_minor(); if that reports -ENODEV the path is
 * stat()ed instead. Character devices are refused. For block device nodes the node's own device number is
 * used, for other files the device number of the file system they live on, with a btrfs specific lookup
 * as the last resort. Finally the result is walked down to its originating device and to the whole disk.
 *
 * Returns 0 on success. On failure a warning is logged and the resulting error code is returned. */
static int lookup_block_device(const char *p, dev_t *ret) {
        dev_t node_devno, fs_devno = 0;
        mode_t mode;
        int r;

        assert(p);
        assert(ret);

        r = device_path_parse_major_minor(p, &mode, &node_devno);
        if (r < 0 && r != -ENODEV)
                return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p);

        if (r == -ENODEV) {
                /* Not a parsable device node path, we need to look at the file system object itself */
                struct stat st;

                if (stat(p, &st) < 0)
                        return log_warning_errno(errno, "Couldn't stat device '%s': %m", p);

                mode = st.st_mode;
                node_devno = st.st_rdev;
                fs_devno = st.st_dev;
        }

        if (S_ISCHR(mode))
                return log_warning_errno(SYNTHETIC_ERRNO(ENOTBLK),
                                         "Device node '%s' is a character device, but block device needed.", p);

        if (S_ISBLK(mode))
                *ret = node_devno;
        else if (major(fs_devno) != 0)
                /* Not a device node: use the block device the file is stored on */
                *ret = fs_devno;
        else {
                /* On btrfs, finding the backing block device takes an extra step */
                r = btrfs_get_block_device(p, ret);
                if (r == -ENOTTY)
                        return log_warning_errno(SYNTHETIC_ERRNO(ENODEV),
                                                 "'%s' is not a block device node, and file system block device cannot be determined or is not local.", p);
                if (r < 0)
                        return log_warning_errno(r, "Failed to determine block device backing btrfs file system '%s': %m", p);
        }

        /* For LUKS/DM devices, keep descending until we arrive at the originating block device */
        for (;;)
                if (block_get_originating(*ret, ret) <= 0)
                        break;

        /* For partitions, try to switch over to the whole disk; failure here is not fatal */
        (void) block_get_whole_disk(*ret, ret);

        return 0;
}
|
|
|
|
|
2016-08-07 15:45:39 +02:00
|
|
|
/* True if either the regular or the startup CPU weight has been configured in this context. */
static bool cgroup_context_has_cpu_weight(CGroupContext *c) {
        if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return true;

        return c->startup_cpu_weight != CGROUP_WEIGHT_INVALID;
}
|
|
|
|
|
|
|
|
/* True if either the regular or the startup CPU shares value has been configured in this context. */
static bool cgroup_context_has_cpu_shares(CGroupContext *c) {
        if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return true;

        return c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID;
}
|
|
|
|
|
|
|
|
/* Picks the CPU weight to use in the given manager state: the startup weight while the manager is starting
 * or initializing (provided one is set), else the regular weight, else CGROUP_WEIGHT_DEFAULT. */
static uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state) {
        bool booting = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING);

        if (booting && c->startup_cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_cpu_weight;

        if (c->cpu_weight != CGROUP_WEIGHT_INVALID)
                return c->cpu_weight;

        return CGROUP_WEIGHT_DEFAULT;
}
|
|
|
|
|
|
|
|
/* Picks the CPU shares value to use in the given manager state: the startup value while the manager is
 * starting or initializing (provided one is set), else the regular value, else
 * CGROUP_CPU_SHARES_DEFAULT. */
static uint64_t cgroup_context_cpu_shares(CGroupContext *c, ManagerState state) {
        bool booting = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING);

        if (booting && c->startup_cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->startup_cpu_shares;

        if (c->cpu_shares != CGROUP_CPU_SHARES_INVALID)
                return c->cpu_shares;

        return CGROUP_CPU_SHARES_DEFAULT;
}
|
|
|
|
|
2018-11-02 17:21:57 +01:00
|
|
|
/* Adjusts a CPU quota period so that it fits the kernel's constraints.
 *
 * The kernel uses a minimum resolution of 1ms, so both the period and (quota * period) need to be above
 * that boundary; quota is specified in USecPerSec. On top of that the period may not exceed max_period.
 * The quota must be non-zero. */
usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period) {
        usec_t quota_floor, raised;

        assert(quota > 0);

        /* Smallest period for which quota * period still reaches the resolution */
        quota_floor = resolution * USEC_PER_SEC / quota;

        raised = MAX3(period, resolution, quota_floor);
        return MIN(raised, max_period);
}
|
|
|
|
|
|
|
|
/* Determines the CPU quota period to actually use for unit 'u'. An infinite quota always yields the
 * default period, and an infinite period requests the default period. The value is then clamped via
 * cgroup_cpu_adjust_period(); if clamping changed it, this is logged: as a warning the first time for
 * this unit, at debug level afterwards. */
static usec_t cgroup_cpu_adjust_period_and_log(Unit *u, usec_t period, usec_t quota) {
        usec_t requested, clamped;

        /* Always use default period for infinity quota. */
        if (quota == USEC_INFINITY)
                return CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC;

        /* An infinite period means the default period was requested. */
        requested = period == USEC_INFINITY ? CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC : period;

        /* Clamp to interval [1ms, 1s] */
        clamped = cgroup_cpu_adjust_period(requested, quota, USEC_PER_MSEC, USEC_PER_SEC);
        if (clamped == requested)
                return clamped;

        {
                char v[FORMAT_TIMESPAN_MAX];

                log_unit_full(u, u->warned_clamping_cpu_quota_period ? LOG_DEBUG : LOG_WARNING,
                              "Clamping CPU interval for cpu.max: period is now %s",
                              format_timespan(v, sizeof(v), clamped, 1));
                u->warned_clamping_cpu_quota_period = true;
        }

        return clamped;
}
|
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
/* Writes 'weight' to the "cpu.weight" attribute of the unit's cgroup ("cpu" controller). */
static void cgroup_apply_unified_cpu_weight(Unit *u, uint64_t weight) {
        char value[DECIMAL_STR_MAX(uint64_t) + 2];

        xsprintf(value, "%" PRIu64 "\n", weight);

        (void) set_attribute_and_warn(u, "cpu", "cpu.weight", value);
}
|
|
|
|
|
2018-11-02 17:21:57 +01:00
|
|
|
/* Writes the CPU quota to the "cpu.max" attribute of the unit's cgroup. The period is adjusted first;
 * an infinite quota is written as "max", any other quota is scaled to the period and raised to at least
 * USEC_PER_MSEC. */
static void cgroup_apply_unified_cpu_quota(Unit *u, usec_t quota, usec_t period) {
        char line[(DECIMAL_STR_MAX(usec_t) + 1) * 2 + 1];

        period = cgroup_cpu_adjust_period_and_log(u, period, quota);

        if (quota == USEC_INFINITY)
                xsprintf(line, "max " USEC_FMT "\n", period);
        else
                xsprintf(line, USEC_FMT " " USEC_FMT "\n",
                         MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC), period);

        (void) set_attribute_and_warn(u, "cpu", "cpu.max", line);
}
|
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
/* Writes 'shares' to the "cpu.shares" attribute of the unit's cgroup ("cpu" controller). */
static void cgroup_apply_legacy_cpu_shares(Unit *u, uint64_t shares) {
        char value[DECIMAL_STR_MAX(uint64_t) + 2];

        xsprintf(value, "%" PRIu64 "\n", shares);

        (void) set_attribute_and_warn(u, "cpu", "cpu.shares", value);
}
|
|
|
|
|
2018-11-02 17:21:57 +01:00
|
|
|
/* Writes the CPU quota to the "cpu.cfs_period_us" and "cpu.cfs_quota_us" attributes of the unit's cgroup.
 * The (adjusted) period is always written first; the quota is written as "-1" when infinite, otherwise
 * scaled to the period and raised to at least USEC_PER_MSEC. */
static void cgroup_apply_legacy_cpu_quota(Unit *u, usec_t quota, usec_t period) {
        char value[DECIMAL_STR_MAX(usec_t) + 2];

        period = cgroup_cpu_adjust_period_and_log(u, period, quota);

        xsprintf(value, USEC_FMT "\n", period);
        (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_period_us", value);

        if (quota == USEC_INFINITY) {
                (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", "-1\n");
                return;
        }

        xsprintf(value, USEC_FMT "\n", MAX(quota * period / USEC_PER_SEC, USEC_PER_MSEC));
        (void) set_attribute_and_warn(u, "cpu", "cpu.cfs_quota_us", value);
}
|
|
|
|
|
|
|
|
static uint64_t cgroup_cpu_shares_to_weight(uint64_t shares) {
|
|
|
|
return CLAMP(shares * CGROUP_WEIGHT_DEFAULT / CGROUP_CPU_SHARES_DEFAULT,
|
|
|
|
CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
|
|
|
|
return CLAMP(weight * CGROUP_CPU_SHARES_DEFAULT / CGROUP_WEIGHT_DEFAULT,
|
|
|
|
CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
|
|
|
|
}
|
|
|
|
|
2019-10-11 11:43:32 +02:00
|
|
|
/* Formats 'cpus' as a range string and writes it to the attribute 'name' of the unit's cgroup ("cpuset"
 * controller). If the string cannot be allocated, the OOM condition is logged and nothing is written. */
static void cgroup_apply_unified_cpuset(Unit *u, const CPUSet *cpus, const char *name) {
        _cleanup_free_ char *ranges = NULL;

        ranges = cpu_set_to_range_string(cpus);
        if (ranges) {
                (void) set_attribute_and_warn(u, "cpuset", name, ranges);
                return;
        }

        log_oom();
}
|
|
|
|
|
2016-05-20 22:46:42 +02:00
|
|
|
/* True if any "io" setting is configured in this context: accounting, a regular or startup weight, or
 * any per-device weight, latency or limit entry. */
static bool cgroup_context_has_io_config(CGroupContext *c) {
        if (c->io_accounting)
                return true;

        if (c->io_weight != CGROUP_WEIGHT_INVALID ||
            c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return true;

        return c->io_device_weights ||
                c->io_device_latencies ||
                c->io_device_limits;
}
|
|
|
|
|
2016-05-20 22:46:42 +02:00
|
|
|
/* True if any "blkio" setting is configured in this context: accounting, a regular or startup weight, or
 * any per-device weight or bandwidth entry. */
static bool cgroup_context_has_blockio_config(CGroupContext *c) {
        if (c->blockio_accounting)
                return true;

        if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID ||
            c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return true;

        return c->blockio_device_weights ||
                c->blockio_device_bandwidths;
}
|
|
|
|
|
2016-05-20 22:46:42 +02:00
|
|
|
/* Picks the IO weight to use in the given manager state: the startup weight while the manager is starting
 * or initializing (provided one is set), else the regular weight, else CGROUP_WEIGHT_DEFAULT. */
static uint64_t cgroup_context_io_weight(CGroupContext *c, ManagerState state) {
        bool booting = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING);

        if (booting && c->startup_io_weight != CGROUP_WEIGHT_INVALID)
                return c->startup_io_weight;

        if (c->io_weight != CGROUP_WEIGHT_INVALID)
                return c->io_weight;

        return CGROUP_WEIGHT_DEFAULT;
}
|
|
|
|
|
2016-05-20 22:46:42 +02:00
|
|
|
/* Picks the blkio weight to use in the given manager state: the startup weight while the manager is
 * starting or initializing (provided one is set), else the regular weight, else
 * CGROUP_BLKIO_WEIGHT_DEFAULT. */
static uint64_t cgroup_context_blkio_weight(CGroupContext *c, ManagerState state) {
        bool booting = IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING);

        if (booting && c->startup_blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->startup_blockio_weight;

        if (c->blockio_weight != CGROUP_BLKIO_WEIGHT_INVALID)
                return c->blockio_weight;

        return CGROUP_BLKIO_WEIGHT_DEFAULT;
}
|
|
|
|
|
2016-05-20 22:46:42 +02:00
|
|
|
static uint64_t cgroup_weight_blkio_to_io(uint64_t blkio_weight) {
|
2016-05-19 02:35:12 +02:00
|
|
|
return CLAMP(blkio_weight * CGROUP_WEIGHT_DEFAULT / CGROUP_BLKIO_WEIGHT_DEFAULT,
|
|
|
|
CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
|
|
|
|
}
|
|
|
|
|
2016-05-20 22:46:42 +02:00
|
|
|
static uint64_t cgroup_weight_io_to_blkio(uint64_t io_weight) {
|
2016-05-19 02:35:12 +02:00
|
|
|
return CLAMP(io_weight * CGROUP_BLKIO_WEIGHT_DEFAULT / CGROUP_WEIGHT_DEFAULT,
|
|
|
|
CGROUP_BLKIO_WEIGHT_MIN, CGROUP_BLKIO_WEIGHT_MAX);
|
|
|
|
}
|
|
|
|
|
2016-06-02 19:02:49 +02:00
|
|
|
/* Writes a per-device weight ("MAJOR:MINOR WEIGHT") to the "io.weight" attribute of the unit's cgroup.
 * Nothing is written if 'dev_path' cannot be resolved to a block device. */
static void cgroup_apply_io_device_weight(Unit *u, const char *dev_path, uint64_t io_weight) {
        char line[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t devno;

        if (lookup_block_device(dev_path, &devno) < 0)
                return;

        xsprintf(line, "%u:%u %" PRIu64 "\n", major(devno), minor(devno), io_weight);

        (void) set_attribute_and_warn(u, "io", "io.weight", line);
}
|
|
|
|
|
2016-06-02 19:02:49 +02:00
|
|
|
/* Writes a per-device weight ("MAJOR:MINOR WEIGHT") to the "blkio.weight_device" attribute of the unit's
 * cgroup. Nothing is written if 'dev_path' cannot be resolved to a block device. */
static void cgroup_apply_blkio_device_weight(Unit *u, const char *dev_path, uint64_t blkio_weight) {
        char line[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t devno;

        if (lookup_block_device(dev_path, &devno) < 0)
                return;

        xsprintf(line, "%u:%u %" PRIu64 "\n", major(devno), minor(devno), blkio_weight);

        (void) set_attribute_and_warn(u, "blkio", "blkio.weight_device", line);
}
|
|
|
|
|
2018-06-13 23:16:35 +02:00
|
|
|
/* Writes a per-device latency target ("MAJOR:MINOR target=VALUE") to the "io.latency" attribute of the
 * unit's cgroup; an infinite target is written as "max". Nothing is written if 'dev_path' cannot be
 * resolved to a block device. */
static void cgroup_apply_io_device_latency(Unit *u, const char *dev_path, usec_t target) {
        char line[DECIMAL_STR_MAX(dev_t)*2+2+7+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t devno;

        if (lookup_block_device(dev_path, &devno) < 0)
                return;

        if (target == USEC_INFINITY)
                xsprintf(line, "%u:%u target=max\n", major(devno), minor(devno));
        else
                xsprintf(line, "%u:%u target=%" PRIu64 "\n", major(devno), minor(devno), target);

        (void) set_attribute_and_warn(u, "io", "io.latency", line);
}
|
|
|
|
|
2018-06-12 19:37:22 +02:00
|
|
|
/* Writes the per-device IO limits ("MAJOR:MINOR rbps=… wbps=… riops=… wiops=…") to the "io.max" attribute
 * of the unit's cgroup. 'limits' is indexed by CGroupIOLimitType. A limit equal to its default is written
 * symbolically ("max" if it is CGROUP_LIMIT_MAX, "0" otherwise), any other limit numerically. Nothing is
 * written if 'dev_path' cannot be resolved to a block device. */
static void cgroup_apply_io_device_limit(Unit *u, const char *dev_path, uint64_t *limits) {
        char formatted[_CGROUP_IO_LIMIT_TYPE_MAX][DECIMAL_STR_MAX(uint64_t)];
        char line[DECIMAL_STR_MAX(dev_t)*2+2+(6+DECIMAL_STR_MAX(uint64_t)+1)*4];
        CGroupIOLimitType t;
        dev_t devno;

        if (lookup_block_device(dev_path, &devno) < 0)
                return;

        for (t = 0; t < _CGROUP_IO_LIMIT_TYPE_MAX; t++) {
                if (limits[t] == cgroup_io_limit_defaults[t]) {
                        xsprintf(formatted[t], "%s", limits[t] == CGROUP_LIMIT_MAX ? "max" : "0");
                        continue;
                }

                xsprintf(formatted[t], "%" PRIu64, limits[t]);
        }

        xsprintf(line, "%u:%u rbps=%s wbps=%s riops=%s wiops=%s\n", major(devno), minor(devno),
                 formatted[CGROUP_IO_RBPS_MAX], formatted[CGROUP_IO_WBPS_MAX],
                 formatted[CGROUP_IO_RIOPS_MAX], formatted[CGROUP_IO_WIOPS_MAX]);

        (void) set_attribute_and_warn(u, "io", "io.max", line);
}
|
|
|
|
|
2018-06-12 19:37:22 +02:00
|
|
|
/* Writes the per-device read and write bandwidth limits ("MAJOR:MINOR BYTES") to the
 * "blkio.throttle.read_bps_device" and "blkio.throttle.write_bps_device" attributes of the unit's cgroup.
 * Nothing is written if 'dev_path' cannot be resolved to a block device. */
static void cgroup_apply_blkio_device_limit(Unit *u, const char *dev_path, uint64_t rbps, uint64_t wbps) {
        char buf[DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1];
        dev_t dev;
        int r;

        r = lookup_block_device(dev_path, &dev);
        if (r < 0)
                return;

        /* Use the bounds-checked xsprintf() like the other attribute formatters in this file, rather than
         * a plain sprintf(). */
        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), rbps);
        (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.read_bps_device", buf);

        xsprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), wbps);
        (void) set_attribute_and_warn(u, "blkio", "blkio.throttle.write_bps_device", buf);
}
|
|
|
|
|
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow
In cgroup v2 we have protection tunables -- currently MemoryLow and
MemoryMin (there will be more in future for other resources, too). The
design of these protection tunables requires not only intermediate
cgroups to propagate protections, but also the units at the leaf of that
resource's operation to accept it (by setting MemoryLow or MemoryMin).
This makes sense from a low-level API design perspective, but it's a
good idea to also have a higher-level abstraction that can, by default,
propagate these resources to children recursively. In this patch, this
happens by having descendants set memory.low to N if their ancestor has
DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow
value.
Any affected unit can opt out of this propagation by manually setting
`MemoryLow` to some value in its unit configuration. A unit can also
stop further propagation by setting `DefaultMemoryLow=` with no
argument. This removes further propagation in the subtree, but has no
effect on the unit itself (for that, use `MemoryLow=0`).
Our use case in production is simplifying the configuration of machines
which heavily rely on memory protection tunables, but currently require
tweaking a huge number of unit files to make that a reality. This
directive makes that significantly less fragile, and decreases the risk
of misconfiguration.
After this patch is merged, I will implement DefaultMemoryMin= using the
same principles.
2019-03-28 13:50:50 +01:00
|
|
|
/* True if the unit has any memory setting for the unified hierarchy in effect: a non-zero memory min or
 * low value as returned by the ancestor lookups, or a memory high, max or swap max limit that differs from
 * CGROUP_LIMIT_MAX. */
static bool unit_has_unified_memory_config(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        assert(c);

        /* Protections first (these go through the ancestor lookups) ... */
        if (unit_get_ancestor_memory_min(u) > 0)
                return true;
        if (unit_get_ancestor_memory_low(u) > 0)
                return true;

        /* ... then the limits configured directly on this unit. */
        if (c->memory_high != CGROUP_LIMIT_MAX)
                return true;
        if (c->memory_max != CGROUP_LIMIT_MAX)
                return true;

        return c->memory_swap_max != CGROUP_LIMIT_MAX;
}
|
|
|
|
|
2016-06-02 19:02:49 +02:00
|
|
|
/* Writes memory limit 'v' to the attribute 'file' of the unit's cgroup ("memory" controller).
 * CGROUP_LIMIT_MAX is written as "max", any other value numerically. */
static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_t v) {
        char value[DECIMAL_STR_MAX(uint64_t) + 1];

        if (v == CGROUP_LIMIT_MAX) {
                (void) set_attribute_and_warn(u, "memory", file, "max\n");
                return;
        }

        xsprintf(value, "%" PRIu64 "\n", v);
        (void) set_attribute_and_warn(u, "memory", file, value);
}
|
|
|
|
|
2017-11-24 19:37:01 +01:00
|
|
|
/* Best-effort application of IP firewalling and/or accounting for the unit, if that's enabled: the
 * firewall is compiled first, and only if that worked are the custom programs loaded and everything
 * installed. Failures of the latter two steps are ignored. */
static void cgroup_apply_firewall(Unit *u) {
        assert(u);

        if (bpf_firewall_compile(u) >= 0) {
                (void) bpf_firewall_load_custom(u);
                (void) bpf_firewall_install(u);
        }
}
|
|
|
|
|
2019-11-08 16:19:43 +01:00
|
|
|
/* Applies the unit's device access policy and allow list to its cgroup.
 *
 * On the fully unified hierarchy a BPF device control program is initialized; otherwise the
 * "devices.allow"/"devices.deny" attributes of the "devices" controller are reset first. Then the static
 * allow list (where the policy calls for it) and every configured device entry are added, and finally the
 * policy is applied via bpf_devices_apply_policy().
 *
 * Returns the result of bpf_devices_apply_policy(), or the error from initializing the BPF program. */
static int cgroup_apply_devices(Unit *u) {
        _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
        const char *path;
        CGroupContext *c;
        CGroupDeviceAllow *entry;
        CGroupDevicePolicy policy;
        int r;

        assert_se(c = unit_get_cgroup_context(u));
        assert_se(path = u->cgroup_path);

        policy = c->device_policy;

        if (cg_all_unified() > 0) {
                r = bpf_devices_cgroup_init(&prog, policy, c->device_allow);
                if (r < 0)
                        return log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
        } else {
                /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore
                 * EINVAL here. */
                bool deny_all = c->device_allow || policy != CGROUP_DEVICE_POLICY_AUTO;

                r = cg_set_attribute("devices", path, deny_all ? "devices.deny" : "devices.allow", "a");
                if (r < 0)
                        log_unit_full_errno(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
                                            "Failed to reset devices.allow/devices.deny: %m");
        }

        bool allow_list_static = policy == CGROUP_DEVICE_POLICY_CLOSED ||
                (policy == CGROUP_DEVICE_POLICY_AUTO && c->device_allow);
        if (allow_list_static)
                (void) bpf_devices_allow_list_static(prog, path);

        /* Tracks whether at least one allow-list rule was added successfully */
        bool matched = allow_list_static;

        LIST_FOREACH(device_allow, entry, c->device_allow) {
                char access[4], *val;
                unsigned n = 0;

                /* Build the access string from the r/w/m flags; entries granting nothing are skipped */
                if (entry->r)
                        access[n++] = 'r';
                if (entry->w)
                        access[n++] = 'w';
                if (entry->m)
                        access[n++] = 'm';
                if (n == 0)
                        continue;
                access[n] = 0;

                if (path_startswith(entry->path, "/dev/"))
                        r = bpf_devices_allow_list_device(prog, path, entry->path, access);
                else if ((val = startswith(entry->path, "block-")))
                        r = bpf_devices_allow_list_major(prog, path, val, 'b', access);
                else if ((val = startswith(entry->path, "char-")))
                        r = bpf_devices_allow_list_major(prog, path, val, 'c', access);
                else {
                        log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", entry->path);
                        continue;
                }

                if (r >= 0)
                        matched = true;
        }

        if (prog && !matched) {
                log_unit_warning_errno(u, SYNTHETIC_ERRNO(ENODEV), "No devices matched by device filter.");

                /* The kernel verifier would reject a program we would build with the normal intro and outro
                   but no allow-listing rules (outro would contain an unreachable instruction for successful
                   return). */
                policy = CGROUP_DEVICE_POLICY_STRICT;
        }

        r = bpf_devices_apply_policy(prog, policy, matched, path, &u->bpf_device_control_installed);
        if (r < 0) {
                /* Shared across all units: only the first failure is logged at warning level */
                static bool warned = false;

                log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
                               "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
                               "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
                               "(This warning is only shown for the first loaded unit using device ACL.)", u->id);

                warned = true;
        }

        return r;
}
|
|
|
|
|
2017-09-05 19:27:53 +02:00
|
|
|
static void cgroup_context_apply(
|
|
|
|
Unit *u,
|
|
|
|
CGroupMask apply_mask,
|
|
|
|
ManagerState state) {
|
|
|
|
|
2016-06-02 19:02:49 +02:00
|
|
|
const char *path;
|
|
|
|
CGroupContext *c;
|
2018-11-20 23:05:12 +01:00
|
|
|
bool is_host_root, is_local_root;
|
2013-06-27 04:14:27 +02:00
|
|
|
int r;
|
|
|
|
|
2016-06-02 19:02:49 +02:00
|
|
|
assert(u);
|
|
|
|
|
2017-09-05 19:27:53 +02:00
|
|
|
/* Nothing to do? Exit early! */
|
2018-09-30 12:33:16 +02:00
|
|
|
if (apply_mask == 0)
|
2013-06-27 04:14:27 +02:00
|
|
|
return;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
/* Some cgroup attributes are not supported on the host root cgroup, hence silently ignore them here. And other
|
|
|
|
* attributes should only be managed for cgroups further down the tree. */
|
|
|
|
is_local_root = unit_has_name(u, SPECIAL_ROOT_SLICE);
|
|
|
|
is_host_root = unit_has_host_root_cgroup(u);
|
2018-01-17 18:41:42 +01:00
|
|
|
|
|
|
|
assert_se(c = unit_get_cgroup_context(u));
|
|
|
|
assert_se(path = u->cgroup_path);
|
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
if (is_local_root) /* Make sure we don't try to display messages with an empty path. */
|
2015-01-06 01:03:23 +01:00
|
|
|
path = "/";
|
2014-02-24 03:38:24 +01:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
/* We generally ignore errors caused by read-only mounted cgroup trees (assuming we are running in a container
|
|
|
|
* then), and missing cgroups, i.e. EROFS and ENOENT. */
|
2014-12-30 01:56:42 +01:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
/* In fully unified mode these attributes don't exist on the host cgroup root. On legacy the weights exist, but
|
|
|
|
* setting the weight makes very little sense on the host root cgroup, as there are no other cgroups at this
|
|
|
|
* level. The quota exists there too, but any attempt to write to it is refused with EINVAL. Inside of
|
2019-01-02 21:15:15 +01:00
|
|
|
* containers we want to leave control of these to the container manager (and if cgroup v2 delegation is used
|
2018-11-30 18:45:22 +01:00
|
|
|
* we couldn't even write to them if we wanted to). */
|
|
|
|
if ((apply_mask & CGROUP_MASK_CPU) && !is_local_root) {
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2017-02-24 17:52:58 +01:00
|
|
|
if (cg_all_unified() > 0) {
|
2018-11-30 18:45:22 +01:00
|
|
|
uint64_t weight;
|
2014-04-25 13:27:25 +02:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
if (cgroup_context_has_cpu_weight(c))
|
|
|
|
weight = cgroup_context_cpu_weight(c, state);
|
|
|
|
else if (cgroup_context_has_cpu_shares(c)) {
|
|
|
|
uint64_t shares;
|
2016-08-07 15:45:39 +02:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
shares = cgroup_context_cpu_shares(c, state);
|
|
|
|
weight = cgroup_cpu_shares_to_weight(shares);
|
2016-08-07 15:45:39 +02:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
log_cgroup_compat(u, "Applying [Startup]CPUShares=%" PRIu64 " as [Startup]CPUWeight=%" PRIu64 " on %s",
|
|
|
|
shares, weight, path);
|
|
|
|
} else
|
|
|
|
weight = CGROUP_WEIGHT_DEFAULT;
|
2016-08-07 15:45:39 +02:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
cgroup_apply_unified_cpu_weight(u, weight);
|
2018-11-02 17:21:57 +01:00
|
|
|
cgroup_apply_unified_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
|
2016-08-07 15:45:39 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
} else {
|
2018-11-30 18:45:22 +01:00
|
|
|
uint64_t shares;
|
2018-11-20 23:05:12 +01:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
if (cgroup_context_has_cpu_weight(c)) {
|
|
|
|
uint64_t weight;
|
2018-11-20 23:05:12 +01:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
weight = cgroup_context_cpu_weight(c, state);
|
|
|
|
shares = cgroup_cpu_weight_to_shares(weight);
|
2018-11-20 23:05:12 +01:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
log_cgroup_compat(u, "Applying [Startup]CPUWeight=%" PRIu64 " as [Startup]CPUShares=%" PRIu64 " on %s",
|
|
|
|
weight, shares, path);
|
|
|
|
} else if (cgroup_context_has_cpu_shares(c))
|
|
|
|
shares = cgroup_context_cpu_shares(c, state);
|
|
|
|
else
|
|
|
|
shares = CGROUP_CPU_SHARES_DEFAULT;
|
2016-08-07 15:45:39 +02:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
cgroup_apply_legacy_cpu_shares(u, shares);
|
2018-11-02 17:21:57 +01:00
|
|
|
cgroup_apply_legacy_cpu_quota(u, c->cpu_quota_per_sec_usec, c->cpu_quota_period_usec);
|
2016-08-07 15:45:39 +02:00
|
|
|
}
|
2013-06-27 04:14:27 +02:00
|
|
|
}
|
|
|
|
|
2019-07-29 17:50:05 +02:00
|
|
|
if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) {
|
2019-10-11 11:43:32 +02:00
|
|
|
cgroup_apply_unified_cpuset(u, &c->cpuset_cpus, "cpuset.cpus");
|
|
|
|
cgroup_apply_unified_cpuset(u, &c->cpuset_mems, "cpuset.mems");
|
2019-07-29 17:50:05 +02:00
|
|
|
}
|
|
|
|
|
2019-01-02 21:15:15 +01:00
|
|
|
/* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
|
2018-11-20 23:05:12 +01:00
|
|
|
* controller), and in case of containers we want to leave control of these attributes to the container manager
|
|
|
|
* (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */
|
|
|
|
if ((apply_mask & CGROUP_MASK_IO) && !is_local_root) {
|
|
|
|
char buf[8+DECIMAL_STR_MAX(uint64_t)+1];
|
|
|
|
bool has_io, has_blockio;
|
|
|
|
uint64_t weight;
|
2016-05-05 22:42:55 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
has_io = cgroup_context_has_io_config(c);
|
|
|
|
has_blockio = cgroup_context_has_blockio_config(c);
|
2016-05-05 22:42:55 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
if (has_io)
|
|
|
|
weight = cgroup_context_io_weight(c, state);
|
|
|
|
else if (has_blockio) {
|
|
|
|
uint64_t blkio_weight;
|
2016-06-02 19:02:53 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
blkio_weight = cgroup_context_blkio_weight(c, state);
|
|
|
|
weight = cgroup_weight_blkio_to_io(blkio_weight);
|
2016-06-02 19:02:53 +02:00
|
|
|
|
2018-11-30 18:46:42 +01:00
|
|
|
log_cgroup_compat(u, "Applying [Startup]BlockIOWeight=%" PRIu64 " as [Startup]IOWeight=%" PRIu64,
|
2018-11-20 23:05:12 +01:00
|
|
|
blkio_weight, weight);
|
|
|
|
} else
|
|
|
|
weight = CGROUP_WEIGHT_DEFAULT;
|
2016-05-05 22:42:55 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
xsprintf(buf, "default %" PRIu64 "\n", weight);
|
|
|
|
(void) set_attribute_and_warn(u, "io", "io.weight", buf);
|
2016-05-19 02:35:12 +02:00
|
|
|
|
2019-08-17 02:33:43 +02:00
|
|
|
/* FIXME: drop this when distro kernels properly support BFQ through "io.weight"
|
|
|
|
* See also: https://github.com/systemd/systemd/pull/13335 */
|
|
|
|
xsprintf(buf, "%" PRIu64 "\n", weight);
|
|
|
|
(void) set_attribute_and_warn(u, "io", "io.bfq.weight", buf);
|
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
if (has_io) {
|
|
|
|
CGroupIODeviceLatency *latency;
|
|
|
|
CGroupIODeviceLimit *limit;
|
|
|
|
CGroupIODeviceWeight *w;
|
2016-06-02 19:02:53 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
LIST_FOREACH(device_weights, w, c->io_device_weights)
|
|
|
|
cgroup_apply_io_device_weight(u, w->path, w->weight);
|
2016-06-02 19:02:53 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
LIST_FOREACH(device_limits, limit, c->io_device_limits)
|
|
|
|
cgroup_apply_io_device_limit(u, limit->path, limit->limits);
|
2018-06-13 23:16:35 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
LIST_FOREACH(device_latencies, latency, c->io_device_latencies)
|
|
|
|
cgroup_apply_io_device_latency(u, latency->path, latency->target_usec);
|
2018-06-13 23:16:35 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
} else if (has_blockio) {
|
|
|
|
CGroupBlockIODeviceWeight *w;
|
|
|
|
CGroupBlockIODeviceBandwidth *b;
|
2016-05-05 22:42:55 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
|
|
|
|
weight = cgroup_weight_blkio_to_io(w->weight);
|
2018-06-12 19:37:22 +02:00
|
|
|
|
2018-11-30 18:46:42 +01:00
|
|
|
log_cgroup_compat(u, "Applying BlockIODeviceWeight=%" PRIu64 " as IODeviceWeight=%" PRIu64 " for %s",
|
2018-11-20 23:05:12 +01:00
|
|
|
w->weight, weight, w->path);
|
2016-05-19 02:35:12 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
cgroup_apply_io_device_weight(u, w->path, weight);
|
|
|
|
}
|
2016-05-19 02:35:12 +02:00
|
|
|
|
2018-06-12 19:37:22 +02:00
|
|
|
LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
|
2016-05-19 02:35:12 +02:00
|
|
|
uint64_t limits[_CGROUP_IO_LIMIT_TYPE_MAX];
|
|
|
|
CGroupIOLimitType type;
|
|
|
|
|
|
|
|
for (type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
|
|
|
|
limits[type] = cgroup_io_limit_defaults[type];
|
|
|
|
|
|
|
|
limits[CGROUP_IO_RBPS_MAX] = b->rbps;
|
|
|
|
limits[CGROUP_IO_WBPS_MAX] = b->wbps;
|
|
|
|
|
2018-11-30 18:46:42 +01:00
|
|
|
log_cgroup_compat(u, "Applying BlockIO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as IO{Read|Write}BandwidthMax= for %s",
|
2016-06-02 19:02:53 +02:00
|
|
|
b->rbps, b->wbps, b->path);
|
|
|
|
|
2018-06-12 19:37:22 +02:00
|
|
|
cgroup_apply_io_device_limit(u, b->path, limits);
|
2016-05-19 02:35:12 +02:00
|
|
|
}
|
2016-05-05 22:42:55 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-09-05 19:27:53 +02:00
|
|
|
if (apply_mask & CGROUP_MASK_BLKIO) {
|
2018-11-20 23:05:12 +01:00
|
|
|
bool has_io, has_blockio;
|
2013-06-27 04:14:27 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
has_io = cgroup_context_has_io_config(c);
|
|
|
|
has_blockio = cgroup_context_has_blockio_config(c);
|
|
|
|
|
|
|
|
/* Applying a 'weight' never makes sense for the host root cgroup, and for containers this should be
|
|
|
|
* left to our container manager, too. */
|
|
|
|
if (!is_local_root) {
|
2016-05-18 22:51:46 +02:00
|
|
|
char buf[DECIMAL_STR_MAX(uint64_t)+1];
|
|
|
|
uint64_t weight;
|
|
|
|
|
2016-10-15 03:07:16 +02:00
|
|
|
if (has_io) {
|
2018-11-20 23:05:12 +01:00
|
|
|
uint64_t io_weight;
|
2016-06-02 19:02:53 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
io_weight = cgroup_context_io_weight(c, state);
|
2016-05-19 02:35:12 +02:00
|
|
|
weight = cgroup_weight_io_to_blkio(cgroup_context_io_weight(c, state));
|
2016-06-02 19:02:53 +02:00
|
|
|
|
2018-11-30 18:46:42 +01:00
|
|
|
log_cgroup_compat(u, "Applying [Startup]IOWeight=%" PRIu64 " as [Startup]BlockIOWeight=%" PRIu64,
|
2016-06-02 19:02:53 +02:00
|
|
|
io_weight, weight);
|
2016-10-15 03:07:16 +02:00
|
|
|
} else if (has_blockio)
|
|
|
|
weight = cgroup_context_blkio_weight(c, state);
|
|
|
|
else
|
2016-05-19 02:35:12 +02:00
|
|
|
weight = CGROUP_BLKIO_WEIGHT_DEFAULT;
|
2016-05-18 22:51:46 +02:00
|
|
|
|
|
|
|
xsprintf(buf, "%" PRIu64 "\n", weight);
|
2018-11-20 20:19:58 +01:00
|
|
|
(void) set_attribute_and_warn(u, "blkio", "blkio.weight", buf);
|
2013-06-27 04:14:27 +02:00
|
|
|
|
2020-11-25 09:05:36 +01:00
|
|
|
/* FIXME: drop this when distro kernels properly support BFQ through "blkio.weight"
|
|
|
|
* See also: https://github.com/systemd/systemd/pull/13335 */
|
|
|
|
xsprintf(buf, "%" PRIu64 "\n", weight);
|
|
|
|
(void) set_attribute_and_warn(u, "blkio", "blkio.bfq.weight", buf);
|
|
|
|
|
2016-10-15 03:07:16 +02:00
|
|
|
if (has_io) {
|
2016-05-19 02:35:12 +02:00
|
|
|
CGroupIODeviceWeight *w;
|
|
|
|
|
2016-06-02 19:02:53 +02:00
|
|
|
LIST_FOREACH(device_weights, w, c->io_device_weights) {
|
|
|
|
weight = cgroup_weight_io_to_blkio(w->weight);
|
|
|
|
|
2018-11-30 18:46:42 +01:00
|
|
|
log_cgroup_compat(u, "Applying IODeviceWeight=%" PRIu64 " as BlockIODeviceWeight=%" PRIu64 " for %s",
|
2016-06-02 19:02:53 +02:00
|
|
|
w->weight, weight, w->path);
|
|
|
|
|
|
|
|
cgroup_apply_blkio_device_weight(u, w->path, weight);
|
|
|
|
}
|
2016-10-15 03:07:16 +02:00
|
|
|
} else if (has_blockio) {
|
|
|
|
CGroupBlockIODeviceWeight *w;
|
|
|
|
|
|
|
|
LIST_FOREACH(device_weights, w, c->blockio_device_weights)
|
|
|
|
cgroup_apply_blkio_device_weight(u, w->path, w->weight);
|
2016-05-19 02:35:12 +02:00
|
|
|
}
|
2013-06-27 04:14:27 +02:00
|
|
|
}
|
|
|
|
|
2019-04-27 02:22:40 +02:00
|
|
|
/* The bandwidth limits are something that make sense to be applied to the host's root but not container
|
2018-11-20 23:05:12 +01:00
|
|
|
* roots, as there we want the container manager to handle it */
|
|
|
|
if (is_host_root || !is_local_root) {
|
|
|
|
if (has_io) {
|
|
|
|
CGroupIODeviceLimit *l;
|
2016-05-19 02:35:12 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
LIST_FOREACH(device_limits, l, c->io_device_limits) {
|
2018-11-30 18:46:42 +01:00
|
|
|
log_cgroup_compat(u, "Applying IO{Read|Write}Bandwidth=%" PRIu64 " %" PRIu64 " as BlockIO{Read|Write}BandwidthMax= for %s",
|
2018-11-20 23:05:12 +01:00
|
|
|
l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX], l->path);
|
2016-06-02 19:02:53 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
cgroup_apply_blkio_device_limit(u, l->path, l->limits[CGROUP_IO_RBPS_MAX], l->limits[CGROUP_IO_WBPS_MAX]);
|
|
|
|
}
|
|
|
|
} else if (has_blockio) {
|
|
|
|
CGroupBlockIODeviceBandwidth *b;
|
2016-10-15 03:07:16 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths)
|
|
|
|
cgroup_apply_blkio_device_limit(u, b->path, b->rbps, b->wbps);
|
|
|
|
}
|
2010-11-17 21:27:53 +01:00
|
|
|
}
|
2010-03-31 16:29:55 +02:00
|
|
|
}
|
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
/* In unified mode 'memory' attributes do not exist on the root cgroup. In legacy mode 'memory.limit_in_bytes'
|
|
|
|
* exists on the root cgroup, but any writes to it are refused with EINVAL. And if we run in a container we
|
2019-01-02 21:15:15 +01:00
|
|
|
* want to leave control to the container manager (and if proper cgroup v2 delegation is used we couldn't even
|
2018-11-30 18:45:22 +01:00
|
|
|
* write to this if we wanted to.) */
|
|
|
|
if ((apply_mask & CGROUP_MASK_MEMORY) && !is_local_root) {
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
if (cg_all_unified() > 0) {
|
2018-11-30 18:45:22 +01:00
|
|
|
uint64_t max, swap_max = CGROUP_LIMIT_MAX;
|
|
|
|
|
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow
In cgroup v2 we have protection tunables -- currently MemoryLow and
MemoryMin (there will be more in future for other resources, too). The
design of these protection tunables requires not only intermediate
cgroups to propagate protections, but also the units at the leaf of that
resource's operation to accept it (by setting MemoryLow or MemoryMin).
This makes sense from a low-level API design perspective, but it's a
good idea to also have a higher-level abstraction that can, by default,
propagate these resources to children recursively. In this patch, this
happens by having descendants set memory.low to N if their ancestor has
DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow
value.
Any affected unit can opt out of this propagation by manually setting
`MemoryLow` to some value in its unit configuration. A unit can also
stop further propagation by setting `DefaultMemoryLow=` with no
argument. This removes further propagation in the subtree, but has no
effect on the unit itself (for that, use `MemoryLow=0`).
Our use case in production is simplifying the configuration of machines
which heavily rely on memory protection tunables, but currently require
tweaking a huge number of unit files to make that a reality. This
directive makes that significantly less fragile, and decreases the risk
of misconfiguration.
After this patch is merged, I will implement DefaultMemoryMin= using the
same principles.
2019-03-28 13:50:50 +01:00
|
|
|
if (unit_has_unified_memory_config(u)) {
|
2018-11-30 18:45:22 +01:00
|
|
|
max = c->memory_max;
|
|
|
|
swap_max = c->memory_swap_max;
|
|
|
|
} else {
|
|
|
|
max = c->memory_limit;
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
if (max != CGROUP_LIMIT_MAX)
|
|
|
|
log_cgroup_compat(u, "Applying MemoryLimit=%" PRIu64 " as MemoryMax=", max);
|
2016-06-02 19:02:53 +02:00
|
|
|
}
|
2016-05-27 18:10:18 +02:00
|
|
|
|
2019-09-30 19:25:09 +02:00
|
|
|
cgroup_apply_unified_memory_limit(u, "memory.min", unit_get_ancestor_memory_min(u));
|
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow
In cgroup v2 we have protection tunables -- currently MemoryLow and
MemoryMin (there will be more in future for other resources, too). The
design of these protection tunables requires not only intermediate
cgroups to propagate protections, but also the units at the leaf of that
resource's operation to accept it (by setting MemoryLow or MemoryMin).
This makes sense from a low-level API design perspective, but it's a
good idea to also have a higher-level abstraction that can, by default,
propagate these resources to children recursively. In this patch, this
happens by having descendants set memory.low to N if their ancestor has
DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow
value.
Any affected unit can opt out of this propagation by manually setting
`MemoryLow` to some value in its unit configuration. A unit can also
stop further propagation by setting `DefaultMemoryLow=` with no
argument. This removes further propagation in the subtree, but has no
effect on the unit itself (for that, use `MemoryLow=0`).
Our use case in production is simplifying the configuration of machines
which heavily rely on memory protection tunables, but currently require
tweaking a huge number of unit files to make that a reality. This
directive makes that significantly less fragile, and decreases the risk
of misconfiguration.
After this patch is merged, I will implement DefaultMemoryMin= using the
same principles.
2019-03-28 13:50:50 +01:00
|
|
|
cgroup_apply_unified_memory_limit(u, "memory.low", unit_get_ancestor_memory_low(u));
|
2018-11-30 18:45:22 +01:00
|
|
|
cgroup_apply_unified_memory_limit(u, "memory.high", c->memory_high);
|
|
|
|
cgroup_apply_unified_memory_limit(u, "memory.max", max);
|
|
|
|
cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
|
2016-06-02 19:02:53 +02:00
|
|
|
|
2019-03-19 19:05:19 +01:00
|
|
|
(void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
|
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
} else {
|
|
|
|
char buf[DECIMAL_STR_MAX(uint64_t) + 1];
|
|
|
|
uint64_t val;
|
2018-11-20 23:05:12 +01:00
|
|
|
|
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow
In cgroup v2 we have protection tunables -- currently MemoryLow and
MemoryMin (there will be more in future for other resources, too). The
design of these protection tunables requires not only intermediate
cgroups to propagate protections, but also the units at the leaf of that
resource's operation to accept it (by setting MemoryLow or MemoryMin).
This makes sense from a low-level API design perspective, but it's a
good idea to also have a higher-level abstraction that can, by default,
propagate these resources to children recursively. In this patch, this
happens by having descendants set memory.low to N if their ancestor has
DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow
value.
Any affected unit can opt out of this propagation by manually setting
`MemoryLow` to some value in its unit configuration. A unit can also
stop further propagation by setting `DefaultMemoryLow=` with no
argument. This removes further propagation in the subtree, but has no
effect on the unit itself (for that, use `MemoryLow=0`).
Our use case in production is simplifying the configuration of machines
which heavily rely on memory protection tunables, but currently require
tweaking a huge number of unit files to make that a reality. This
directive makes that significantly less fragile, and decreases the risk
of misconfiguration.
After this patch is merged, I will implement DefaultMemoryMin= using the
same principles.
2019-03-28 13:50:50 +01:00
|
|
|
if (unit_has_unified_memory_config(u)) {
|
2018-11-30 18:45:22 +01:00
|
|
|
val = c->memory_max;
|
|
|
|
log_cgroup_compat(u, "Applying MemoryMax=%" PRIi64 " as MemoryLimit=", val);
|
|
|
|
} else
|
|
|
|
val = c->memory_limit;
|
cgroup: fix memory cgroup limit regression on kernel 3.10 (#3673)
Commit da4d897e ("core: add cgroup memory controller support on the unified
hierarchy (#3315)") changed the code in src/core/cgroup.c to always write
the real numeric value from the cgroup parameters to the
"memory.limit_in_bytes" attribute file.
For parameters set to CGROUP_LIMIT_MAX, this results in the string
"18446744073709551615" being written into that file, which is UINT64_MAX.
Before that commit, CGROUP_LIMIT_MAX was special-cased to the string "-1".
This causes a regression on CentOS 7, which is based on kernel 3.10, as the
value is interpreted as *signed* 64 bit, and clamped to 0:
[root@n54 ~]# echo 18446744073709551615 >/sys/fs/cgroup/memory/user.slice/memory.limit_in_bytes
[root@n54 ~]# cat /sys/fs/cgroup/memory/user.slice/memory.limit_in_bytes
0
[root@n54 ~]# echo -1 >/sys/fs/cgroup/memory/user.slice/memory.limit_in_bytes
[root@n54 ~]# cat /sys/fs/cgroup/memory/user.slice/memory.limit_in_bytes
9223372036854775807
Hence, all units that are subject to the limits enforced by the memory
controller will crash immediately, even though they have no actual limit
set. This happens for the user.slice, for instance:
[ 453.577153] Hardware name: SeaMicro SM15000-64-CC-AA-1Ox1/AMD Server CRB, BIOS Estoc.3.72.19.0018 08/19/2014
[ 453.587024] ffff880810c56780 00000000aae9501f ffff880813d7fcd0 ffffffff816360fc
[ 453.594544] ffff880813d7fd60 ffffffff8163109c ffff88080ffc5000 ffff880813d7fd28
[ 453.602120] ffffffff00000202 fffeefff00000000 0000000000000001 ffff880810c56c03
[ 453.609680] Call Trace:
[ 453.612156] [<ffffffff816360fc>] dump_stack+0x19/0x1b
[ 453.617324] [<ffffffff8163109c>] dump_header+0x8e/0x214
[ 453.622671] [<ffffffff8116d20e>] oom_kill_process+0x24e/0x3b0
[ 453.628559] [<ffffffff81088dae>] ? has_capability_noaudit+0x1e/0x30
[ 453.634969] [<ffffffff811d4155>] mem_cgroup_oom_synchronize+0x575/0x5a0
[ 453.641721] [<ffffffff811d3520>] ? mem_cgroup_charge_common+0xc0/0xc0
[ 453.648299] [<ffffffff8116da84>] pagefault_out_of_memory+0x14/0x90
[ 453.654621] [<ffffffff8162f4cc>] mm_fault_error+0x68/0x12b
[ 453.660233] [<ffffffff81642012>] __do_page_fault+0x3e2/0x450
[ 453.666017] [<ffffffff816420a3>] do_page_fault+0x23/0x80
[ 453.671467] [<ffffffff8163e308>] page_fault+0x28/0x30
[ 453.676656] Task in /user.slice/user-0.slice/user@0.service killed as a result of limit of /user.slice/user-0.slice/user@0.service
[ 453.688477] memory: usage 0kB, limit 0kB, failcnt 7
[ 453.693391] memory+swap: usage 0kB, limit 9007199254740991kB, failcnt 0
[ 453.700039] kmem: usage 0kB, limit 9007199254740991kB, failcnt 0
[ 453.706076] Memory cgroup stats for /user.slice/user-0.slice/user@0.service: cache:0KB rss:0KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB
[ 453.725702] [ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name
[ 453.733614] [ 2837] 0 2837 11950 899 23 0 0 (systemd)
[ 453.741919] Memory cgroup out of memory: Kill process 2837 ((systemd)) score 1 or sacrifice child
[ 453.750831] Killed process 2837 ((systemd)) total-vm:47800kB, anon-rss:3188kB, file-rss:408kB
Fix this issue by special-casing the UINT64_MAX case again.
2016-07-08 04:29:35 +02:00
|
|
|
|
2018-11-30 18:45:22 +01:00
|
|
|
if (val == CGROUP_LIMIT_MAX)
|
|
|
|
strncpy(buf, "-1\n", sizeof(buf));
|
|
|
|
else
|
|
|
|
xsprintf(buf, "%" PRIu64 "\n", val);
|
|
|
|
|
|
|
|
(void) set_attribute_and_warn(u, "memory", "memory.limit_in_bytes", buf);
|
2016-05-27 18:10:18 +02:00
|
|
|
}
|
2013-06-27 04:14:27 +02:00
|
|
|
}
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2019-01-02 21:15:15 +01:00
|
|
|
/* On cgroup v2 we can apply BPF everywhere. On cgroup v1 we apply it everywhere except for the root of
|
2018-11-20 23:05:12 +01:00
|
|
|
* containers, where we leave this to the manager */
|
|
|
|
if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) &&
|
2019-11-08 16:19:43 +01:00
|
|
|
(is_host_root || cg_all_unified() > 0 || !is_local_root))
|
|
|
|
(void) cgroup_apply_devices(u);
|
2015-09-10 12:32:16 +02:00
|
|
|
|
2018-01-17 18:50:27 +01:00
|
|
|
if (apply_mask & CGROUP_MASK_PIDS) {
|
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
if (is_host_root) {
|
2018-01-17 18:50:27 +01:00
|
|
|
/* So, the "pids" controller does not expose anything on the root cgroup, in order not to
|
|
|
|
* replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when
|
|
|
|
* the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a
|
|
|
|
* non-obvious asymmetry however: unlike the cgroup properties we don't really want to take
|
|
|
|
* exclusive ownership of the sysctls, but we still want to honour things if the user sets
|
|
|
|
* limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit
|
|
|
|
* through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded)
|
|
|
|
* it also counts. But if the user never set a limit through us (i.e. we are the default of
|
|
|
|
* "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on
|
|
|
|
* the first time we set a limit. Note that this boolean is flushed out on manager reload,
|
2019-04-27 02:22:40 +02:00
|
|
|
* which is desirable so that there's an official way to release control of the sysctl from
|
2018-01-17 18:50:27 +01:00
|
|
|
* systemd: set the limit to unbounded and reload. */
|
|
|
|
|
2019-11-05 13:50:28 +01:00
|
|
|
if (tasks_max_isset(&c->tasks_max)) {
|
2018-01-17 18:50:27 +01:00
|
|
|
u->manager->sysctl_pid_max_changed = true;
|
2019-11-05 13:50:28 +01:00
|
|
|
r = procfs_tasks_set_limit(tasks_max_resolve(&c->tasks_max));
|
2018-01-17 18:50:27 +01:00
|
|
|
} else if (u->manager->sysctl_pid_max_changed)
|
|
|
|
r = procfs_tasks_set_limit(TASKS_MAX);
|
|
|
|
else
|
|
|
|
r = 0;
|
|
|
|
if (r < 0)
|
2020-09-08 19:28:36 +02:00
|
|
|
log_unit_full_errno(u, LOG_LEVEL_CGROUP_WRITE(r), r,
|
|
|
|
"Failed to write to tasks limit sysctls: %m");
|
2018-11-20 23:05:12 +01:00
|
|
|
}
|
2015-09-10 12:32:16 +02:00
|
|
|
|
2018-11-20 23:05:12 +01:00
|
|
|
/* The attribute itself is not available on the host root cgroup, and in the container case we want to
|
|
|
|
* leave it for the container manager. */
|
|
|
|
if (!is_local_root) {
|
2019-11-05 13:50:28 +01:00
|
|
|
if (tasks_max_isset(&c->tasks_max)) {
|
|
|
|
char buf[DECIMAL_STR_MAX(uint64_t) + 1];
|
2015-09-10 12:32:16 +02:00
|
|
|
|
2019-11-05 13:50:28 +01:00
|
|
|
xsprintf(buf, "%" PRIu64 "\n", tasks_max_resolve(&c->tasks_max));
|
2018-11-20 20:19:58 +01:00
|
|
|
(void) set_attribute_and_warn(u, "pids", "pids.max", buf);
|
2018-01-17 18:50:27 +01:00
|
|
|
} else
|
2018-11-20 22:50:13 +01:00
|
|
|
(void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
|
2018-01-17 18:50:27 +01:00
|
|
|
}
|
2015-09-10 12:32:16 +02:00
|
|
|
}
|
2017-09-05 19:27:53 +02:00
|
|
|
|
2018-09-30 12:33:16 +02:00
|
|
|
if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
|
2017-11-24 19:37:01 +01:00
|
|
|
cgroup_apply_firewall(u);
|
2010-07-10 17:34:42 +02:00
|
|
|
}
|
|
|
|
|
2018-11-21 17:44:14 +01:00
|
|
|
/* Reports whether the unit needs the BPF firewall. This is the case when the unit's own cgroup context
 * has ip_accounting, ip_address_allow, ip_address_deny, ip_filters_ingress or ip_filters_egress set, or
 * when any slice further up the chain has ip_address_allow or ip_address_deny set. Returns false if the
 * unit, or any slice reached while walking upwards, has no cgroup context. */
static bool unit_get_needs_bpf_firewall(Unit *u) {
        CGroupContext *ctx;
        Unit *parent;

        assert(u);

        ctx = unit_get_cgroup_context(u);
        if (!ctx)
                return false;

        /* Check the settings configured on the unit itself first */
        if (ctx->ip_accounting)
                return true;
        if (ctx->ip_address_allow || ctx->ip_address_deny)
                return true;
        if (ctx->ip_filters_ingress || ctx->ip_filters_egress)
                return true;

        /* An IP access list defined on any enclosing slice applies too, hence walk up the slice chain */
        parent = UNIT_DEREF(u->slice);
        while (parent) {
                ctx = unit_get_cgroup_context(parent);
                if (!ctx)
                        return false;

                if (ctx->ip_address_allow || ctx->ip_address_deny)
                        return true;

                parent = UNIT_DEREF(parent->slice);
        }

        return false;
}
|
|
|
|
|
cgroup: Implement default propagation of MemoryLow with DefaultMemoryLow
In cgroup v2 we have protection tunables -- currently MemoryLow and
MemoryMin (there will be more in future for other resources, too). The
design of these protection tunables requires not only intermediate
cgroups to propagate protections, but also the units at the leaf of that
resource's operation to accept it (by setting MemoryLow or MemoryMin).
This makes sense from a low-level API design perspective, but it's a
good idea to also have a higher-level abstraction that can, by default,
propagate these resources to children recursively. In this patch, this
happens by having descendants set memory.low to N if their ancestor has
DefaultMemoryLow=N -- assuming they don't set a separate MemoryLow
value.
Any affected unit can opt out of this propagation by manually setting
`MemoryLow` to some value in its unit configuration. A unit can also
stop further propagation by setting `DefaultMemoryLow=` with no
argument. This removes further propagation in the subtree, but has no
effect on the unit itself (for that, use `MemoryLow=0`).
Our use case in production is simplifying the configuration of machines
which heavily rely on memory protection tunables, but currently require
tweaking a huge number of unit files to make that a reality. This
directive makes that significantly less fragile, and decreases the risk
of misconfiguration.
After this patch is merged, I will implement DefaultMemoryMin= using the
same principles.
2019-03-28 13:50:50 +01:00
|
|
|
static CGroupMask unit_get_cgroup_mask(Unit *u) {
        CGroupContext *ctx;
        CGroupMask needed = 0;

        /* Translates the settings stored in the unit's cgroup context into the mask of controllers those
         * settings call for. The unit and its cgroup context must both exist; this is asserted. */

        assert(u);

        ctx = unit_get_cgroup_context(u);
        assert(ctx);

        /* CPU accounting: the bits to request come from get_cpu_accounting_mask() */
        if (ctx->cpu_accounting)
                needed |= get_cpu_accounting_mask();

        /* CPU weight, CPU shares, or a per-second quota other than USEC_INFINITY */
        if (cgroup_context_has_cpu_weight(ctx) || cgroup_context_has_cpu_shares(ctx) || ctx->cpu_quota_per_sec_usec != USEC_INFINITY)
                needed |= CGROUP_MASK_CPU;

        /* Either cpuset field is marked as set */
        if (ctx->cpuset_cpus.set || ctx->cpuset_mems.set)
                needed |= CGROUP_MASK_CPUSET;

        /* IO and blockio configuration each request both the IO and the BLKIO bit */
        if (cgroup_context_has_io_config(ctx) || cgroup_context_has_blockio_config(ctx))
                needed |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;

        /* Memory accounting, a memory limit other than CGROUP_LIMIT_MAX, or unified memory configuration */
        if (ctx->memory_accounting || ctx->memory_limit != CGROUP_LIMIT_MAX || unit_has_unified_memory_config(u))
                needed |= CGROUP_MASK_MEMORY;

        /* A device allow list, or a device policy other than CGROUP_DEVICE_POLICY_AUTO */
        if (ctx->device_allow || ctx->device_policy != CGROUP_DEVICE_POLICY_AUTO)
                needed |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;

        /* Task accounting, or a tasks maximum that tasks_max_isset() reports as set */
        if (ctx->tasks_accounting || tasks_max_isset(&ctx->tasks_max))
                needed |= CGROUP_MASK_PIDS;

        return CGROUP_MASK_EXTEND_JOINED(needed);
}
|
|
|
|
|
2018-11-21 17:42:40 +01:00
|
|
|
static CGroupMask unit_get_bpf_mask(Unit *u) {

        /* Yields CGROUP_MASK_BPF_FIREWALL when unit_get_needs_bpf_firewall() says the unit needs the BPF
         * firewall, and an empty mask otherwise. */

        return unit_get_needs_bpf_firewall(u) ? CGROUP_MASK_BPF_FIREWALL : 0;
}
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
CGroupMask unit_get_own_mask(Unit *u) {
        CGroupMask own;

        /* Computes the controllers this unit requires on its own behalf. Units that are not in the
         * UNIT_LOADED state, or that have no cgroup context, contribute an empty mask. */

        if (u->load_state != UNIT_LOADED)
                return 0;

        if (!unit_get_cgroup_context(u))
                return 0;

        /* Merge the three sources: cgroup context settings, BPF needs and delegation */
        own = unit_get_cgroup_mask(u);
        own |= unit_get_bpf_mask(u);
        own |= unit_get_delegate_mask(u);

        return own;
}
|
|
|
|
|
|
|
|
CGroupMask unit_get_delegate_mask(Unit *u) {
        CGroupContext *cgroup_ctx;
        ExecContext *exec_ctx;

        /* Yields the controllers to hand over when delegation is enabled for the unit. Nothing is delegated
         * if delegation is off. Nothing is delegated either if we are not on the fully unified hierarchy and
         * the unit has an exec context that does not maintain privileges, since a process that drops
         * privileges should not get access to the controllers there.
         *
         * On the unified hierarchy, delegating controllers to unprivileged services is safe. */

        if (!unit_cgroup_delegate(u))
                return 0;

        /* Zero and negative results of cg_all_unified() are treated alike here */
        if (cg_all_unified() <= 0) {
                exec_ctx = unit_get_exec_context(u);

                if (exec_ctx && !exec_context_maintains_privileges(exec_ctx))
                        return 0;
        }

        cgroup_ctx = unit_get_cgroup_context(u);
        assert_se(cgroup_ctx);

        return CGROUP_MASK_EXTEND_JOINED(cgroup_ctx->delegate_controllers);
}
|
|
|
|
|
2020-05-01 14:00:42 +02:00
|
|
|
static CGroupMask unit_get_subtree_mask(Unit *u) {
        CGroupMask subtree;

        /* The subtree mask is the union of what the unit needs for itself and what its members need. */

        subtree = unit_get_own_mask(u);
        subtree |= unit_get_members_mask(u);

        return subtree;
}
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
CGroupMask unit_get_members_mask(Unit *u) {
        assert(u);

        /* Yields the merged controller mask required by all of the unit's children. The result is cached in
         * the unit, so the walk below only runs while the cached value is not marked valid. */

        if (!u->cgroup_members_mask_valid) {
                u->cgroup_members_mask = 0;

                if (u->type == UNIT_SLICE) {
                        Unit *child;
                        void *value;

                        HASHMAP_FOREACH_KEY(value, child, u->dependencies[UNIT_BEFORE]) {
                                /* Only units that name us as their slice count as members */
                                if (UNIT_DEREF(child->slice) != u)
                                        continue;

                                /* The subtree mask calls back into this function for the member's own children */
                                u->cgroup_members_mask |= unit_get_subtree_mask(child);
                        }
                }

                u->cgroup_members_mask_valid = true;
        }

        return u->cgroup_members_mask;
}
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
CGroupMask unit_get_siblings_mask(Unit *u) {
        assert(u);

        /* Yields the controllers required by all of the unit's siblings, which is the members mask of the
         * slice the unit sits in. A unit without a slice is the top-level slice, and its own subtree mask is
         * used instead. */

        if (!UNIT_ISSET(u->slice))
                return unit_get_subtree_mask(u);

        return unit_get_members_mask(UNIT_DEREF(u->slice));
}
|
|
|
|
|
2020-05-01 14:00:42 +02:00
|
|
|
static CGroupMask unit_get_disable_mask(Unit *u) {
        /* Returns the set of controllers recorded as disabled in this unit's own cgroup
         * context, or an empty mask if the unit has no cgroup context. */
        CGroupContext *ctx = unit_get_cgroup_context(u);

        return ctx ? ctx->disable_controllers : 0;
}
|
|
|
|
|
|
|
|
CGroupMask unit_get_ancestor_disable_mask(Unit *u) {
        CGroupMask disabled;

        assert(u);

        /* Collects every controller that is marked as forcibly disabled, either on this
         * unit itself or on any of the slices above it. */

        disabled = unit_get_disable_mask(u);

        /* No parent slice: only our own setting applies. */
        if (!UNIT_ISSET(u->slice))
                return disabled;

        return disabled | unit_get_ancestor_disable_mask(UNIT_DEREF(u->slice));
}
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
CGroupMask unit_get_target_mask(Unit *u) {
        CGroupMask wanted;

        /* Computes the controllers that should be enabled for this unit's cgroup: what the
         * unit needs itself, what its children need, and what its siblings need. This is
         * mainly relevant on the legacy cgroup hierarchy, where a cgroup has to be
         * duplicated in every controller hierarchy that shall be enabled for it. */

        wanted = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);

        /* Check for the BPF firewall bit against the still unfiltered mask, so that a
         * requested-but-unsupported firewall triggers the warning before it is masked out. */
        if (wanted & CGROUP_MASK_BPF_FIREWALL & ~u->manager->cgroup_supported)
                emit_bpf_firewall_warning(u);

        /* Restrict to what the manager reports as supported ... */
        wanted &= u->manager->cgroup_supported;

        /* ... and drop whatever is disabled on this unit or any of its ancestors. */
        return wanted & ~unit_get_ancestor_disable_mask(u);
}
|
|
|
|
|
|
|
|
CGroupMask unit_get_enable_mask(Unit *u) {
        /* Computes the controllers to enable for the children of this unit's cgroup. This
         * is mainly relevant on the unified cgroup hierarchy, where every cgroup decides
         * which controllers are enabled for its children. */
        CGroupMask for_children = unit_get_members_mask(u);

        /* Keep only controllers the manager reports as supported. */
        for_children &= u->manager->cgroup_supported;

        /* Remove everything disabled on this unit or any of its ancestors. */
        for_children &= ~unit_get_ancestor_disable_mask(u);

        return for_children;
}
|
|
|
|
|
cgroup: drastically simplify caching of cgroups members mask
Previously we tried to be smart: when a new unit appeared and it only
added controllers to the cgroup mask we'd update the cached members mask
in all parents by ORing in the controller flags in their cached values.
Unfortunately this was quite broken, as we missed some conditions when
this cache had to be reset (for example, when a unit got unloaded),
moreover the optimization doesn't work when a controller is removed
anyway (as in that case there's no other way for the parent to iterate
through all children if any other, remaining child unit still needs it).
Hence, let's simplify the logic substantially: instead of updating the
cache on the right events (which we didn't get right), let's simply
invalidate the cache, and generate it lazily when we encounter it later.
This should actually result in better behaviour as we don't have to
calculate the new members mask for a whole subtree whenever we have the
suspicion something changed, but can delay it to the point where we
actually need the members mask.
This allows us to simplify things quite a bit, which is good, since
validating this cache for correctness is hard enough.
Fixes: #9512
2018-11-23 01:07:34 +01:00
|
|
|
void unit_invalidate_cgroup_members_masks(Unit *u) {
        /* Marks the cached members mask as stale on this unit and on every slice above it,
         * walking the slice references all the way up the tree. */
        for (Unit *cur = u;; cur = UNIT_DEREF(cur->slice)) {
                assert(cur);

                cur->cgroup_members_mask_valid = false;

                /* Stop once there is no parent slice left to visit. */
                if (!UNIT_ISSET(cur->slice))
                        break;
        }
}
|
|
|
|
|
2018-02-07 22:52:52 +01:00
|
|
|
const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) {
        /* Starting at the given unit and moving up through its slices, finds the first unit
         * that has a realized cgroup in which all controllers of 'mask' are available, and
         * returns that unit's cgroup path. Returns NULL if no such unit is found. */
        for (Unit *cur = u; cur; cur = UNIT_DEREF(cur->slice)) {
                /* Units without a path or without a realized cgroup cannot match. */
                if (!cur->cgroup_path || !cur->cgroup_realized)
                        continue;

                if (FLAGS_SET(cur->cgroup_realized_mask, mask))
                        return cur->cgroup_path;
        }

        return NULL;
}
|
|
|
|
|
2018-02-07 22:52:52 +01:00
|
|
|
static const char *migrate_callback(CGroupMask mask, void *userdata) {
        /* 'userdata' is handed on as the unit whose realized cgroup path is looked up. */
        const char *realized = unit_get_realized_cgroup_path(userdata, mask);

        /* If nothing is realized at all, migrate to the root (""). This may happen when
         * upgrading from an older version that did not clean up. */
        return strempty(realized);
}
|
|
|
|
|
2018-12-12 16:45:33 +01:00
|
|
|
char *unit_default_cgroup_path(const Unit *u) {
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
_cleanup_free_ char *escaped = NULL, *slice = NULL;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(u);
|
|
|
|
|
|
|
|
if (unit_has_name(u, SPECIAL_ROOT_SLICE))
|
|
|
|
return strdup(u->manager->cgroup_root);
|
|
|
|
|
|
|
|
if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
|
|
|
|
r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
|
|
|
|
if (r < 0)
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
escaped = cg_escape(u->id);
|
|
|
|
if (!escaped)
|
|
|
|
return NULL;
|
|
|
|
|
2019-06-20 20:07:01 +02:00
|
|
|
return path_join(empty_to_root(u->manager->cgroup_root), slice, escaped);
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Points the unit at a new cgroup path, or clears the path when "path" is NULL.
 *
 * The string is duplicated with strdup(); the caller keeps ownership of "path".
 *
 * Returns 0 if the requested path equals the current one (nothing is changed), 1 if the path was
 * replaced, -ENOMEM if the copy could not be allocated, or the negative value returned by hashmap_put()
 * if registering the new path in u->manager->cgroup_unit failed. On every error return happens before
 * unit_release_cgroup() is called and before u->cgroup_path is assigned. */
int unit_set_cgroup_path(Unit *u, const char *path) {
|
|
|
|
_cleanup_free_ char *p = NULL;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
|
|
assert(u);
|
|
|
|
|
2019-03-19 13:01:12 +01:00
|
|
|
/* Same path as before: report "unchanged". Either pointer may be NULL at this point, so
 * streq_ptr() is presumably NULL-safe (TODO confirm against string-util.h). */
if (streq_ptr(u->cgroup_path, path))
|
|
|
|
return 0;
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (path) {
|
|
|
|
p = strdup(path);
|
|
|
|
if (!p)
|
|
|
|
return -ENOMEM;
|
2019-03-19 13:01:12 +01:00
|
|
|
}
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
|
|
|
/* Register the new path first, keyed by the freshly duplicated string itself, so that a
 * hashmap_put() failure is returned while the unit still has its previous cgroup state. */
if (p) {
|
|
|
|
r = hashmap_put(u->manager->cgroup_unit, p, u);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* NOTE(review): unit_release_cgroup() is defined elsewhere; presumably it drops the previous
 * path and its hashmap entry. Confirm before relying on that. */
unit_release_cgroup(u);
|
2018-03-22 16:53:26 +01:00
|
|
|
/* The unit now holds the same pointer that serves as the hashmap key. TAKE_PTR() presumably
 * resets p to NULL so the _cleanup_free_ handler does not free it on return (TODO confirm). */
u->cgroup_path = TAKE_PTR(p);
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
int unit_watch_cgroup(Unit *u) {
|
2016-03-25 16:38:50 +01:00
|
|
|
_cleanup_free_ char *events = NULL;
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(u);
|
|
|
|
|
2019-03-19 17:17:31 +01:00
|
|
|
/* Watches the "cgroup.events" attribute of this unit's cgroup for "empty" events, but only if
|
|
|
|
* cgroupv2 is available. */
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (!u->cgroup_path)
|
|
|
|
return 0;
|
|
|
|
|
2019-03-19 17:17:31 +01:00
|
|
|
if (u->cgroup_control_inotify_wd >= 0)
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Only applies to the unified hierarchy */
|
2017-02-24 18:00:04 +01:00
|
|
|
r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
|
2017-02-24 17:52:58 +01:00
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Failed to determine whether the name=systemd hierarchy is unified: %m");
|
|
|
|
if (r == 0)
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
return 0;
|
|
|
|
|
2019-03-19 17:17:31 +01:00
|
|
|
/* No point in watching the top-level slice, it's never going to run empty. */
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (unit_has_name(u, SPECIAL_ROOT_SLICE))
|
|
|
|
return 0;
|
|
|
|
|
2019-03-19 17:17:31 +01:00
|
|
|
r = hashmap_ensure_allocated(&u->manager->cgroup_control_inotify_wd_unit, &trivial_hash_ops);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (r < 0)
|
|
|
|
return log_oom();
|
|
|
|
|
2016-03-25 16:38:50 +01:00
|
|
|
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events", &events);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (r < 0)
|
|
|
|
return log_oom();
|
|
|
|
|
2019-03-19 17:17:31 +01:00
|
|
|
u->cgroup_control_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events, IN_MODIFY);
|
|
|
|
if (u->cgroup_control_inotify_wd < 0) {
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2019-03-19 17:17:31 +01:00
|
|
|
if (errno == ENOENT) /* If the directory is already gone we don't need to track it, so this
|
|
|
|
* is not an error */
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
return 0;
|
|
|
|
|
2019-03-19 17:17:31 +01:00
|
|
|
return log_unit_error_errno(u, errno, "Failed to add control inotify watch descriptor for control group %s: %m", u->cgroup_path);
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
}
|
|
|
|
|
2019-03-19 17:17:31 +01:00
|
|
|
r = hashmap_put(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd), u);
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (r < 0)
|
2019-03-19 17:17:31 +01:00
|
|
|
return log_unit_error_errno(u, r, "Failed to add control inotify watch descriptor to hash map: %m");
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-03-19 19:05:19 +01:00
|
|
|
/* Installs an inotify watch on the "memory.events" attribute of the unit's cgroup so that "oom_kill"
 * events can be noticed. This is only done when cgroupv2 is available.
 *
 * Returns 0 when the watch was installed, when one is already in place, or when watching does not apply
 * to this unit. On failure the value produced by the respective logging helper is returned. */
int unit_watch_cgroup_memory(Unit *u) {
        _cleanup_free_ char *events_path = NULL;
        CGroupContext *cc;
        int wd, r;

        assert(u);

        /* Without a cgroup path there is nothing we could watch. */
        if (!u->cgroup_path)
                return 0;

        /* "memory.events" only exists while the memory controller is on, hence couple the watch to
         * memory accounting: keeping an eye on OOM kills is a form of memory accounting, after all. */
        cc = unit_get_cgroup_context(u);
        if (!cc || !cc->memory_accounting)
                return 0;

        /* Skip inner nodes (the kernel currently doesn't report oom_kill events recursively, and we don't
         * want one log message per parent cgroup of a process), and skip units that are watched already. */
        if (u->type == UNIT_SLICE || u->cgroup_memory_inotify_wd >= 0)
                return 0;

        /* The attribute is specific to the unified hierarchy. */
        r = cg_all_unified();
        if (r == 0)
                return 0;
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether the memory controller is unified: %m");

        r = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
        if (r < 0)
                return log_oom();

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events_path);
        if (r < 0)
                return log_oom();

        /* The result is stored in the unit unconditionally, i.e. also when the call failed. */
        wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events_path, IN_MODIFY);
        u->cgroup_memory_inotify_wd = wd;

        if (wd >= 0) {
                /* Remember which unit this watch descriptor belongs to. */
                r = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(wd), u);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to add memory inotify watch descriptor to hash map: %m");

                return 0;
        }

        /* A directory that is already gone doesn't need tracking, so that case is not an error. */
        if (errno == ENOENT)
                return 0;

        return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", u->cgroup_path);
}
|
|
|
|
|
2017-11-24 22:02:22 +01:00
|
|
|
/* Assigns the default cgroup path to the unit, unless a path is set already.
 *
 * Returns 0 if a path was already set or has been assigned now, -EINVAL if the unit has no cgroup
 * context, and otherwise the value produced by the respective logging helper. */
int unit_pick_cgroup_path(Unit *u) {
        _cleanup_free_ char *candidate = NULL;
        int r;

        assert(u);

        /* A path has been chosen before, keep it. */
        if (u->cgroup_path)
                return 0;

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        candidate = unit_default_cgroup_path(u);
        if (!candidate)
                return log_oom();

        r = unit_set_cgroup_path(u, candidate);
        if (r >= 0)
                return 0;

        /* -EEXIST gets a dedicated message, every other failure is reported generically. */
        if (r == -EEXIST)
                return log_unit_error_errno(u, r, "Control group %s exists already.", candidate);

        return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", candidate);
}
|
|
|
|
|
2020-12-07 23:18:28 +01:00
|
|
|
/* Maps an error code to a log level: -EROFS yields LOG_DEBUG, every other value yields LOG_WARNING. */
static int cg_v1_errno_to_log_level(int r) {
        if (r == -EROFS)
                return LOG_DEBUG;

        return LOG_WARNING;
}
|
|
|
|
|
cgroup: Swap cgroup v1 deletion and migration
When we are about to derealize a controller on v1 cgroup, we first
attempt to delete the controller cgroup and migrate afterwards. This
doesn't work in practice because populated cgroup cannot be deleted.
Furthermore, we leave out slices from migration completely, so
(un)setting a control value on them won't realize their controller
cgroup.
Rework actual realization, unit_create_cgroup() becomes
unit_update_cgroup() and make sure that controller hierarchies are
reduced when given controller cgroup ceased to be needed.
Note that with this we introduce slight deviation between v1 and v2 code
-- when a descendant unit turns off a delegated controller, we attempt
to disable it in ancestor slices. On v2 this may fail (kernel enforced,
because of child cgroups using the controller), on v1 we'll migrate
whole subtree and trim the subhierarchy. (Previously, we wouldn't take
away delegated controller, however, derealization was broken anyway.)
Fixes: #14149
2020-05-01 14:00:42 +02:00
|
|
|
static int unit_update_cgroup(
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
Unit *u,
|
|
|
|
CGroupMask target_mask,
|
2018-11-27 16:16:35 +01:00
|
|
|
CGroupMask enable_mask,
|
|
|
|
ManagerState state) {
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
cgroup: Swap cgroup v1 deletion and migration
When we are about to derealize a controller on v1 cgroup, we first
attempt to delete the controller cgroup and migrate afterwards. This
doesn't work in practice because populated cgroup cannot be deleted.
Furthermore, we leave out slices from migration completely, so
(un)setting a control value on them won't realize their controller
cgroup.
Rework actual realization, unit_create_cgroup() becomes
unit_update_cgroup() and make sure that controller hierarchies are
reduced when given controller cgroup ceased to be needed.
Note that with this we introduce slight deviation between v1 and v2 code
-- when a descendant unit turns off a delegated controller, we attempt
to disable it in ancestor slices. On v2 this may fail (kernel enforced,
because of child cgroups using the controller), on v1 we'll migrate
whole subtree and trim the subhierarchy. (Previously, we wouldn't take
away delegated controller, however, derealization was broken anyway.)
Fixes: #14149
2020-05-01 14:00:42 +02:00
|
|
|
bool created, is_root_slice;
|
|
|
|
CGroupMask migrate_mask = 0;
|
cgroup: be more careful with which controllers we can enable/disable on a cgroup
This changes cg_enable_everywhere() to return which controllers are
enabled for the specified cgroup. This information is then used to
correctly track the enablement mask currently in effect for a unit.
Moreover, when we try to turn off a controller, and this works, then
this indicates that the parent unit might successfully turn it off
now, too as our unit might have kept it busy.
So far, when realizing cgroups, i.e. when syncing up the kernel
representation of relevant cgroups with our own idea we would strictly
work from the root to the leaves. This is generally a good approach, as
when controllers are enabled this has to happen in root-to-leaves order.
However, when controllers are disabled this has to happen in the
opposite order: in leaves-to-root order (this is because controllers can
only be enabled in a child if it is already enabled in the parent, and
if it shall be disabled in the parent then it has to be disabled in the
child first, otherwise it is considered busy when it is attempted to
remove it in the parent).
To make things complicated when invalidating a unit's cgroup membership
systemd can actually turn off some controllers previously turned on at
the very same time as it turns on other controllers previously turned
off. In such a case we have to work up leaves-to-root *and*
root-to-leaves right after each other. With this patch this is
implemented: we still generally operate root-to-leaves, but as soon as
we noticed we successfully turned off a controller previously turned on
for a cgroup we'll re-enqueue the cgroup realization for all parents of
a unit, thus implementing leaves-to-root where necessary.
2018-11-22 21:45:33 +01:00
|
|
|
int r;
|
2011-06-30 00:11:25 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
assert(u);
|
2011-06-30 00:11:25 +02:00
|
|
|
|
2018-11-22 22:11:07 +01:00
|
|
|
if (!UNIT_HAS_CGROUP_CONTEXT(u))
|
2014-12-10 20:38:24 +01:00
|
|
|
return 0;
|
|
|
|
|
2017-11-24 22:02:22 +01:00
|
|
|
/* Figure out our cgroup path */
|
|
|
|
r = unit_pick_cgroup_path(u);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
2013-08-28 15:33:35 +02:00
|
|
|
|
2014-02-17 02:06:32 +01:00
|
|
|
/* First, create our own group */
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
|
2014-11-28 18:23:20 +01:00
|
|
|
if (r < 0)
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
|
2018-10-19 17:07:46 +02:00
|
|
|
created = r;
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
|
|
|
/* Start watching it */
|
|
|
|
(void) unit_watch_cgroup(u);
|
2019-03-19 19:05:19 +01:00
|
|
|
(void) unit_watch_cgroup_memory(u);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
cgroup: Swap cgroup v1 deletion and migration
When we are about to derealize a controller on v1 cgroup, we first
attempt to delete the controller cgroup and migrate afterwards. This
doesn't work in practice because populated cgroup cannot be deleted.
Furthermore, we leave out slices from migration completely, so
(un)setting a control value on them won't realize their controller
cgroup.
Rework actual realization, unit_create_cgroup() becomes
unit_update_cgroup() and make sure that controller hierarchies are
reduced when given controller cgroup ceased to be needed.
Note that with this we introduce slight deviation between v1 and v2 code
-- when a descendant unit turns off a delegated controller, we attempt
to disable it in ancestor slices. On v2 this may fail (kernel enforced,
because of child cgroups using the controller), on v1 we'll migrate
whole subtree and trim the subhierarchy. (Previously, we wouldn't take
away delegated controller, however, derealization was broken anyway.)
Fixes: #14149
2020-05-01 14:00:42 +02:00
|
|
|
|
|
|
|
/* For v2 we preserve enabled controllers in delegated units, adjust others,
|
|
|
|
* for v1 we figure out which controller hierarchies need migration. */
|
2018-11-23 01:03:18 +01:00
|
|
|
if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
|
cgroup: be more careful with which controllers we can enable/disable on a cgroup
This changes cg_enable_everywhere() to return which controllers are
enabled for the specified cgroup. This information is then used to
correctly track the enablement mask currently in effect for a unit.
Moreover, when we try to turn off a controller, and this works, then
this indicates that the parent unit might successfully turn it off
now, too as our unit might have kept it busy.
So far, when realizing cgroups, i.e. when syncing up the kernel
representation of relevant cgroups with our own idea we would strictly
work from the root to the leaves. This is generally a good approach, as
when controllers are enabled this has to happen in root-to-leaves order.
However, when controllers are disabled this has to happen in the
opposite order: in leaves-to-root order (this is because controllers can
only be enabled in a child if it is already enabled in the parent, and
if it shall be disabled in the parent then it has to be disabled in the
child first, otherwise it is considered busy when it is attempted to
remove it in the parent).
To make things complicated when invalidating a unit's cgroup membership
systemd can actually turn off some controllers previously turned on at
the very same time as it turns on other controllers previously turned
off. In such a case we have to work up leaves-to-root *and*
root-to-leaves right after each other. With this patch this is
implemented: we still generally operate root-to-leaves, but as soon as
we noticed we successfully turned off a controller previously turned on
for a cgroup we'll re-enqueue the cgroup realization for all parents of
a unit, thus implementing leaves-to-root where necessary.
2018-11-22 21:45:33 +01:00
|
|
|
CGroupMask result_mask = 0;
|
2018-05-29 12:19:09 +02:00
|
|
|
|
|
|
|
/* Enable all controllers we need */
|
cgroup: be more careful with which controllers we can enable/disable on a cgroup
This changes cg_enable_everywhere() to return which controllers are
enabled for the specified cgroup. This information is then used to
correctly track the enablement mask currently in effect for a unit.
Moreover, when we try to turn off a controller, and this works, then
this indicates that the parent unit might successfully turn it off
now, too as our unit might have kept it busy.
So far, when realizing cgroups, i.e. when syncing up the kernel
representation of relevant cgroups with our own idea we would strictly
work from the root to the leaves. This is generally a good approach, as
when controllers are enabled this has to happen in root-to-leaves order.
However, when controllers are disabled this has to happen in the
opposite order: in leaves-to-root order (this is because controllers can
only be enabled in a child if it is already enabled in the parent, and
if it shall be disabled in the parent then it has to be disabled in the
child first, otherwise it is considered busy when it is attempted to
remove it in the parent).
To make things complicated when invalidating a unit's cgroup membership
systemd can actually turn off some controllers previously turned on at
the very same time as it turns on other controllers previously turned
off. In such a case we have to work up leaves-to-root *and*
root-to-leaves right after each other. With this patch this is
implemented: we still generally operate root-to-leaves, but as soon as
we noticed we successfully turned off a controller previously turned on
for a cgroup we'll re-enqueue the cgroup realization for all parents of
a unit, thus implementing leaves-to-root where necessary.
2018-11-22 21:45:33 +01:00
|
|
|
r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path, &result_mask);
|
2018-05-29 12:19:09 +02:00
|
|
|
if (r < 0)
|
cgroup: be more careful with which controllers we can enable/disable on a cgroup
This changes cg_enable_everywhere() to return which controllers are
enabled for the specified cgroup. This information is then used to
correctly track the enablement mask currently in effect for a unit.
Moreover, when we try to turn off a controller, and this works, then
this indicates that the parent unit might successfully turn it off
now, too as our unit might have kept it busy.
So far, when realizing cgroups, i.e. when syncing up the kernel
representation of relevant cgroups with our own idea we would strictly
work from the root to the leaves. This is generally a good approach, as
when controllers are enabled this has to happen in root-to-leaves order.
However, when controllers are disabled this has to happen in the
opposite order: in leaves-to-root order (this is because controllers can
only be enabled in a child if it is already enabled in the parent, and
if it shall be disabled in the parent then it has to be disabled in the
child first, otherwise it is considered busy when it is attempted to
remove it in the parent).
To make things complicated when invalidating a unit's cgroup membership
systemd can actually turn off some controllers previously turned on at
the very same time as it turns on other controllers previously turned
off. In such a case we have to work up leaves-to-root *and*
root-to-leaves right after each other. With this patch this is
implemented: we still generally operate root-to-leaves, but as soon as
we noticed we successfully turned off a controller previously turned on
for a cgroup we'll re-enqueue the cgroup realization for all parents of
a unit, thus implementing leaves-to-root where necessary.
2018-11-22 21:45:33 +01:00
|
|
|
log_unit_warning_errno(u, r, "Failed to enable/disable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
|
|
|
|
|
|
|
|
/* Remember what's actually enabled now */
|
|
|
|
u->cgroup_enabled_mask = result_mask;
|
cgroup: Swap cgroup v1 deletion and migration
When we are about to derealize a controller on v1 cgroup, we first
attempt to delete the controller cgroup and migrate afterwards. This
doesn't work in practice because populated cgroup cannot be deleted.
Furthermore, we leave out slices from migration completely, so
(un)setting a control value on them won't realize their controller
cgroup.
Rework actual realization, unit_create_cgroup() becomes
unit_update_cgroup() and make sure that controller hierarchies are
reduced when given controller cgroup ceased to be needed.
Note that with this we introduce slight deviation between v1 and v2 code
-- when a descendant unit turns off a delegated controller, we attempt
to disable it in ancestor slices. On v2 this may fail (kernel enforced,
because of child cgroups using the controller), on v1 we'll migrate
whole subtree and trim the subhierarchy. (Previously, we wouldn't take
away delegated controller, however, derealization was broken anyway.)
Fixes: #14149
2020-05-01 14:00:42 +02:00
|
|
|
|
|
|
|
migrate_mask = u->cgroup_realized_mask ^ target_mask;
|
2018-05-29 12:19:09 +02:00
|
|
|
}
|
2014-02-17 02:06:32 +01:00
|
|
|
|
|
|
|
/* Keep track that this is now realized */
|
2013-06-27 04:14:27 +02:00
|
|
|
u->cgroup_realized = true;
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
u->cgroup_realized_mask = target_mask;
|
2013-06-27 04:14:27 +02:00
|
|
|
|
cgroup: Swap cgroup v1 deletion and migration
When we are about to derealize a controller on v1 cgroup, we first
attempt to delete the controller cgroup and migrate afterwards. This
doesn't work in practice because populated cgroup cannot be deleted.
Furthermore, we leave out slices from migration completely, so
(un)setting a control value on them won't realize their controller
cgroup.
Rework actual realization, unit_create_cgroup() becomes
unit_update_cgroup() and make sure that controller hierarchies are
reduced when given controller cgroup ceased to be needed.
Note that with this we introduce slight deviation between v1 and v2 code
-- when a descendant unit turns off a delegated controller, we attempt
to disable it in ancestor slices. On v2 this may fail (kernel enforced,
because of child cgroups using the controller), on v1 we'll migrate
whole subtree and trim the subhierarchy. (Previously, we wouldn't take
away delegated controller, however, derealization was broken anyway.)
Fixes: #14149
2020-05-01 14:00:42 +02:00
|
|
|
/* Migrate processes in controller hierarchies both downwards (enabling) and upwards (disabling).
|
|
|
|
*
|
|
|
|
* Unnecessary controller cgroups are trimmed (after emptied by upward migration).
|
|
|
|
* We perform migration also with whole slices for cases when users don't care about leave
|
|
|
|
* granularity. Since delegated_mask is subset of target mask, we won't trim slice subtree containing
|
|
|
|
* delegated units.
|
2020-12-07 23:18:28 +01:00
|
|
|
*
|
|
|
|
* If we're in an nspawn container and using legacy cgroups, the controller hierarchies are mounted
|
|
|
|
* read-only into the container. We skip migration/trim in this scenario since it would fail
|
|
|
|
* regardless with noisy "Read-only filesystem" warnings.
|
cgroup: Swap cgroup v1 deletion and migration
When we are about to derealize a controller on v1 cgroup, we first
attempt to delete the controller cgroup and migrate afterwards. This
doesn't work in practice because populated cgroup cannot be deleted.
Furthermore, we leave out slices from migration completely, so
(un)setting a control value on them won't realize their controller
cgroup.
Rework actual realization, unit_create_cgroup() becomes
unit_update_cgroup() and make sure that controller hierarchies are
reduced when given controller cgroup ceased to be needed.
Note that with this we introduce slight deviation between v1 and v2 code
-- when a descendant unit turns off a delegated controller, we attempt
to disable it in ancestor slices. On v2 this may fail (kernel enforced,
because of child cgroups using the controller), on v1 we'll migrate
whole subtree and trim the subhierarchy. (Previously, we wouldn't take
away delegated controller, however, derealization was broken anyway.)
Fixes: #14149
2020-05-01 14:00:42 +02:00
|
|
|
*/
|
|
|
|
if (cg_all_unified() == 0) {
|
|
|
|
r = cg_migrate_v1_controllers(u->manager->cgroup_supported, migrate_mask, u->cgroup_path, migrate_callback, u);
|
|
|
|
if (r < 0)
|
2020-12-07 23:18:28 +01:00
|
|
|
log_unit_full_errno(
|
|
|
|
u,
|
|
|
|
cg_v1_errno_to_log_level(r),
|
|
|
|
r,
|
|
|
|
"Failed to migrate controller cgroups from %s, ignoring: %m",
|
|
|
|
u->cgroup_path);
|
2014-12-10 20:38:24 +01:00
|
|
|
|
cgroup: Swap cgroup v1 deletion and migration
When we are about to derealize a controller on v1 cgroup, we first
attempt to delete the controller cgroup and migrate afterwards. This
doesn't work in practice because populated cgroup cannot be deleted.
Furthermore, we leave out slices from migration completely, so
(un)setting a control value on them won't realize their controller
cgroup.
Rework actual realization, unit_create_cgroup() becomes
unit_update_cgroup() and make sure that controller hierarchies are
reduced when given controller cgroup ceased to be needed.
Note that with this we introduce slight deviation between v1 and v2 code
-- when a descendant unit turns off a delegated controller, we attempt
to disable it in ancestor slices. On v2 this may fail (kernel enforced,
because of child cgroups using the controller), on v1 we'll migrate
whole subtree and trim the subhierarchy. (Previously, we wouldn't take
away delegated controller, however, derealization was broken anyway.)
Fixes: #14149
2020-05-01 14:00:42 +02:00
|
|
|
is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
|
|
|
|
r = cg_trim_v1_controllers(u->manager->cgroup_supported, ~target_mask, u->cgroup_path, !is_root_slice);
|
2014-12-10 20:38:24 +01:00
|
|
|
if (r < 0)
|
2020-12-07 23:18:28 +01:00
|
|
|
log_unit_full_errno(
|
|
|
|
u,
|
|
|
|
cg_v1_errno_to_log_level(r),
|
|
|
|
r,
|
|
|
|
"Failed to delete controller cgroups %s, ignoring: %m",
|
|
|
|
u->cgroup_path);
|
2014-12-10 20:38:24 +01:00
|
|
|
}
|
2014-02-17 02:06:32 +01:00
|
|
|
|
2018-11-27 16:16:35 +01:00
|
|
|
/* Set attributes */
|
|
|
|
cgroup_context_apply(u, target_mask, state);
|
|
|
|
cgroup_xattr_apply(u);
|
|
|
|
|
2011-06-30 00:11:25 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-02-07 22:52:52 +01:00
|
|
|
/* Asks the peer "org.freedesktop.systemd1" on the manager's system bus to attach a single process to
 * this unit's cgroup, by calling Manager.AttachProcessesToUnit().
 *
 * u           - the unit whose cgroup the process shall be attached to; must not be NULL (asserted).
 * pid         - the process to attach; sent as the only element of the PID array.
 * suffix_path - path appended below the unit's cgroup path (taken relative to the manager's cgroup
 *               root) to form the cgroup path sent in the request.
 *
 * Returns 0 on success. Returns -EINVAL if this manager is a system manager, if the unit has no cgroup
 * path, or if path_startswith() finds no match of the cgroup root in the cgroup path. Returns -EIO if
 * the manager has no system bus connection. If the bus call itself fails, the result of
 * log_unit_debug_errno() is returned (presumably the negative errno passed in -- confirm against the
 * logging macro). */
static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) {
|
|
|
|
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
|
|
|
|
char *pp;
|
2014-12-10 22:06:44 +01:00
|
|
|
int r;
|
2018-02-07 22:52:52 +01:00
|
|
|
|
|
2014-12-10 22:06:44 +01:00
|
|
|
assert(u);
|
|
|
|
|
2018-02-07 22:52:52 +01:00
|
|
|
/* Refuse when running as the system manager: this path is only for managers that must ask another instance */
if (MANAGER_IS_SYSTEM(u->manager))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* Without a system bus connection there is nobody to send the request to */
if (!u->manager->system_bus)
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
/* A unit without a cgroup path has nothing we could name in the request */
if (!u->cgroup_path)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* Determine this unit's cgroup path relative to our cgroup root */
|
|
|
|
pp = path_startswith(u->cgroup_path, u->manager->cgroup_root);
|
|
|
|
if (!pp)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* Build "/" + relative path + suffix. The result is never freed in this function; presumably
 * strjoina() allocates on the stack -- confirm against its definition. */
pp = strjoina("/", pp, suffix_path);
|
2018-05-31 16:39:31 +02:00
|
|
|
path_simplify(pp, false);
|
2018-02-07 22:52:52 +01:00
|
|
|
|
|
|
|
|
|
/* Signature "ssau": unit name (string), cgroup path (string), then an array of uint32 PIDs; the
 * array is passed as an element count (1) followed by the single PID. No reply message is
 * requested (NULL after &error). */
r = sd_bus_call_method(u->manager->system_bus,
|
|
|
|
"org.freedesktop.systemd1",
|
|
|
|
"/org/freedesktop/systemd1",
|
|
|
|
"org.freedesktop.systemd1.Manager",
|
|
|
|
"AttachProcessesToUnit",
|
|
|
|
&error, NULL,
|
|
|
|
"ssau",
|
|
|
|
NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid);
|
2014-12-10 22:06:44 +01:00
|
|
|
/* Failure is only logged at debug level; the caller decides how to react to the returned error */
if (r < 0)
|
2018-02-07 22:52:52 +01:00
|
|
|
return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r));
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Moves all processes listed in 'pids' into the cgroup of unit 'u', or, if 'suffix_path' is non-empty,
 * into that sub-cgroup below the unit's cgroup.
 *
 * Returns -EINVAL if the unit type has no cgroup context, 0 right away if 'pids' is empty, and a negative
 * errno if loading the custom BPF firewall programs, realizing the unit's cgroup, or querying the cgroup
 * mode fails. Failures to attach individual processes to the main hierarchy do not abort the loop; the
 * first such error is remembered and returned once all PIDs were processed. Failures on the individual
 * legacy controller hierarchies are only logged. */
int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
        CGroupMask delegated;
        const char *cgpath;
        int first_error = 0;
        void *item;
        int r;

        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        if (set_isempty(pids))
                return 0;

        /* Try loading the custom firewall BPF programs once, up front, to verify they exist and are
         * loadable. We want to fail early here, because errors further down the
         * unit_realize_cgroup() → cgroup_context_apply() call chain are ignored. */
        r = bpf_firewall_load_custom(u);
        if (r < 0)
                return r;

        r = unit_realize_cgroup(u);
        if (r < 0)
                return r;

        /* Note: prefix_roota() has to be invoked directly in this function, do not move it elsewhere. */
        if (isempty(suffix_path))
                cgpath = u->cgroup_path;
        else
                cgpath = prefix_roota(u->cgroup_path, suffix_path);

        delegated = unit_get_delegate_mask(u);

        SET_FOREACH(item, pids) {
                pid_t pid = PTR_TO_PID(item);
                int q;

                /* Step one: the main cgroup hierarchy */
                q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, cgpath, pid);
                if (q < 0) {
                        log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, cgpath);

                        if (MANAGER_IS_USER(u->manager) && ERRNO_IS_PRIVILEGE(q)) {
                                int z;

                                /* We are a user instance and lack the permissions to move the process
                                 * ourselves. Ask the system instance to do it for us: being more
                                 * privileged, it may be able to move the process between leaves of a
                                 * subtree whose top node we do not own. */
                                z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path);
                                if (z >= 0)
                                        continue; /* The bus call did the job, nothing left to do for this PID. */

                                log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, cgpath);
                        }

                        /* Only the first error is propagated to the caller */
                        if (first_error >= 0)
                                first_error = q;

                        continue;
                }

                q = cg_all_unified();
                if (q < 0)
                        return q;
                if (q > 0)
                        continue; /* Everything is unified: the attachment above covers it all. */

                /* Step two, legacy hierarchy only: for each controller attach the process to the requested
                 * cgroup if we can, and otherwise to the innermost realized one. */
                for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
                        const char *fallback;

                        if ((u->manager->cgroup_supported & bit) == 0)
                                continue;

                        /* Honour the requested suffix only for controllers that are both delegated and
                         * realized for this unit. */
                        if (delegated & u->cgroup_realized_mask & bit) {
                                q = cg_attach(cgroup_controller_to_string(c), cgpath, pid);
                                if (q >= 0)
                                        continue; /* Done with this controller */

                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m",
                                                     pid, cgpath, cgroup_controller_to_string(c));
                        }

                        /* Either the controller is not delegated, or not realized, or the attempt above
                         * failed. Fall back to the closest cgroup up the tree that is realized for this
                         * controller. */
                        fallback = unit_get_realized_cgroup_path(u, bit);
                        if (!fallback)
                                continue; /* Not realized anywhere up to the root slice, so don't bother. */

                        q = cg_attach(cgroup_controller_to_string(c), fallback, pid);
                        if (q < 0)
                                log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m",
                                                     pid, fallback, cgroup_controller_to_string(c));
                }
        }

        return first_error;
}
|
|
|
|
|
2017-09-05 19:27:53 +02:00
|
|
|
static bool unit_has_mask_realized(
|
|
|
|
Unit *u,
|
|
|
|
CGroupMask target_mask,
|
2018-09-30 12:33:16 +02:00
|
|
|
CGroupMask enable_mask) {
|
2017-09-05 19:27:53 +02:00
|
|
|
|
2014-02-14 19:11:07 +01:00
|
|
|
assert(u);
|
|
|
|
|
2018-11-23 01:02:17 +01:00
|
|
|
/* Returns true if this unit is fully realized. We check four things:
|
|
|
|
*
|
|
|
|
* 1. Whether the cgroup was created at all
|
2019-01-02 21:15:15 +01:00
|
|
|
* 2. Whether the cgroup was created in all the hierarchies we need it to be created in (in case of cgroup v1)
|
|
|
|
* 3. Whether the cgroup has all the right controllers enabled (in case of cgroup v2)
|
2018-11-23 01:02:17 +01:00
|
|
|
* 4. Whether the invalidation mask is currently zero
|
|
|
|
*
|
|
|
|
* If you wonder why we mask the target realization and enable mask with CGROUP_MASK_V1/CGROUP_MASK_V2: note
|
2019-01-02 21:15:15 +01:00
|
|
|
* that there are three sets of bitmasks: CGROUP_MASK_V1 (for real cgroup v1 controllers), CGROUP_MASK_V2 (for
|
|
|
|
* real cgroup v2 controllers) and CGROUP_MASK_BPF (for BPF-based pseudo-controllers). Now, cgroup_realized_mask
|
|
|
|
* is only matters for cgroup v1 controllers, and cgroup_enabled_mask only used for cgroup v2, and if they
|
2018-11-23 01:02:17 +01:00
|
|
|
* differ in the others, we don't really care. (After all, the cgroup_enabled_mask tracks with controllers are
|
|
|
|
* enabled through cgroup.subtree_control, and since the BPF pseudo-controllers don't show up there, they
|
|
|
|
* simply don't matter. */
|
|
|
|
|
2017-09-05 19:27:53 +02:00
|
|
|
return u->cgroup_realized &&
|
2018-11-23 01:02:17 +01:00
|
|
|
((u->cgroup_realized_mask ^ target_mask) & CGROUP_MASK_V1) == 0 &&
|
|
|
|
((u->cgroup_enabled_mask ^ enable_mask) & CGROUP_MASK_V2) == 0 &&
|
2018-09-30 12:33:16 +02:00
|
|
|
u->cgroup_invalidated_mask == 0;
|
2013-11-11 10:03:31 +01:00
|
|
|
}
|
|
|
|
|
2018-11-27 16:49:41 +01:00
|
|
|
static bool unit_has_mask_disables_realized(
|
|
|
|
Unit *u,
|
|
|
|
CGroupMask target_mask,
|
|
|
|
CGroupMask enable_mask) {
|
|
|
|
|
|
|
|
assert(u);
|
|
|
|
|
|
|
|
/* Returns true if all controllers which should be disabled are indeed disabled.
|
|
|
|
*
|
|
|
|
* Unlike unit_has_mask_realized, we don't care what was enabled, only that anything we want to remove is
|
|
|
|
* already removed. */
|
|
|
|
|
|
|
|
return !u->cgroup_realized ||
|
|
|
|
(FLAGS_SET(u->cgroup_realized_mask, target_mask & CGROUP_MASK_V1) &&
|
|
|
|
FLAGS_SET(u->cgroup_enabled_mask, enable_mask & CGROUP_MASK_V2));
|
|
|
|
}
|
|
|
|
|
2018-11-26 14:45:26 +01:00
|
|
|
static bool unit_has_mask_enables_realized(
|
|
|
|
Unit *u,
|
|
|
|
CGroupMask target_mask,
|
|
|
|
CGroupMask enable_mask) {
|
|
|
|
|
|
|
|
assert(u);
|
|
|
|
|
|
|
|
/* Returns true if all controllers which should be enabled are indeed enabled.
|
|
|
|
*
|
|
|
|
* Unlike unit_has_mask_realized, we don't care about the controllers that are not present, only that anything
|
|
|
|
* we want to add is already added. */
|
|
|
|
|
|
|
|
return u->cgroup_realized &&
|
cgroup: Add DisableControllers= directive to disable controller in subtree
Some controllers (like the CPU controller) have a performance cost that
is non-trivial on certain workloads. While this can be mitigated and
improved to an extent, there will for some controllers always be some
overheads associated with the benefits gained from the controller.
Inside Facebook, the fix applied has been to disable the CPU controller
forcibly with `cgroup_disable=cpu` on the kernel command line.
This presents a problem: to disable or reenable the controller, a reboot
is required, but this is quite cumbersome and slow to do for many
thousands of machines, especially machines where disabling/enabling a
stateful service on a machine is a matter of several minutes.
Currently systemd provides some configuration knobs for these in the
form of `[Default]CPUAccounting`, `[Default]MemoryAccounting`, and the
like. The limitation of these is that Default*Accounting is overrideable
by individual services, of which any one could decide to reenable a
controller within the hierarchy at any point just by using a controller
feature implicitly (eg. `CPUWeight`), even if the use of that CPU
feature could just be opportunistic. Since many services are provided by
the distribution, or by upstream teams at a particular organisation,
it's not a sustainable solution to simply try to find and remove
offending directives from these units.
This commit presents a more direct solution -- a DisableControllers=
directive that forcibly disallows a controller from being enabled within
a subtree.
2018-12-03 15:38:06 +01:00
|
|
|
((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
|
|
|
|
((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
|
2018-11-26 14:45:26 +01:00
|
|
|
}
|
|
|
|
|
2020-06-01 17:30:35 +02:00
|
|
|
/* Appends the unit to the tail of the manager's cgroup realization queue, unless it is queued already. */
static void unit_add_to_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (!u->in_cgroup_realize_queue) {
                LIST_APPEND(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
                u->in_cgroup_realize_queue = true;
        }
}
|
|
|
|
|
|
|
|
/* Takes the unit off the manager's cgroup realization queue again, if it is currently on it. */
static void unit_remove_from_cgroup_realize_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_realize_queue) {
                LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
                u->in_cgroup_realize_queue = false;
        }
}
|
|
|
|
|
2018-11-26 14:45:26 +01:00
|
|
|
/* Controllers can only be enabled breadth-first, from the root of the
|
|
|
|
* hierarchy downwards to the unit in question. */
|
|
|
|
static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
|
|
|
|
CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
assert(u);
|
|
|
|
|
|
|
|
/* First go deal with this unit's parent, or we won't be able to enable
|
|
|
|
* any new controllers at this layer. */
|
|
|
|
if (UNIT_ISSET(u->slice)) {
|
|
|
|
r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
target_mask = unit_get_target_mask(u);
|
|
|
|
enable_mask = unit_get_enable_mask(u);
|
|
|
|
|
|
|
|
/* We can only enable in this direction, don't try to disable anything.
|
|
|
|
*/
|
|
|
|
if (unit_has_mask_enables_realized(u, target_mask, enable_mask))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
new_target_mask = u->cgroup_realized_mask | target_mask;
|
|
|
|
new_enable_mask = u->cgroup_enabled_mask | enable_mask;
|
|
|
|
|
cgroup: Swap cgroup v1 deletion and migration
When we are about to derealize a controller on v1 cgroup, we first
attempt to delete the controller cgroup and migrate afterwards. This
doesn't work in practice because populated cgroup cannot be deleted.
Furthermore, we leave out slices from migration completely, so
(un)setting a control value on them won't realize their controller
cgroup.
Rework actual realization, unit_create_cgroup() becomes
unit_update_cgroup() and make sure that controller hierarchies are
reduced when given controller cgroup ceased to be needed.
Note that with this we introduce slight deviation between v1 and v2 code
-- when a descendant unit turns off a delegated controller, we attempt
to disable it in ancestor slices. On v2 this may fail (kernel enforced,
because of child cgroups using the controller), on v1 we'll migrate
whole subtree and trim the subhierachy. (Previously, we wouldn't take
away delegated controller, however, derealization was broken anyway.)
Fixes: #14149
2020-05-01 14:00:42 +02:00
|
|
|
return unit_update_cgroup(u, new_target_mask, new_enable_mask, state);
|
2018-11-26 14:45:26 +01:00
|
|
|
}
|
|
|
|
|
2018-11-27 16:49:41 +01:00
|
|
|
/* Controllers can only be disabled depth-first, from the leaves of the
|
|
|
|
* hierarchy upwards to the unit in question. */
|
|
|
|
static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
|
|
|
|
Unit *m;
|
|
|
|
void *v;
|
|
|
|
|
|
|
|
assert(u);
|
|
|
|
|
|
|
|
if (u->type != UNIT_SLICE)
|
|
|
|
return 0;
|
|
|
|
|
2020-09-08 11:58:29 +02:00
|
|
|
HASHMAP_FOREACH_KEY(v, m, u->dependencies[UNIT_BEFORE]) {
|
2018-11-27 16:49:41 +01:00
|
|
|
CGroupMask target_mask, enable_mask, new_target_mask, new_enable_mask;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
if (UNIT_DEREF(m->slice) != u)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* The cgroup for this unit might not actually be fully
|
|
|
|
* realised yet, in which case it isn't holding any controllers
|
|
|
|
* open anyway. */
|
2020-05-01 14:00:42 +02:00
|
|
|
if (!m->cgroup_realized)
|
2018-11-27 16:49:41 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
/* We must disable those below us first in order to release the
|
|
|
|
* controller. */
|
|
|
|
if (m->type == UNIT_SLICE)
|
|
|
|
(void) unit_realize_cgroup_now_disable(m, state);
|
|
|
|
|
|
|
|
target_mask = unit_get_target_mask(m);
|
|
|
|
enable_mask = unit_get_enable_mask(m);
|
|
|
|
|
|
|
|
/* We can only disable in this direction, don't try to enable
|
|
|
|
* anything. */
|
|
|
|
if (unit_has_mask_disables_realized(m, target_mask, enable_mask))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
new_target_mask = m->cgroup_realized_mask & target_mask;
|
|
|
|
new_enable_mask = m->cgroup_enabled_mask & enable_mask;
|
|
|
|
|
cgroup: Swap cgroup v1 deletion and migration
When we are about to derealize a controller on v1 cgroup, we first
attempt to delete the controller cgroup and migrate afterwards. This
doesn't work in practice because populated cgroup cannot be deleted.
Furthermore, we leave out slices from migration completely, so
(un)setting a control value on them won't realize their controller
cgroup.
Rework actual realization, unit_create_cgroup() becomes
unit_update_cgroup() and make sure that controller hierarchies are
reduced when given controller cgroup ceased to be needed.
Note that with this we introduce slight deviation between v1 and v2 code
-- when a descendant unit turns off a delegated controller, we attempt
to disable it in ancestor slices. On v2 this may fail (kernel enforced,
because of child cgroups using the controller), on v1 we'll migrate
whole subtree and trim the subhierachy. (Previously, we wouldn't take
away delegated controller, however, derealization was broken anyway.)
Fixes: #14149
2020-05-01 14:00:42 +02:00
|
|
|
r = unit_update_cgroup(m, new_target_mask, new_enable_mask, state);
|
2018-11-27 16:49:41 +01:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2018-11-26 14:45:26 +01:00
|
|
|
|
2013-11-11 10:03:31 +01:00
|
|
|
/* Check if necessary controllers and attributes for a unit are in place.
|
|
|
|
*
|
2018-11-26 14:45:26 +01:00
|
|
|
* - If so, do nothing.
|
|
|
|
* - If not, create paths, move processes over, and set attributes.
|
|
|
|
*
|
|
|
|
* Controllers can only be *enabled* in a breadth-first way, and *disabled* in
|
|
|
|
* a depth-first way. As such the process looks like this:
|
|
|
|
*
|
|
|
|
* Suppose we have a cgroup hierarchy which looks like this:
|
|
|
|
*
|
|
|
|
* root
|
|
|
|
* / \
|
|
|
|
* / \
|
|
|
|
* / \
|
|
|
|
* a b
|
|
|
|
* / \ / \
|
|
|
|
* / \ / \
|
|
|
|
* c d e f
|
|
|
|
* / \ / \ / \ / \
|
|
|
|
* h i j k l m n o
|
|
|
|
*
|
|
|
|
* 1. We want to realise cgroup "d" now.
|
cgroup: Add DisableControllers= directive to disable controller in subtree
Some controllers (like the CPU controller) have a performance cost that
is non-trivial on certain workloads. While this can be mitigated and
improved to an extent, there will for some controllers always be some
overheads associated with the benefits gained from the controller.
Inside Facebook, the fix applied has been to disable the CPU controller
forcibly with `cgroup_disable=cpu` on the kernel command line.
This presents a problem: to disable or reenable the controller, a reboot
is required, but this is quite cumbersome and slow to do for many
thousands of machines, especially machines where disabling/enabling a
stateful service on a machine is a matter of several minutes.
Currently systemd provides some configuration knobs for these in the
form of `[Default]CPUAccounting`, `[Default]MemoryAccounting`, and the
like. The limitation of these is that Default*Accounting is overrideable
by individual services, of which any one could decide to reenable a
controller within the hierarchy at any point just by using a controller
feature implicitly (eg. `CPUWeight`), even if the use of that CPU
feature could just be opportunistic. Since many services are provided by
the distribution, or by upstream teams at a particular organisation,
it's not a sustainable solution to simply try to find and remove
offending directives from these units.
This commit presents a more direct solution -- a DisableControllers=
directive that forcibly disallows a controller from being enabled within
a subtree.
2018-12-03 15:38:06 +01:00
|
|
|
* 2. cgroup "a" has DisableControllers=cpu in the associated unit.
|
2018-11-26 14:45:26 +01:00
|
|
|
* 3. cgroup "k" just started requesting the memory controller.
|
|
|
|
*
|
|
|
|
* To make this work we must do the following in order:
|
|
|
|
*
|
|
|
|
* 1. Disable CPU controller in k, j
|
|
|
|
* 2. Disable CPU controller in d
|
|
|
|
* 3. Enable memory controller in root
|
|
|
|
* 4. Enable memory controller in a
|
|
|
|
* 5. Enable memory controller in d
|
|
|
|
* 6. Enable memory controller in k
|
|
|
|
*
|
|
|
|
* Notice that we need to touch j in one direction, but not the other. We also
|
|
|
|
* don't go beyond d when disabling -- it's up to "a" to get realized if it
|
|
|
|
* wants to disable further. The basic rules are therefore:
|
|
|
|
*
|
|
|
|
* - If you're disabling something, you need to realise all of the cgroups from
|
|
|
|
* your recursive descendants to the root. This starts from the leaves.
|
|
|
|
* - If you're enabling something, you need to realise from the root cgroup
|
|
|
|
* downwards, but you don't need to iterate your recursive descendants.
|
2013-11-11 10:03:31 +01:00
|
|
|
*
|
|
|
|
* Returns 0 on success and < 0 on failure. */
|
2014-05-22 00:06:16 +02:00
|
|
|
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
CGroupMask target_mask, enable_mask;
|
2013-11-11 10:03:31 +01:00
|
|
|
int r;
|
2011-06-30 00:11:25 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
assert(u);
|
2011-06-30 00:11:25 +02:00
|
|
|
|
2017-11-24 19:48:38 +01:00
|
|
|
unit_remove_from_cgroup_realize_queue(u);
|
2011-06-30 00:11:25 +02:00
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
target_mask = unit_get_target_mask(u);
|
2016-04-30 22:12:54 +02:00
|
|
|
enable_mask = unit_get_enable_mask(u);
|
|
|
|
|
2018-09-30 12:33:16 +02:00
|
|
|
if (unit_has_mask_realized(u, target_mask, enable_mask))
|
2013-06-30 23:55:36 +02:00
|
|
|
return 0;
|
2011-06-30 00:11:25 +02:00
|
|
|
|
2018-11-27 16:49:41 +01:00
|
|
|
/* Disable controllers below us, if there are any */
|
|
|
|
r = unit_realize_cgroup_now_disable(u, state);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
/* Enable controllers above us, if there are any */
|
2013-11-11 10:03:31 +01:00
|
|
|
if (UNIT_ISSET(u->slice)) {
|
2018-11-26 14:45:26 +01:00
|
|
|
r = unit_realize_cgroup_now_enable(UNIT_DEREF(u->slice), state);
|
2013-11-11 10:03:31 +01:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
}
|
2013-06-27 04:14:27 +02:00
|
|
|
|
2018-11-27 16:16:35 +01:00
|
|
|
/* Now actually deal with the cgroup we were trying to realise and set attributes */
|
cgroup: Swap cgroup v1 deletion and migration
When we are about to derealize a controller on v1 cgroup, we first
attempt to delete the controller cgroup and migrate afterwards. This
doesn't work in practice because populated cgroup cannot be deleted.
Furthermore, we leave out slices from migration completely, so
(un)setting a control value on them won't realize their controller
cgroup.
Rework actual realization, unit_create_cgroup() becomes
unit_update_cgroup() and make sure that controller hierarchies are
reduced when given controller cgroup ceased to be needed.
Note that with this we introduce slight deviation between v1 and v2 code
-- when a descendant unit turns off a delegated controller, we attempt
to disable it in ancestor slices. On v2 this may fail (kernel enforced,
because of child cgroups using the controller), on v1 we'll migrate
whole subtree and trim the subhierachy. (Previously, we wouldn't take
away delegated controller, however, derealization was broken anyway.)
Fixes: #14149
2020-05-01 14:00:42 +02:00
|
|
|
r = unit_update_cgroup(u, target_mask, enable_mask, state);
|
2013-11-11 10:03:31 +01:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
2018-10-18 10:02:00 +02:00
|
|
|
/* Now, reset the invalidation mask */
|
|
|
|
u->cgroup_invalidated_mask = 0;
|
2013-11-11 10:03:31 +01:00
|
|
|
return 0;
|
2011-06-30 00:11:25 +02:00
|
|
|
}
|
|
|
|
|
2017-09-26 22:15:02 +02:00
|
|
|
/* Drains the manager's cgroup realization queue, realizing the cgroup of each queued unit that is still
 * active. Failures are logged and otherwise ignored. Returns the number of units for which realization
 * was attempted (successfully or not); units dropped because they are inactive/failed are not counted. */
unsigned manager_dispatch_cgroup_realize_queue(Manager *m) {
        unsigned n_processed = 0;
        ManagerState state;

        assert(m);

        state = manager_state(m);

        /* Always look at the current head: both branches below take it off the queue again, either
         * explicitly or as first step of unit_realize_cgroup_now(). */
        for (;;) {
                Unit *head = m->cgroup_realize_queue;
                int r;

                if (!head)
                        break;

                assert(head->in_cgroup_realize_queue);

                if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(head))) {
                        /* The unit might have stopped being active since it was enqueued. */
                        unit_remove_from_cgroup_realize_queue(head);
                        continue;
                }

                r = unit_realize_cgroup_now(head, state);
                if (r < 0)
                        log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", head->id);

                n_processed++;
        }

        return n_processed;
}
|
|
|
|
|
2020-05-21 13:24:43 +02:00
|
|
|
void unit_add_family_to_cgroup_realize_queue(Unit *u) {
        assert(u);
        assert(u->type == UNIT_SLICE);

        /* The "family" of a unit is defined as the (immediate) children of the unit, plus the immediate
         * children of each of its ancestors.
         *
         * Ideally only the ancestor path would be enqueued (bottom up). On cgroup v1 however, scheduling
         * behaves very strangely if two units owning processes live in the same slice while only one of
         * them is realized in the "cpu" hierarchy (for example because just one of them has CPUWeight=
         * set): individual processes then end up being scheduled against whole cgroups. We avoid this
         * asymmetry by making sure that the siblings of a unit are always realized in their v1
         * controller hierarchies too (if the unit requires the controller to be realized).
         *
         * The cgroup_members_mask of all ancestors has to be invalidated here, so that up-to-date masks
         * are calculated. */

        for (Unit *slice = u; slice; slice = UNIT_DEREF(slice->slice)) {
                Unit *member;
                void *v;

                /* We are most likely being called because the set of children of this slice changed */
                slice->cgroup_members_mask_valid = false;

                HASHMAP_FOREACH_KEY(v, member, slice->dependencies[UNIT_BEFORE]) {

                        /* Having a dependency on the slice does not imply being located in it */
                        if (UNIT_DEREF(member->slice) != slice)
                                continue;

                        /* Units without active processes don't need any cgroup settings applied */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(member)))
                                continue;

                        /* Siblings are only enqueued if they have been realized at least once, in the
                         * main hierarchy. */
                        if (!member->cgroup_realized)
                                continue;

                        /* Enqueue only if the unit needs new controllers or lacks some of its current
                         * ones; otherwise there's nothing to change for it. */
                        if (!unit_has_mask_realized(member,
                                                    unit_get_target_mask(member),
                                                    unit_get_enable_mask(member)))
                                unit_add_to_cgroup_realize_queue(member);
                }

                /* The parent is queued after its children */
                unit_add_to_cgroup_realize_queue(slice);
        }
}
|
|
|
|
|
2013-06-30 23:55:36 +02:00
|
|
|
int unit_realize_cgroup(Unit *u) {
        Unit *parent_slice;

        assert(u);

        /* Units that carry no cgroup context have nothing to realize. */
        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return 0;

        /* Realizing a unit's cgroup is not a purely local affair. All parents have to exist first, and
         * for the weight-based controllers every sibling (i.e. every unit living in the same slice)
         * needs a cgroup as well. Conversely, once a controller drops out of the realized set it may no
         * longer be needed in siblings and ancestors, which then should be (de)realized too.
         *
         * Hence: the siblings and the derealized ancestors are only queued here and get handled in the
         * next event loop iteration, while the parent cgroups are created synchronously by
         * unit_realize_cgroup_now() below. */
        parent_slice = UNIT_DEREF(u->slice);
        if (parent_slice)
                unit_add_family_to_cgroup_realize_queue(parent_slice);

        /* Finally realize this very unit right away (and apply its values). */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
void unit_release_cgroup(Unit *u) {
        int wd;

        assert(u);

        /* Drops all cgroup bookkeeping kept for this unit, but leaves the cgroup itself alone, i.e. it is
         * *not* destroyed. That makes this safe to call when everything is torn down for reexecution,
         * where the cgroup is supposed to stay in place. */

        if (u->cgroup_path) {
                /* Unregister the path from the manager's lookup table, then release the string and
                 * store mfree()'s return value back into the field. */
                (void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
                u->cgroup_path = mfree(u->cgroup_path);
        }

        /* Watch on the cgroup "control" side: remove it from the manager-wide inotify fd (failure is
         * only logged), forget the wd → unit mapping and mark the descriptor as unset. */
        wd = u->cgroup_control_inotify_wd;
        if (wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup control inotify watch %i for %s, ignoring: %m", wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(wd));
                u->cgroup_control_inotify_wd = -1;
        }

        /* Same procedure for the "memory" watch. */
        wd = u->cgroup_memory_inotify_wd;
        if (wd >= 0) {
                if (inotify_rm_watch(u->manager->cgroup_inotify_fd, wd) < 0)
                        log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", wd, u->id);

                (void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(wd));
                u->cgroup_memory_inotify_wd = -1;
        }
}
|
|
|
|
|
2020-10-23 07:44:22 +02:00
|
|
|
bool unit_maybe_release_cgroup(Unit *u) {
        int empty;

        assert(u);

        /* Returns true if the unit has no cgroup path (left) to track, i.e. either there was none to begin
         * with or it was released here; returns false if the cgroup is kept. */

        if (!u->cgroup_path)
                return true;

        /* As long as processes remain below the cgroup we must not release it: they might exit later
         * (think of processes stuck in D-state that only went away after the unit was already marked as
         * failed), and when we get notified about that the manager still needs to know the cgroup path
         * in order to look the unit up and clean up then. */
        empty = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (empty < 0) {
                /* The check itself failed: log it and keep tracking the cgroup. */
                log_unit_debug_errno(u, empty, "Error checking if the cgroup is recursively empty, ignoring: %m");
                return false;
        }

        /* Only a result of exactly 1 leads to a release; anything else keeps the cgroup tracked. */
        if (empty != 1)
                return false;

        unit_release_cgroup(u);
        return true;
}
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
void unit_prune_cgroup(Unit *u) {
|
2010-03-31 16:29:55 +02:00
|
|
|
int r;
|
core: unified cgroup hierarchy support
This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
bool is_root_slice;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
assert(u);
|
2010-03-31 16:29:55 +02:00
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
/* Removes the cgroup, if empty and possible, and stops watching it. */
|
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
if (!u->cgroup_path)
|
|
|
|
return;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2016-08-18 20:58:10 +02:00
|
|
|
(void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
|
|
|
|
|
|
|
|
r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
|
2019-04-25 09:39:41 +02:00
|
|
|
if (r < 0)
|
|
|
|
/* One reason we could have failed here is, that the cgroup still contains a process.
|
|
|
|
* However, if the cgroup becomes removable at a later time, it might be removed when
|
|
|
|
* the containing slice is stopped. So even if we failed now, this unit shouldn't assume
|
|
|
|
* that the cgroup is still realized the next time it is started. Do not return early
|
|
|
|
* on error, continue cleanup. */
|
2020-09-08 19:28:36 +02:00
|
|
|
log_unit_full_errno(u, r == -EBUSY ? LOG_DEBUG : LOG_WARNING, r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
|
2010-03-31 16:29:55 +02:00
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (is_root_slice)
|
|
|
|
return;
|
|
|
|
|
2020-10-23 07:44:22 +02:00
|
|
|
if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */
|
|
|
|
return;
|
2013-06-30 23:55:36 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
u->cgroup_realized = false;
|
2014-02-14 19:11:07 +01:00
|
|
|
u->cgroup_realized_mask = 0;
|
2016-04-30 22:12:54 +02:00
|
|
|
u->cgroup_enabled_mask = 0;
|
2018-10-08 23:33:05 +02:00
|
|
|
|
|
|
|
u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
|
2010-03-31 16:29:55 +02:00
|
|
|
}
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
int unit_search_main_pid(Unit *u, pid_t *ret) {
|
2013-06-27 04:14:27 +02:00
|
|
|
_cleanup_fclose_ FILE *f = NULL;
|
2019-03-18 11:48:34 +01:00
|
|
|
pid_t pid = 0, npid;
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
int r;
|
2013-06-27 04:14:27 +02:00
|
|
|
|
|
|
|
assert(u);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
assert(ret);
|
2013-06-27 04:14:27 +02:00
|
|
|
|
|
|
|
if (!u->cgroup_path)
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
return -ENXIO;
|
2013-06-27 04:14:27 +02:00
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
2013-06-27 04:14:27 +02:00
|
|
|
|
|
|
|
while (cg_read_pid(f, &npid) > 0) {
|
|
|
|
|
|
|
|
if (npid == pid)
|
|
|
|
continue;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2019-03-18 11:48:34 +01:00
|
|
|
if (pid_is_my_child(npid) == 0)
|
2013-06-27 04:14:27 +02:00
|
|
|
continue;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (pid != 0)
|
2013-06-27 04:14:27 +02:00
|
|
|
/* Dang, there's more than one daemonized PID
|
|
|
|
in this group, so we don't know what process
|
|
|
|
is the main process. */
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
|
|
|
return -ENODATA;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
pid = npid;
|
2010-03-31 16:29:55 +02:00
|
|
|
}
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
*ret = pid;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Walks the cgroup at 'path' and, recursively, every subgroup below it, handing each PID that is read to
 * unit_watch_pid() for unit 'u'.
 *
 * The walk is best-effort: a failure to enumerate or watch does not stop it. The first negative error code
 * seen is remembered and returned once the walk is complete; 0 is returned if nothing failed. The only
 * exception is a failure to allocate a child path, which aborts immediately with -ENOMEM. */
static int unit_watch_pids_in_path(Unit *u, const char *path) {
        _cleanup_closedir_ DIR *subgroups = NULL;
        _cleanup_fclose_ FILE *procs = NULL;
        int first_error = 0, q;

        assert(u);
        assert(path);

        /* Phase 1: the processes that live directly in this cgroup. */
        q = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &procs);
        if (q >= 0) {
                pid_t pid;

                for (;;) {
                        q = cg_read_pid(procs, &pid);
                        if (q <= 0)
                                break; /* end of list (0) or read error (< 0) */

                        q = unit_watch_pid(u, pid, false);
                        if (q < 0 && first_error >= 0)
                                first_error = q;
                }
        }

        /* Here q is either the enumeration failure or the final cg_read_pid() result. */
        if (q < 0 && first_error >= 0)
                first_error = q;

        /* Phase 2: descend into each child cgroup. */
        q = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &subgroups);
        if (q >= 0) {
                char *name;

                for (;;) {
                        _cleanup_free_ char *child = NULL;

                        q = cg_read_subgroup(subgroups, &name);
                        if (q <= 0)
                                break; /* end of list (0) or read error (< 0) */

                        child = path_join(empty_to_root(path), name);
                        free(name); /* the name is only needed to build the child path */

                        if (!child)
                                return -ENOMEM;

                        q = unit_watch_pids_in_path(u, child);
                        if (q < 0 && first_error >= 0)
                                first_error = q;
                }
        }

        /* Here q is either the enumeration failure or the final cg_read_subgroup() result. */
        if (q < 0 && first_error >= 0)
                first_error = q;

        return first_error;
}
|
|
|
|
|
2018-01-12 13:06:48 +01:00
|
|
|
/* Queues a synthetic "cgroup empty" event for 'u' once the unit no longer watches any PIDs.
 *
 * Returns -ENOENT if the unit has no cgroup path, a negative error if the hierarchy type cannot be
 * determined, and 0 otherwise (whether or not an event was actually queued). */
int unit_synthesize_cgroup_empty_event(Unit *u) {
        int unified;

        assert(u);

        /* Compatibility support for non-unified systems: notifications aren't reliable there, so we take
         * whatever we can get as a notification source as soon as there are no useful PIDs left to watch. */

        if (!u->cgroup_path)
                return -ENOENT;

        unified = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (unified < 0)
                return unified;

        /* On unified we have reliable notifications, and don't need this. On legacy, only synthesize the
         * event when the set of watched PIDs has run empty. */
        if (unified == 0 && set_isempty(u->pids))
                unit_add_to_cgroup_empty_queue(u);

        return 0;
}
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
/* Starts watching every PID found in the unit's cgroup, including its subgroups.
 *
 * Returns -ENOENT if the unit has no cgroup path, a negative error if the hierarchy type cannot be
 * determined, 0 without doing anything on the unified hierarchy, and otherwise the result of
 * unit_watch_pids_in_path(). */
int unit_watch_all_pids(Unit *u) {
        int unified;

        assert(u);

        /* This is fallback logic for setups where cgroup empty notifications are not reliable: the PIDs of
         * our cgroup are added to the set of PIDs we watch, so that SIGCHLD can serve as a replacement. */

        if (!u->cgroup_path)
                return -ENOENT;

        unified = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (unified < 0)
                return unified;

        if (unified == 0) /* Legacy hierarchy: walk the cgroup tree and watch what we find */
                return unit_watch_pids_in_path(u, u->cgroup_path);

        /* On unified we can use proper notifications */
        return 0;
}
|
|
|
|
|
2017-09-26 22:43:08 +02:00
|
|
|
static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
|
|
|
|
Manager *m = userdata;
|
|
|
|
Unit *u;
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
int r;
|
|
|
|
|
2017-09-26 22:43:08 +02:00
|
|
|
assert(s);
|
|
|
|
assert(m);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2017-09-26 22:43:08 +02:00
|
|
|
u = m->cgroup_empty_queue;
|
|
|
|
if (!u)
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
return 0;
|
|
|
|
|
2017-09-26 22:43:08 +02:00
|
|
|
assert(u->in_cgroup_empty_queue);
|
|
|
|
u->in_cgroup_empty_queue = false;
|
|
|
|
LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
|
|
|
|
|
|
|
|
if (m->cgroup_empty_queue) {
|
|
|
|
/* More stuff queued, let's make sure we remain enabled */
|
|
|
|
r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
|
|
|
|
if (r < 0)
|
2018-05-31 15:40:54 +02:00
|
|
|
log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m");
|
2017-09-26 22:43:08 +02:00
|
|
|
}
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
|
|
|
unit_add_to_gc_queue(u);
|
|
|
|
|
|
|
|
if (UNIT_VTABLE(u)->notify_cgroup_empty)
|
|
|
|
UNIT_VTABLE(u)->notify_cgroup_empty(u);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-09-26 22:43:08 +02:00
|
|
|
/* Puts the unit on the manager's cgroup-empty queue and arms the deferred event source that drains it.
 * Nothing happens if the unit is already queued, has no cgroup path, or its cgroup cannot be confirmed
 * to be empty. */
void unit_add_to_cgroup_empty_queue(Unit *u) {
        Manager *m;
        int empty, r;

        assert(u);

        /* Cgroup empty events may arrive through four different channels:
         *
         * 1. Unified hierarchy: an inotify event on the cgroup
         *
         * 2. Legacy hierarchy, system mode: a datagram on the cgroup agent socket
         *
         * 3. Legacy hierarchy, user mode: a D-Bus signal on the system bus
         *
         * 4. Legacy hierarchy, service units: after the first SIGCHLD we start watching all processes of
         *    the cgroup for SIGCHLD, to cope with unreliable cgroup notifications.
         *
         * Whatever the channel, the notification is verified here and the unit is then put on a separate
         * queue. That queue is dispatched at a lower priority than the SIGCHLD handler, so SIGCHLD is
         * always preferred when available, and the cgroup empty notification is only used when no SIGCHLD
         * is pending (which may happen if the cgroup contains processes that are not our own children, as
         * is typical for scope units). */

        /* Nothing to do if we are queued already, or if there is no cgroup whose state we could verify */
        if (u->in_cgroup_empty_queue || !u->cgroup_path)
                return;

        /* Double-check that the cgroup really is empty before queueing */
        empty = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
        if (empty < 0) {
                log_unit_debug_errno(u, empty, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
                return;
        }
        if (empty == 0)
                return;

        m = u->manager;

        LIST_PREPEND(cgroup_empty_queue, m->cgroup_empty_queue, u);
        u->in_cgroup_empty_queue = true;

        /* Arm the defer event source so that the queue gets dispatched */
        r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
}
|
|
|
|
|
2020-04-29 17:53:43 +02:00
|
|
|
/* Drops the unit from the manager's cgroup-empty queue again. A no-op if the unit is not queued. */
static void unit_remove_from_cgroup_empty_queue(Unit *u) {
        assert(u);

        if (u->in_cgroup_empty_queue) {
                u->in_cgroup_empty_queue = false;
                LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
        }
}
|
|
|
|
|
2020-09-09 09:24:23 +02:00
|
|
|
/* Checks the "user.systemd_oomd_kill" extended attribute of the unit's cgroup and compares the counter
 * stored there with the value remembered in u->managed_oom_kill_last.
 *
 * Returns 1 if the counter grew since the previous call (a log message is generated in that case), 0 if
 * it did not grow, the unit has no cgroup path, or we are not in all-unified mode, and a negative
 * errno-style value on failure. */
int unit_check_oomd_kill(Unit *u) {
        _cleanup_free_ char *xattr = NULL;
        uint64_t kills = 0;
        bool grew;
        int r;

        if (!u->cgroup_path)
                return 0;

        r = cg_all_unified();
        if (r < 0)
                return log_unit_debug_errno(u, r, "Couldn't determine whether we are in all unified mode: %m");
        if (r == 0)
                return 0;

        /* A missing attribute (ENODATA) is not an error: the counter simply stays at zero */
        r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.systemd_oomd_kill", &xattr);
        if (r < 0 && r != -ENODATA)
                return r;

        if (!isempty(xattr)) {
                r = safe_atou64(xattr, &kills);
                if (r < 0)
                        return r;
        }

        /* Remember the new value unconditionally, but only report when it went up */
        grew = kills > u->managed_oom_kill_last;
        u->managed_oom_kill_last = kills;

        if (!grew)
                return 0;

        if (kills > 0)
                log_struct(LOG_NOTICE,
                           "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR,
                           LOG_UNIT_ID(u),
                           LOG_UNIT_INVOCATION_ID(u),
                           LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", kills));

        return 1;
}
|
|
|
|
|
2019-05-19 15:52:02 +02:00
|
|
|
/* Reads the "oom_kill" field of the "memory.events" attribute of the unit's cgroup and compares it with
 * the value remembered in u->oom_kill_last.
 *
 * Returns 1 if the counter grew since the previous call (in which case a log message is generated and
 * the unit type's notify_cgroup_oom() hook is invoked, if there is one), 0 if it did not grow or the
 * unit has no cgroup path, and a negative errno-style value on failure. */
int unit_check_oom(Unit *u) {
        _cleanup_free_ char *field = NULL;
        uint64_t count;
        bool grew;
        int r;

        if (!u->cgroup_path)
                return 0;

        r = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &field);
        if (r < 0)
                return log_unit_debug_errno(u, r, "Failed to read oom_kill field of memory.events cgroup attribute: %m");

        r = safe_atou64(field, &count);
        if (r < 0)
                return log_unit_debug_errno(u, r, "Failed to parse oom_kill field: %m");

        /* Remember the new value unconditionally, but only report when it went up */
        grew = count > u->oom_kill_last;
        u->oom_kill_last = count;

        if (!grew)
                return 0;

        log_struct(LOG_NOTICE,
                   "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
                   LOG_UNIT_ID(u),
                   LOG_UNIT_INVOCATION_ID(u),
                   LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));

        /* Let the unit type react, if it cares */
        if (UNIT_VTABLE(u)->notify_cgroup_oom)
                UNIT_VTABLE(u)->notify_cgroup_oom(u);

        return 1;
}
|
|
|
|
|
|
|
|
/* Deferred event handler: takes one unit off the manager's cgroup OOM queue per invocation and checks
 * its OOM kill counter. Re-arms the event source if further units remain queued. Always returns 0. */
static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
        Manager *m = userdata;
        Unit *head;
        int r;

        assert(s);
        assert(m);

        head = m->cgroup_oom_queue;
        if (!head)
                return 0;

        /* Dequeue the first entry */
        assert(head->in_cgroup_oom_queue);
        LIST_REMOVE(cgroup_oom_queue, m->cgroup_oom_queue, head);
        head->in_cgroup_oom_queue = false;

        if (m->cgroup_oom_queue) {
                /* There's more queued, hence make sure we get called again */
                r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (r < 0)
                        log_debug_errno(r, "Failed to reenable cgroup oom event source, ignoring: %m");
        }

        (void) unit_check_oom(head);
        return 0;
}
|
|
|
|
|
|
|
|
/* Puts the unit on the manager's cgroup OOM queue and arms the deferred event source that drains it. The
 * event source is allocated on first use. Nothing happens if the unit is already queued or has no cgroup
 * path. */
static void unit_add_to_cgroup_oom_queue(Unit *u) {
        Manager *m;
        int r;

        assert(u);

        if (u->in_cgroup_oom_queue || !u->cgroup_path)
                return;

        m = u->manager;

        LIST_PREPEND(cgroup_oom_queue, m->cgroup_oom_queue, u);
        u->in_cgroup_oom_queue = true;

        /* Set up the defer event source if this is the first time it is needed */
        if (!m->cgroup_oom_event_source) {
                _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL;

                r = sd_event_add_defer(m->event, &source, on_cgroup_oom_event, m);
                if (r < 0) {
                        log_error_errno(r, "Failed to create cgroup oom event source: %m");
                        return;
                }

                r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-8);
                if (r < 0) {
                        log_error_errno(r, "Failed to set priority of cgroup oom event source: %m");
                        return;
                }

                (void) sd_event_source_set_description(source, "cgroup-oom");

                /* Hand ownership over to the manager, so that the cleanup handler leaves it alone */
                m->cgroup_oom_event_source = TAKE_PTR(source);
        }

        /* Trigger the defer event */
        r = sd_event_source_set_enabled(m->cgroup_oom_event_source, SD_EVENT_ONESHOT);
        if (r < 0)
                log_error_errno(r, "Failed to enable cgroup oom event source: %m");
}
|
|
|
|
|
2020-04-29 17:53:43 +02:00
|
|
|
/* Reads the "populated" and "frozen" keys from the "cgroup.events" attribute of the unit's cgroup and
 * updates the cgroup-empty queue membership and the freezer state of the unit accordingly.
 *
 * Returns 0 on success or if the unit has no cgroup path, and the negative errno-style value reported
 * by the attribute reader on failure. */
static int unit_check_cgroup_events(Unit *u) {
        char *values[2] = {};
        int r;

        assert(u);

        /* Without a cgroup path there is no attribute file that belongs to this unit, hence bail out
         * early, the same way the other cgroup checks in this file do. */
        if (!u->cgroup_path)
                return 0;

        /* values[0] receives "populated", values[1] receives "frozen" (same order as the key list).
         * NOTE(review): presumably the "graceful" variant leaves entries NULL for keys that are absent,
         * which is why both are NULL-checked below; confirm against the helper. */
        r = cg_get_keyed_attribute_graceful(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.events",
                                            STRV_MAKE("populated", "frozen"), values);
        if (r < 0)
                return r;

        /* The cgroup.events notifications can be merged together so act as we saw the given state for the
         * first time. The functions we call to handle given state are idempotent, which makes them
         * effectively remember the previous state. */
        if (values[0]) {
                if (streq(values[0], "1"))
                        unit_remove_from_cgroup_empty_queue(u);
                else
                        unit_add_to_cgroup_empty_queue(u);
        }

        /* Disregard freezer state changes due to operations not initiated by us */
        if (values[1] && IN_SET(u->freezer_state, FREEZER_FREEZING, FREEZER_THAWING)) {
                if (streq(values[1], "0"))
                        unit_thawed(u);
                else
                        unit_frozen(u);
        }

        free(values[0]);
        free(values[1]);

        return 0;
}
|
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
/* I/O event handler for the manager's cgroup inotify fd. Reads inotify events until the read fails; for
 * each event the watch descriptor is looked up in both wd-to-unit maps: a hit in the control map results
 * in a cgroup.events check, a hit in the memory map queues the unit for an OOM check.
 *
 * Returns 0 once read() fails with EINTR or EAGAIN, and a negative errno-style value on any other read
 * failure. */
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        Manager *m = userdata;

        assert(s);
        assert(fd >= 0);
        assert(m);

        for (;;) {
                union inotify_event_buffer buffer;
                struct inotify_event *e;
                ssize_t l;

                l = read(fd, &buffer, sizeof(buffer));
                if (l < 0) {
                        if (!IN_SET(errno, EINTR, EAGAIN))
                                return log_error_errno(errno, "Failed to read control group inotify events: %m");

                        return 0;
                }

                FOREACH_INOTIFY_EVENT(e, buffer, l) {
                        Unit *control_unit, *memory_unit;

                        /* Skip queue overflow events (which carry no watch descriptor) as well as
                         * notifications about a watch that was just removed. */
                        if (e->wd < 0 || (e->mask & IN_IGNORED))
                                continue;

                        /* Note that inotify might deliver events for a watch even after it was removed,
                         * because it was queued before the removal. Let's ignore this here safely. */

                        control_unit = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (control_unit)
                                (void) unit_check_cgroup_events(control_unit);

                        memory_unit = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
                        if (memory_unit)
                                unit_add_to_cgroup_oom_queue(memory_unit);
                }
        }
}
|
|
|
|
|
2018-09-30 12:33:16 +02:00
|
|
|
/* Builds the mask of BPF-based pseudo-controllers that are reported as supported and stores it in *ret.
 * Always returns 0; a probe that returns zero or a negative value just leaves its bit unset. */
static int cg_bpf_mask_supported(CGroupMask *ret) {
        CGroupMask supported = 0;

        /* BPF-based firewall */
        if (bpf_firewall_supported() > 0)
                supported |= CGROUP_MASK_BPF_FIREWALL;

        /* BPF-based device access control */
        if (bpf_devices_supported() > 0)
                supported |= CGROUP_MASK_BPF_DEVICES;

        *ret = supported;
        return 0;
}
|
|
|
|
|
2010-03-31 16:29:55 +02:00
|
|
|
int manager_setup_cgroup(Manager *m) {
|
2013-06-20 03:45:08 +02:00
|
|
|
_cleanup_free_ char *path = NULL;
|
2017-09-05 11:40:47 +02:00
|
|
|
const char *scope_path;
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
CGroupController c;
|
2017-02-24 17:52:58 +01:00
|
|
|
int r, all_unified;
|
2018-09-30 12:33:16 +02:00
|
|
|
CGroupMask mask;
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
char *e;
|
2010-03-31 16:29:55 +02:00
|
|
|
|
|
|
|
assert(m);
|
|
|
|
|
2010-07-12 18:16:44 +02:00
|
|
|
/* 1. Determine hierarchy */
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
m->cgroup_root = mfree(m->cgroup_root);
|
2013-06-20 03:45:08 +02:00
|
|
|
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
|
2014-11-28 18:23:20 +01:00
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
|
2010-03-31 16:29:55 +02:00
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
/* Chop off the init scope, if we are already located in it */
|
|
|
|
e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
|
2014-01-07 14:41:24 +01:00
|
|
|
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
/* LEGACY: Also chop off the system slice if we are in
|
|
|
|
* it. This is to support live upgrades from older systemd
|
|
|
|
* versions where PID 1 was moved there. Also see
|
|
|
|
* cg_get_root_path(). */
|
2016-02-24 21:24:23 +01:00
|
|
|
if (!e && MANAGER_IS_SYSTEM(m)) {
|
2013-06-20 03:45:08 +02:00
|
|
|
e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
|
2013-11-05 22:14:52 +01:00
|
|
|
if (!e)
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
e = endswith(m->cgroup_root, "/system"); /* even more legacy */
|
2010-11-15 23:55:53 +01:00
|
|
|
}
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
if (e)
|
|
|
|
*e = 0;
|
2010-04-21 03:18:05 +02:00
|
|
|
|
2017-11-09 11:12:47 +01:00
|
|
|
/* And make sure to store away the root value without trailing slash, even for the root dir, so that we can
|
|
|
|
* easily prepend it everywhere. */
|
|
|
|
delete_trailing_chars(m->cgroup_root, "/");
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2010-07-12 18:16:44 +02:00
|
|
|
/* 2. Show data */
|
2013-06-20 03:45:08 +02:00
|
|
|
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
|
2014-11-28 18:23:20 +01:00
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Cannot find cgroup mount point: %m");
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2019-08-01 12:48:41 +02:00
|
|
|
r = cg_unified();
|
2016-11-21 20:45:53 +01:00
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
|
2016-08-16 00:13:36 +02:00
|
|
|
|
2017-02-24 17:52:58 +01:00
|
|
|
all_unified = cg_all_unified();
|
2017-11-15 10:19:45 +01:00
|
|
|
if (all_unified < 0)
|
|
|
|
return log_error_errno(all_unified, "Couldn't determine whether we are in all unified mode: %m");
|
|
|
|
if (all_unified > 0)
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
log_debug("Unified cgroup hierarchy is located at %s.", path);
|
2017-02-24 17:52:58 +01:00
|
|
|
else {
|
2017-02-24 18:00:04 +01:00
|
|
|
r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
|
2017-02-24 17:52:58 +01:00
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Failed to determine whether systemd's own controller is in unified mode: %m");
|
|
|
|
if (r > 0)
|
|
|
|
log_debug("Unified cgroup hierarchy is located at %s. Controllers are on legacy hierarchies.", path);
|
|
|
|
else
|
|
|
|
log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
|
|
|
|
}
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2017-09-26 22:43:08 +02:00
|
|
|
/* 3. Allocate cgroup empty defer event source */
|
|
|
|
m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
|
|
|
|
r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
|
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Failed to create cgroup empty event source: %m");
|
|
|
|
|
2019-03-18 20:21:11 +01:00
|
|
|
/* Schedule cgroup empty checks early, but after having processed service notification messages or
|
|
|
|
* SIGCHLD signals, so that a cgroup running empty is always just the last safety net of
|
|
|
|
* notification, and we collected the metadata the notification and SIGCHLD stuff offers first. */
|
2017-09-26 22:43:08 +02:00
|
|
|
r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
|
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
|
|
|
|
|
|
|
|
r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
|
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
|
|
|
|
|
|
|
|
(void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
|
|
|
|
|
|
|
|
/* 4. Install notifier inotify object, or agent */
|
2017-09-05 11:40:47 +02:00
|
|
|
if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
|
2010-07-11 00:50:49 +02:00
|
|
|
|
2017-09-26 22:43:08 +02:00
|
|
|
/* In the unified hierarchy we can get cgroup empty notifications via inotify. */
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2017-09-05 11:40:47 +02:00
|
|
|
m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
|
|
|
|
safe_close(m->cgroup_inotify_fd);
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2017-09-05 11:40:47 +02:00
|
|
|
m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
|
|
|
|
if (m->cgroup_inotify_fd < 0)
|
|
|
|
return log_error_errno(errno, "Failed to create control group inotify object: %m");
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2017-09-05 11:40:47 +02:00
|
|
|
r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
|
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Failed to watch control group inotify object: %m");
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2019-03-18 20:21:11 +01:00
|
|
|
/* Process cgroup empty notifications early. Note that when this event is dispatched it'll
|
|
|
|
* just add the unit to a cgroup empty queue, hence let's run earlier than that. Also see
|
|
|
|
* handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
|
|
|
|
r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-9);
|
2017-09-05 11:40:47 +02:00
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Failed to set priority of inotify event source: %m");
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2017-09-05 11:40:47 +02:00
|
|
|
(void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2018-11-20 22:42:16 +01:00
|
|
|
} else if (MANAGER_IS_SYSTEM(m) && manager_owns_host_root_cgroup(m) && !MANAGER_IS_TEST_RUN(m)) {
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2017-09-05 11:40:47 +02:00
|
|
|
/* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable,
|
|
|
|
* since it does not generate events when control groups with children run empty.) */
|
2010-03-31 16:29:55 +02:00
|
|
|
|
2020-11-30 11:09:37 +01:00
|
|
|
r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUPS_AGENT_PATH);
|
2014-11-28 18:23:20 +01:00
|
|
|
if (r < 0)
|
2017-09-05 11:40:47 +02:00
|
|
|
log_warning_errno(r, "Failed to install release agent, ignoring: %m");
|
|
|
|
else if (r > 0)
|
|
|
|
log_debug("Installed release agent.");
|
|
|
|
else if (r == 0)
|
|
|
|
log_debug("Release agent already installed.");
|
|
|
|
}
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
|
2017-09-26 22:43:08 +02:00
|
|
|
/* 5. Make sure we are in the special "init.scope" unit in the root slice. */
|
2017-09-05 11:40:47 +02:00
|
|
|
scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
|
|
|
|
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
|
2018-03-09 23:30:32 +01:00
|
|
|
if (r >= 0) {
|
|
|
|
/* Also, move all other userspace processes remaining in the root cgroup into that scope. */
|
|
|
|
r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
|
|
|
|
if (r < 0)
|
|
|
|
log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
|
2010-07-11 00:50:49 +02:00
|
|
|
|
2018-03-09 23:30:32 +01:00
|
|
|
/* 6. And pin it, so that it cannot be unmounted */
|
|
|
|
safe_close(m->pin_cgroupfs_fd);
|
|
|
|
m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
|
|
|
|
if (m->pin_cgroupfs_fd < 0)
|
|
|
|
return log_error_errno(errno, "Failed to open pin file: %m");
|
2014-01-07 14:41:24 +01:00
|
|
|
|
2018-10-09 16:15:54 +02:00
|
|
|
} else if (!MANAGER_IS_TEST_RUN(m))
|
2018-03-09 23:30:32 +01:00
|
|
|
return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
|
2017-09-05 11:40:47 +02:00
|
|
|
|
2017-09-26 22:43:08 +02:00
|
|
|
/* 7. Always enable hierarchical support if it exists... */
|
2018-10-09 16:15:54 +02:00
|
|
|
if (!all_unified && !MANAGER_IS_TEST_RUN(m))
|
2017-09-05 11:40:47 +02:00
|
|
|
(void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
|
2010-07-11 00:50:49 +02:00
|
|
|
|
2018-09-30 12:33:16 +02:00
|
|
|
/* 8. Figure out which controllers are supported */
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
r = cg_mask_supported(&m->cgroup_supported);
|
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Failed to determine supported controllers: %m");
|
2018-09-30 12:33:16 +02:00
|
|
|
|
|
|
|
/* 9. Figure out which bpf-based pseudo-controllers are supported */
|
|
|
|
r = cg_bpf_mask_supported(&mask);
|
|
|
|
if (r < 0)
|
|
|
|
return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
|
|
|
|
m->cgroup_supported |= mask;
|
|
|
|
|
|
|
|
/* 10. Log which controllers are supported */
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
|
2016-06-20 20:40:46 +02:00
|
|
|
log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
|
2012-04-14 02:34:10 +02:00
|
|
|
|
2013-04-15 21:52:56 +02:00
|
|
|
return 0;
|
2010-03-31 16:29:55 +02:00
|
|
|
}
|
|
|
|
|
2010-07-11 00:50:49 +02:00
|
|
|
/* Releases the cgroup-related state held by the manager.
 *
 * m      - the manager whose cgroup state is torn down; must not be NULL.
 * delete - if true (and a cgroup root is known, and this is not a minimal test run), the manager's
 *          root cgroup is trimmed first.
 *
 * Every field released below is overwritten with the value returned by its release helper. */
void manager_shutdown_cgroup(Manager *m, bool delete) {
        bool trim_root;

        assert(m);

        /* The cgroup we are running in cannot be removed from underneath us, hence the most we can do
         * is trim it. The result of the trim is deliberately ignored. */
        trim_root = delete && m->cgroup_root && m->test_run_flags != MANAGER_TEST_RUN_MINIMAL;
        if (trim_root)
                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        /* Drop the event source used for cgroup-empty notifications. */
        m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);

        /* Drop both inotify watch-descriptor lookup tables. */
        m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
        m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);

        /* Release the inotify event source first, and only then close the fd it was set up for. */
        m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
        m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        /* Finally forget the root path itself. */
        m->cgroup_root = mfree(m->cgroup_root);
}
|
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
/* Finds the unit responsible for a cgroup path.
 *
 * The path is first looked up verbatim in m->cgroup_unit. If that yields nothing, trailing path
 * components are cut off one at a time and the lookup is retried with each shorter prefix. Once no
 * '/' is left, or the only one left is the leading character, the entry stored under
 * SPECIAL_ROOT_SLICE in the same table is returned instead.
 *
 * m      - the manager owning the lookup table; must not be NULL.
 * cgroup - the cgroup path to resolve; must not be NULL.
 *
 * Returns whatever hashmap_get() yields for the first key that matches, or for the
 * SPECIAL_ROOT_SLICE fallback key. */
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *prefix;
        Unit *found;

        assert(m);
        assert(cgroup);

        /* Exact match first. */
        found = hashmap_get(m->cgroup_unit, cgroup);
        if (found)
                return found;

        /* Work on a private copy so that it can be shortened in place.
         * NOTE(review): strdupa() puts the copy on the stack and the path length is not bounded
         * here; confirm callers only pass reasonably sized paths. */
        prefix = strdupa(cgroup);

        for (;;) {
                char *slash = strrchr(prefix, '/');

                /* Nothing left to strip: no separator at all, or only the leading one. */
                if (!slash || slash == prefix)
                        break;

                /* Chop off the last component and retry with the parent path. */
                *slash = 0;

                found = hashmap_get(m->cgroup_unit, prefix);
                if (found)
                        return found;
        }

        return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
}
|
|
|
|
|
2015-09-03 14:57:44 +02:00
|
|
|
/* Resolves a PID to a unit purely via the cgroup the process is in.
 *
 * m   - the manager to search; must not be NULL.
 * pid - the process to look up.
 *
 * Returns NULL if the PID is not valid or its cgroup path cannot be determined; otherwise the
 * result of manager_get_unit_by_cgroup() for that path. */
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *path = NULL;

        assert(m);

        /* The path is only queried for valid PIDs (short-circuit evaluation). */
        if (!pid_is_valid(pid) ||
            cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &path) < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, path);
}
|
|
|
|
|
|
|
|
Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
|
core: rework how we track which PIDs to watch for a unit
Previously, we'd maintain two hashmaps keyed by PIDs, pointing to Unit
interested in SIGCHLD events for them. This scheme allowed a specific
PID to be watched by exactly 0, 1 or 2 units.
With this rework this is replaced by a single hashmap which is primarily
keyed by the PID and points to a Unit interested in it. However, it
is optionally also keyed by the negated PID, in which case it points to a
NULL terminated array of additional Unit objects also interested. This
scheme means arbitrary numbers of Units may now watch the same PID.
Runtime and memory behaviour should not be impacted by this change, as for
the common case (i.e. each PID only watched by a single unit) behaviour
stays the same, but for the uncommon case (a PID watched by more than
one unit) we only pay with a single additional memory allocation for the
array.
Why this all? Primarily, because allowing exactly two units to watch a
specific PID is not sufficient for some niche cases, as processes can
belong to more than one unit these days:
1. sd_notify() with MAINPID= can be used to attach a process from a
different cgroup to multiple units.
2. Similarly, the PIDFile= setting in unit files can be used for similar
setups,
3. By creating a scope unit a main process of a service may join a
different unit, too.
4. On cgroupsv1 we frequently end up watching all processes remaining in
a scope, and if a process opens lots of scopes one after the other it
might thus end up being watched by many of them.
This patch hence removes the 2-unit-per-PID limit. It also makes a
couple of other changes, some of them quite relevant:
- manager_get_unit_by_pid() (and the bus call wrapping it) when there's
ambiguity will prefer returning the Unit the process belongs to based on
cgroup membership, and only check the watch-pids hashmap if that
fails. This change in logic is probably more in line with what people
expect and makes things more stable as each process can belong to
exactly one cgroup only.
- Every SIGCHLD event is now dispatched to all units interested in its
PID. Previously, there was some magic conditionalization: the SIGCHLD
would only be dispatched to the unit if it was only interested in a
single PID only, or the PID belonged to the control or main PID or we
didn't dispatch a single SIGCHLD to the unit in the current event loop
iteration yet. These rules were quite arbitrary and also redundant as
the per-unit handlers would filter the PIDs anyway a second time.
With this change we'll hence relax the rules: all we do now is
dispatch every SIGCHLD event exactly once to each unit interested in
it, and it's up to the unit to then use or ignore this. We use a
generation counter in the unit to ensure that we only invoke the unit
handler once for each event, protecting us from confusion if a unit is
both associated with a specific PID through cgroup membership and
through the "watch_pids" logic. It also protects us from being
confused if the "watch_pids" hashmap is altered while we are
dispatching to it (which is a very likely case).
- sd_notify() message dispatching has been reworked to be very similar
to SIGCHLD handling now. A generation counter is used for dispatching
as well.
This also adds a new test that validates that "watch_pid" registration
and unregistration works correctly.
2018-01-12 13:41:05 +01:00
|
|
|
Unit *u, **array;
|
2015-09-03 14:57:44 +02:00
|
|
|
|
|
|
|
assert(m);
|
|
|
|
|
core: rework how we track which PIDs to watch for a unit
Previously, we'd maintain two hashmaps keyed by PIDs, pointing to Unit
interested in SIGCHLD events for them. This scheme allowed a specific
PID to be watched by exactly 0, 1 or 2 units.
With this rework this is replaced by a single hashmap which is primarily
keyed by the PID and points to a Unit interested in it. However, it
is optionally also keyed by the negated PID, in which case it points to a
NULL terminated array of additional Unit objects also interested. This
scheme means arbitrary numbers of Units may now watch the same PID.
Runtime and memory behaviour should not be impacted by this change, as for
the common case (i.e. each PID only watched by a single unit) behaviour
stays the same, but for the uncommon case (a PID watched by more than
one unit) we only pay with a single additional memory allocation for the
array.
Why this all? Primarily, because allowing exactly two units to watch a
specific PID is not sufficient for some niche cases, as processes can
belong to more than one unit these days:
1. sd_notify() with MAINPID= can be used to attach a process from a
different cgroup to multiple units.
2. Similarly, the PIDFile= setting in unit files can be used for similar
setups,
3. By creating a scope unit a main process of a service may join a
different unit, too.
4. On cgroupsv1 we frequently end up watching all processes remaining in
a scope, and if a process opens lots of scopes one after the other it
might thus end up being watched by many of them.
This patch hence removes the 2-unit-per-PID limit. It also makes a
couple of other changes, some of them quite relevant:
- manager_get_unit_by_pid() (and the bus call wrapping it) when there's
ambiguity will prefer returning the Unit the process belongs to based on
cgroup membership, and only check the watch-pids hashmap if that
fails. This change in logic is probably more in line with what people
expect and makes things more stable as each process can belong to
exactly one cgroup only.
- Every SIGCHLD event is now dispatched to all units interested in its
PID. Previously, there was some magic conditionalization: the SIGCHLD
would only be dispatched to the unit if it was only interested in a
single PID only, or the PID belonged to the control or main PID or we
didn't dispatch a single SIGCHLD to the unit in the current event loop
iteration yet. These rules were quite arbitrary and also redundant as
the per-unit handlers would filter the PIDs anyway a second time.
With this change we'll hence relax the rules: all we do now is
dispatch every SIGCHLD event exactly once to each unit interested in
it, and it's up to the unit to then use or ignore this. We use a
generation counter in the unit to ensure that we only invoke the unit
handler once for each event, protecting us from confusion if a unit is
both associated with a specific PID through cgroup membership and
through the "watch_pids" logic. It also protects us from being
confused if the "watch_pids" hashmap is altered while we are
dispatching to it (which is a very likely case).
- sd_notify() message dispatching has been reworked to be very similar
to SIGCHLD handling now. A generation counter is used for dispatching
as well.
This also adds a new test that validates that "watch_pid" registration
and unregistration works correctly.
2018-01-12 13:41:05 +01:00
|
|
|
/* Note that a process might be owned by multiple units, we return only one here, which is good enough for most
|
|
|
|
* cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most
|
|
|
|
* relevant one as children of the process will be assigned to that one, too, before all else. */
|
|
|
|
|
|
|
|
if (!pid_is_valid(pid))
|
2010-06-16 05:10:31 +02:00
|
|
|
return NULL;
|
|
|
|
|
2018-01-11 23:38:46 +01:00
|
|
|
if (pid == getpid_cached())
|
core: unified cgroup hierarchy support
This patch set adds full support for the new unified cgroup hierarchy logic
of modern kernels.
A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).
It is possible to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.
The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and add each cgroup to it.
This patch also removes cg_delete() which is unused now.
On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified hierarchy mode.
This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.
This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicely, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.
The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.
To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.
This patch set carefully makes sure that cgls and cgtop continue to work
as desired.
When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
2015-09-01 19:22:36 +02:00
|
|
|
return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
|
|
|
|
|
core: rework how we track which PIDs to watch for a unit
Previously, we'd maintain two hashmaps keyed by PIDs, pointing to Unit
interested in SIGCHLD events for them. This scheme allowed a specific
PID to be watched by exactly 0, 1 or 2 units.
With this rework this is replaced by a single hashmap which is primarily
keyed by the PID and points to a Unit interested in it. However, it
is optionally also keyed by the negated PID, in which case it points to a
NULL terminated array of additional Unit objects also interested. This
scheme means arbitrary numbers of Units may now watch the same PID.
Runtime and memory behaviour should not be impacted by this change, as for
the common case (i.e. each PID only watched by a single unit) behaviour
stays the same, but for the uncommon case (a PID watched by more than
one unit) we only pay with a single additional memory allocation for the
array.
Why this all? Primarily, because allowing exactly two units to watch a
specific PID is not sufficient for some niche cases, as processes can
belong to more than one unit these days:
1. sd_notify() with MAINPID= can be used to attach a process from a
different cgroup to multiple units.
2. Similarly, the PIDFile= setting in unit files can be used for similar
setups,
3. By creating a scope unit a main process of a service may join a
different unit, too.
4. On cgroupsv1 we frequently end up watching all processes remaining in
a scope, and if a process opens lots of scopes one after the other it
might thus end up being watched by many of them.
This patch hence removes the 2-unit-per-PID limit. It also makes a
couple of other changes, some of them quite relevant:
- manager_get_unit_by_pid() (and the bus call wrapping it) when there's
ambiguity will prefer returning the Unit the process belongs to based on
cgroup membership, and only check the watch-pids hashmap if that
fails. This change in logic is probably more in line with what people
expect and makes things more stable as each process can belong to
exactly one cgroup only.
- Every SIGCHLD event is now dispatched to all units interested in its
PID. Previously, there was some magic conditionalization: the SIGCHLD
would only be dispatched to the unit if it was only interested in a
single PID only, or the PID belonged to the control or main PID or we
didn't dispatch a single SIGCHLD to the unit in the current event loop
iteration yet. These rules were quite arbitrary and also redundant as
the per-unit handlers would filter the PIDs anyway a second time.
With this change we'll hence relax the rules: all we do now is
dispatch every SIGCHLD event exactly once to each unit interested in
it, and it's up to the unit to then use or ignore this. We use a
generation counter in the unit to ensure that we only invoke the unit
handler once for each event, protecting us from confusion if a unit is
both associated with a specific PID through cgroup membership and
through the "watch_pids" logic. It also protects us from being
confused if the "watch_pids" hashmap is altered while we are
dispatching to it (which is a very likely case).
- sd_notify() message dispatching has been reworked to be very similar
to SIGCHLD handling now. A generation counter is used for dispatching
as well.
This also adds a new test that validates that "watch_pid" registration
and unregistration works correctly.
2018-01-12 13:41:05 +01:00
|
|
|
u = manager_get_unit_by_pid_cgroup(m, pid);
|
2015-09-01 18:47:46 +02:00
|
|
|
if (u)
|
|
|
|
return u;
|
|
|
|
|
core: rework how we track which PIDs to watch for a unit
Previously, we'd maintain two hashmaps keyed by PIDs, pointing to Unit
interested in SIGCHLD events for them. This scheme allowed a specific
PID to be watched by exactly 0, 1 or 2 units.
With this rework this is replaced by a single hashmap which is primarily
keyed by the PID and points to a Unit interested in it. However, it
is optionally also keyed by the negated PID, in which case it points to a
NULL terminated array of additional Unit objects also interested. This
scheme means arbitrary numbers of Units may now watch the same PID.
Runtime and memory behaviour should not be impacted by this change, as for
the common case (i.e. each PID only watched by a single unit) behaviour
stays the same, but for the uncommon case (a PID watched by more than
one unit) we only pay with a single additional memory allocation for the
array.
Why this all? Primarily, because allowing exactly two units to watch a
specific PID is not sufficient for some niche cases, as processes can
belong to more than one unit these days:
1. sd_notify() with MAINPID= can be used to attach a process from a
different cgroup to multiple units.
2. Similarly, the PIDFile= setting in unit files can be used for similar
setups,
3. By creating a scope unit a main process of a service may join a
different unit, too.
4. On cgroupsv1 we frequently end up watching all processes remaining in
a scope, and if a process opens lots of scopes one after the other it
might thus end up being watched by many of them.
This patch hence removes the 2-unit-per-PID limit. It also makes a
couple of other changes, some of them quite relevant:
- manager_get_unit_by_pid() (and the bus call wrapping it) when there's
ambiguity will prefer returning the Unit the process belongs to based on
cgroup membership, and only check the watch-pids hashmap if that
fails. This change in logic is probably more in line with what people
expect and makes things more stable as each process can belong to
exactly one cgroup only.
- Every SIGCHLD event is now dispatched to all units interested in its
PID. Previously, there was some magic conditionalization: the SIGCHLD
would only be dispatched to the unit if it was only interested in a
single PID only, or the PID belonged to the control or main PID or we
didn't dispatch a single SIGCHLD to the unit in the current event loop
iteration yet. These rules were quite arbitrary and also redundant as
the per-unit handlers would filter the PIDs anyway a second time.
With this change we'll hence relax the rules: all we do now is
dispatch every SIGCHLD event exactly once to each unit interested in
it, and it's up to the unit to then use or ignore this. We use a
generation counter in the unit to ensure that we only invoke the unit
handler once for each event, protecting us from confusion if a unit is
both associated with a specific PID through cgroup membership and
through the "watch_pids" logic. It also protects us from being
confused if the "watch_pids" hashmap is altered while we are
dispatching to it (which is a very likely case).
- sd_notify() message dispatching has been reworked to be very similar
to SIGCHLD handling now. A generation counter is used for dispatching
as well.
This also adds a new test that validates that "watch_pid" registration
and unregistration works correctly.
2018-01-12 13:41:05 +01:00
|
|
|
u = hashmap_get(m->watch_pids, PID_TO_PTR(pid));
|
2015-09-01 18:47:46 +02:00
|
|
|
if (u)
|
|
|
|
return u;
|
|
|
|
|
core: rework how we track which PIDs to watch for a unit
Previously, we'd maintain two hashmaps keyed by PIDs, pointing to Unit
interested in SIGCHLD events for them. This scheme allowed a specific
PID to be watched by exactly 0, 1 or 2 units.
With this rework this is replaced by a single hashmap which is primarily
keyed by the PID and points to a Unit interested in it. However, it
is optionally also keyed by the negated PID, in which case it points to a
NULL terminated array of additional Unit objects also interested. This
scheme means arbitrary numbers of Units may now watch the same PID.
Runtime and memory behaviour should not be impacted by this change, as for
the common case (i.e. each PID only watched by a single unit) behaviour
stays the same, but for the uncommon case (a PID watched by more than
one unit) we only pay with a single additional memory allocation for the
array.
Why this all? Primarily, because allowing exactly two units to watch a
specific PID is not sufficient for some niche cases, as processes can
belong to more than one unit these days:
1. sd_notify() with MAINPID= can be used to attach a process from a
different cgroup to multiple units.
2. Similarly, the PIDFile= setting in unit files can be used for similar
setups,
3. By creating a scope unit a main process of a service may join a
different unit, too.
4. On cgroupsv1 we frequently end up watching all processes remaining in
a scope, and if a process opens lots of scopes one after the other it
might thus end up being watched by many of them.
This patch hence removes the 2-unit-per-PID limit. It also makes a
couple of other changes, some of them quite relevant:
- manager_get_unit_by_pid() (and the bus call wrapping it) when there's
ambiguity will prefer returning the Unit the process belongs to based on
cgroup membership, and only check the watch-pids hashmap if that
fails. This change in logic is probably more in line with what people
expect and makes things more stable as each process can belong to
exactly one cgroup only.
- Every SIGCHLD event is now dispatched to all units interested in its
PID. Previously, there was some magic conditionalization: the SIGCHLD
would only be dispatched to the unit if it was only interested in a
single PID only, or the PID belonged to the control or main PID or we
didn't dispatch a single SIGCHLD to the unit in the current event loop
iteration yet. These rules were quite arbitrary and also redundant as
the per-unit handlers would filter the PIDs anyway a second time.
With this change we'll hence relax the rules: all we do now is
dispatch every SIGCHLD event exactly once to each unit interested in
it, and it's up to the unit to then use or ignore this. We use a
generation counter in the unit to ensure that we only invoke the unit
handler once for each event, protecting us from confusion if a unit is
both associated with a specific PID through cgroup membership and
through the "watch_pids" logic. It also protects us from being
confused if the "watch_pids" hashmap is altered while we are
dispatching to it (which is a very likely case).
- sd_notify() message dispatching has been reworked to be very similar
to SIGCHLD handling now. A generation counter is used for dispatching
as well.
This also adds a new test that validates that "watch_pid" registration
and unregistration works correctly.
2018-01-12 13:41:05 +01:00
|
|
|
array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid));
|
|
|
|
if (array)
|
|
|
|
return array[0];
|
|
|
|
|
|
|
|
return NULL;
|
2010-04-18 03:04:54 +02:00
|
|
|
}
|
2010-10-27 03:16:49 +02:00
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
/* Called on the legacy hierarchy whenever an explicit "cgroup is empty" notification arrives, either from
 * the cgroup agent process or from the --system instance.
 *
 * Looks up the unit owning the cgroup path 'cgroup' and, if there is one, hands it to
 * unit_add_to_cgroup_empty_queue().
 *
 * Returns 1 if a matching unit was found and enqueued, 0 if no unit owns the cgroup. */
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *owner;

        assert(m);
        assert(cgroup);

        log_debug("Got cgroup empty notification for: %s", cgroup);

        owner = manager_get_unit_by_cgroup(m, cgroup);
        if (owner) {
                unit_add_to_cgroup_empty_queue(owner);
                return 1;
        }

        return 0;
}
|
|
|
|
|
|
|
|
/* Stores the current memory usage of the unit's cgroup in *ret.
 *
 * Returns -ENODATA if memory accounting is off for the unit, if the unit has no cgroup path, or if the
 * memory controller is not part of the realized mask. Otherwise returns whatever the underlying reader
 * (procfs for the host root cgroup, the cgroup attribute otherwise) returns. */
int unit_get_memory_current(Unit *u, uint64_t *ret) {
        const char *attribute;
        int unified;

        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, memory_accounting) || !u->cgroup_path)
                return -ENODATA;

        /* The root cgroup does not expose this attribute, hence fall back to /proc for it */
        if (unit_has_host_root_cgroup(u))
                return procfs_memory_get_used(ret);

        if (!(u->cgroup_realized_mask & CGROUP_MASK_MEMORY))
                return -ENODATA;

        unified = cg_all_unified();
        if (unified < 0)
                return unified;

        /* The attribute is named differently on the unified and on the legacy hierarchy */
        if (unified > 0)
                attribute = "memory.current";
        else
                attribute = "memory.usage_in_bytes";

        return cg_get_attribute_as_uint64("memory", u->cgroup_path, attribute, ret);
}
|
|
|
|
|
2015-09-10 12:32:16 +02:00
|
|
|
/* Stores the current number of tasks in the unit's cgroup in *ret.
 *
 * Returns -ENODATA if tasks accounting is off for the unit, if the unit has no cgroup path, or if the
 * pids controller is not part of the realized mask. Otherwise returns whatever the underlying reader
 * (procfs for the host root cgroup, the "pids.current" attribute otherwise) returns. */
int unit_get_tasks_current(Unit *u, uint64_t *ret) {
        assert(u);
        assert(ret);

        if (!UNIT_CGROUP_BOOL(u, tasks_accounting) || !u->cgroup_path)
                return -ENODATA;

        /* The root cgroup does not expose this attribute, hence fall back to /proc for it */
        if (unit_has_host_root_cgroup(u))
                return procfs_tasks_get_current(ret);

        if (!(u->cgroup_realized_mask & CGROUP_MASK_PIDS))
                return -ENODATA;

        return cg_get_attribute_as_uint64("pids", u->cgroup_path, "pids.current", ret);
}
|
|
|
|
|
2015-03-01 16:24:19 +01:00
|
|
|
/* Reads the raw (i.e. not relative to any base value) CPU usage counter of the unit's cgroup into *ret.
 *
 * Returns -ENODATA if the unit has no cgroup path, if the controllers needed for CPU accounting are not
 * all realized, or if the "cpu.stat" lookup on the unified hierarchy fails with -ENOENT/-ENXIO. Other
 * negative values are errors passed through from the helpers called here. */
static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *value = NULL;
        uint64_t usec;
        int unified, r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        /* The root cgroup does not expose this information, hence fall back to /proc for it */
        if (unit_has_host_root_cgroup(u))
                return procfs_cpu_get_usage(ret);

        /* Bail out if any controller required for CPU accounting is missing from the realized mask */
        if ((get_cpu_accounting_mask() & ~u->cgroup_realized_mask) != 0)
                return -ENODATA;

        unified = cg_all_unified();
        if (unified < 0)
                return unified;

        /* Legacy hierarchy: the cpuacct attribute is read straight into the return parameter */
        if (unified == 0)
                return cg_get_attribute_as_uint64("cpuacct", u->cgroup_path, "cpuacct.usage", ret);

        /* Unified hierarchy: pick the "usage_usec" key out of cpu.stat and scale it by NSEC_PER_USEC */
        r = cg_get_keyed_attribute("cpu", u->cgroup_path, "cpu.stat", STRV_MAKE("usage_usec"), &value);
        if (r == -ENOENT || r == -ENXIO)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(value, &usec);
        if (r < 0)
                return r;

        *ret = usec * NSEC_PER_USEC;
        return 0;
}
|
|
|
|
|
|
|
|
/* Retrieves the current CPU usage counter of the unit, relative to u->cpu_usage_base (the subtraction
 * saturates at zero). The result is remembered in u->cpu_usage_last.
 *
 * If the raw counter cannot be read anymore (-ENODATA, e.g. because the cgroup was already removed) and a
 * value was cached before, the cached value is returned instead. 'ret' may be NULL, in which case the
 * function only refreshes the cache.
 *
 * Returns 0 on success, -ENODATA if CPU accounting is off or no data is available, or another negative
 * value passed through from unit_get_cpu_usage_raw(). */
int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
        nsec_t raw;
        int r;

        assert(u);

        if (!UNIT_CGROUP_BOOL(u, cpu_accounting))
                return -ENODATA;

        r = unit_get_cpu_usage_raw(u, &raw);
        if (r < 0) {
                /* Only -ENODATA with a previously cached value is recoverable */
                if (r != -ENODATA || u->cpu_usage_last == NSEC_INFINITY)
                        return r;

                if (ret)
                        *ret = u->cpu_usage_last;

                return 0;
        }

        /* Saturated subtraction of the base value */
        u->cpu_usage_last = raw > u->cpu_usage_base ? raw - u->cpu_usage_base : 0;

        if (ret)
                *ret = u->cpu_usage_last;

        return 0;
}
|
|
|
|
|
2017-09-05 19:27:53 +02:00
|
|
|
int unit_get_ip_accounting(
|
|
|
|
Unit *u,
|
|
|
|
CGroupIPAccountingMetric metric,
|
|
|
|
uint64_t *ret) {
|
|
|
|
|
2017-09-07 14:07:13 +02:00
|
|
|
uint64_t value;
|
2017-09-05 19:27:53 +02:00
|
|
|
int fd, r;
|
|
|
|
|
|
|
|
assert(u);
|
|
|
|
assert(metric >= 0);
|
|
|
|
assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
|
|
|
|
assert(ret);
|
|
|
|
|
2017-09-27 17:54:06 +02:00
|
|
|
if (!UNIT_CGROUP_BOOL(u, ip_accounting))
|
2017-09-07 16:31:01 +02:00
|
|
|
return -ENODATA;
|
|
|
|
|
2017-09-05 19:27:53 +02:00
|
|
|
fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ?
|
|
|
|
u->ip_accounting_ingress_map_fd :
|
|
|
|
u->ip_accounting_egress_map_fd;
|
|
|
|
if (fd < 0)
|
|
|
|
return -ENODATA;
|
|
|
|
|
|
|
|
if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES))
|
2017-09-07 14:07:13 +02:00
|
|
|
r = bpf_firewall_read_accounting(fd, &value, NULL);
|
2017-09-05 19:27:53 +02:00
|
|
|
else
|
2017-09-07 14:07:13 +02:00
|
|
|
r = bpf_firewall_read_accounting(fd, NULL, &value);
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
/* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile
|
|
|
|
* all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the
|
|
|
|
* ip_accounting_extra[] field, and add them in here transparently. */
|
|
|
|
|
|
|
|
*ret = value + u->ip_accounting_extra[metric];
|
2017-09-05 19:27:53 +02:00
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2019-03-22 12:16:03 +01:00
|
|
|
/* Reads the raw IO accounting counters of the unit's cgroup from its "io.stat" attribute and stores them
 * in ret[], one entry per CGroupIOAccountingMetric. The values of all lines of the file are summed up.
 *
 * ret[] is only written once the whole file was parsed successfully.
 *
 * Returns 0 on success, -ENODATA if no data can be provided (no cgroup path, host root cgroup, legacy
 * hierarchy, or IO controller not realized), or another negative errno-style value on failure. */
static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) {
        static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {
                [CGROUP_IO_READ_BYTES]       = "rbytes=",
                [CGROUP_IO_WRITE_BYTES]      = "wbytes=",
                [CGROUP_IO_READ_OPERATIONS]  = "rios=",
                [CGROUP_IO_WRITE_OPERATIONS] = "wios=",
        };
        uint64_t totals[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = {};
        _cleanup_free_ char *path = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int r;

        assert(u);

        /* TODO: return useful data for the top-level cgroup */
        if (!u->cgroup_path || unit_has_host_root_cgroup(u))
                return -ENODATA;

        r = cg_all_unified();
        if (r < 0)
                return r;
        if (r == 0) /* TODO: support cgroupv1 */
                return -ENODATA;

        if (!FLAGS_SET(u->cgroup_realized_mask, CGROUP_MASK_IO))
                return -ENODATA;

        r = cg_get_path("io", u->cgroup_path, "io.stat", &path);
        if (r < 0)
                return r;

        f = fopen(path, "re");
        if (!f)
                return -errno;

        for (;;) {
                _cleanup_free_ char *line = NULL;
                const char *cursor;

                r = read_line(f, LONG_LINE_MAX, &line);
                if (r < 0)
                        return r;
                if (r == 0)
                        break;

                /* Skip the leading device major/minor field and the whitespace following it */
                cursor = line;
                cursor += strcspn(cursor, WHITESPACE);
                cursor += strspn(cursor, WHITESPACE);

                for (;;) {
                        _cleanup_free_ char *word = NULL;

                        r = extract_first_word(&cursor, &word, NULL, EXTRACT_RETAIN_ESCAPE);
                        if (r < 0)
                                return r;
                        if (r == 0)
                                break;

                        for (CGroupIOAccountingMetric m = 0; m < _CGROUP_IO_ACCOUNTING_METRIC_MAX; m++) {
                                const char *number;
                                uint64_t n;

                                number = startswith(word, field_names[m]);
                                if (!number)
                                        continue;

                                r = safe_atou64(number, &n);
                                if (r < 0)
                                        return r;

                                /* Sum up the stats of all devices */
                                totals[m] += n;
                                break;
                        }
                }
        }

        for (CGroupIOAccountingMetric m = 0; m < _CGROUP_IO_ACCOUNTING_METRIC_MAX; m++)
                ret[m] = totals[m];

        return 0;
}
|
|
|
|
|
|
|
|
int unit_get_io_accounting(
|
|
|
|
Unit *u,
|
|
|
|
CGroupIOAccountingMetric metric,
|
|
|
|
bool allow_cache,
|
|
|
|
uint64_t *ret) {
|
|
|
|
|
|
|
|
uint64_t raw[_CGROUP_IO_ACCOUNTING_METRIC_MAX];
|
|
|
|
int r;
|
|
|
|
|
|
|
|
/* Retrieve an IO account parameter. This will subtract the counter when the unit was started. */
|
|
|
|
|
|
|
|
if (!UNIT_CGROUP_BOOL(u, io_accounting))
|
|
|
|
return -ENODATA;
|
|
|
|
|
|
|
|
if (allow_cache && u->io_accounting_last[metric] != UINT64_MAX)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
r = unit_get_io_accounting_raw(u, raw);
|
|
|
|
if (r == -ENODATA && u->io_accounting_last[metric] != UINT64_MAX)
|
|
|
|
goto done;
|
|
|
|
if (r < 0)
|
|
|
|
return r;
|
|
|
|
|
|
|
|
for (CGroupIOAccountingMetric i = 0; i < _CGROUP_IO_ACCOUNTING_METRIC_MAX; i++) {
|
|
|
|
/* Saturated subtraction */
|
|
|
|
if (raw[i] > u->io_accounting_base[i])
|
|
|
|
u->io_accounting_last[i] = raw[i] - u->io_accounting_base[i];
|
|
|
|
else
|
|
|
|
u->io_accounting_last[i] = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
|
|
|
if (ret)
|
|
|
|
*ret = u->io_accounting_last[metric];
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-09-05 19:27:53 +02:00
|
|
|
/* Resets CPU accounting for the unit: the cached value is invalidated (set to NSEC_INFINITY) and the
 * current raw counter becomes the new base value. If the raw counter cannot be read, the base is set to
 * zero and the error is returned.
 *
 * Returns 0 on success, a negative value from unit_get_cpu_usage_raw() on failure. */
int unit_reset_cpu_accounting(Unit *u) {
        int r;

        assert(u);

        u->cpu_usage_last = NSEC_INFINITY;

        r = unit_get_cpu_usage_raw(u, &u->cpu_usage_base);
        if (r >= 0)
                return 0;

        u->cpu_usage_base = 0;
        return r;
}
|
|
|
|
|
2017-09-05 19:27:53 +02:00
|
|
|
/* Resets IP accounting for the unit: the ingress and egress accounting maps are reset (where their fd is
 * set, i.e. non-negative) and u->ip_accounting_extra is zeroed.
 *
 * Both maps are always attempted. If the ingress reset failed its error is returned, otherwise the
 * result of the egress reset (0 if no egress map is set). */
int unit_reset_ip_accounting(Unit *u) {
        int ingress_result = 0, egress_result = 0;

        assert(u);

        if (u->ip_accounting_ingress_map_fd >= 0)
                ingress_result = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd);

        if (u->ip_accounting_egress_map_fd >= 0)
                egress_result = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd);

        zero(u->ip_accounting_extra);

        if (ingress_result < 0)
                return ingress_result;

        return egress_result;
}
|
|
|
|
|
2019-03-22 12:16:03 +01:00
|
|
|
/* Resets IO accounting for the unit: all cached values are invalidated (set to UINT64_MAX) and the
 * current raw counters become the new base values. If the raw counters cannot be read, the base values
 * are zeroed and the error is returned.
 *
 * Returns 0 on success, a negative value from unit_get_io_accounting_raw() on failure. */
int unit_reset_io_accounting(Unit *u) {
        int r;

        assert(u);

        for (CGroupIOAccountingMetric m = 0; m < _CGROUP_IO_ACCOUNTING_METRIC_MAX; m++)
                u->io_accounting_last[m] = UINT64_MAX;

        r = unit_get_io_accounting_raw(u, u->io_accounting_base);
        if (r >= 0)
                return 0;

        zero(u->io_accounting_base);
        return r;
}
|
|
|
|
|
2019-03-22 11:25:49 +01:00
|
|
|
/* Resets CPU, IO and IP accounting of the unit, in that order. All three resets are always attempted,
 * regardless of whether an earlier one failed.
 *
 * Returns the first negative result in the order CPU, IO; if neither of those failed, the result of the
 * IP reset is returned. */
int unit_reset_accounting(Unit *u) {
        int cpu_result, io_result, ip_result;

        assert(u);

        cpu_result = unit_reset_cpu_accounting(u);
        io_result = unit_reset_io_accounting(u);
        ip_result = unit_reset_ip_accounting(u);

        if (cpu_result < 0)
                return cpu_result;
        if (io_result < 0)
                return io_result;

        return ip_result;
}
|
|
|
|
|
2015-09-11 18:21:53 +02:00
|
|
|
/* Marks the controllers in 'm' as invalidated for the unit and passes the unit to
 * unit_add_to_cgroup_realize_queue().
 *
 * Does nothing if the unit has no cgroup context, if 'm' is empty, or if all requested bits are already
 * set in u->cgroup_invalidated_mask. */
void unit_invalidate_cgroup(Unit *u, CGroupMask m) {
        const CGroupMask io_pair = CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
        const CGroupMask cpu_pair = CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT;

        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u) || m == 0)
                return;

        /* Compat pairs are always invalidated together: if one of a pair is requested, add the other */
        if (m & io_pair)
                m |= io_pair;
        if (m & cpu_pair)
                m |= cpu_pair;

        /* Nothing to do if everything requested is invalidated already */
        if (FLAGS_SET(u->cgroup_invalidated_mask, m))
                return;

        u->cgroup_invalidated_mask |= m;
        unit_add_to_cgroup_realize_queue(u);
}
|
|
|
|
|
2017-09-05 19:27:53 +02:00
|
|
|
/* Marks the BPF firewall of the unit as invalidated and passes the unit to
 * unit_add_to_cgroup_realize_queue(). For slice units the same is done recursively for every unit in
 * the UNIT_BEFORE dependency set whose slice is this unit.
 *
 * Does nothing if the unit has no cgroup context or if the BPF firewall bit is already set in
 * u->cgroup_invalidated_mask. */
void unit_invalidate_cgroup_bpf(Unit *u) {
        Unit *member;
        void *v;

        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return;

        if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
                return;

        u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
        unit_add_to_cgroup_realize_queue(u);

        if (u->type != UNIT_SLICE)
                return;

        /* For a slice unit a new BPF program needs to be compiled for all its children too, as the IP
         * access list of the children includes the one of the slice. */
        HASHMAP_FOREACH_KEY(v, member, u->dependencies[UNIT_BEFORE]) {
                if (UNIT_DEREF(member->slice) != u)
                        continue;

                unit_invalidate_cgroup_bpf(member);
        }
}
|
|
|
|
|
2018-02-06 11:57:35 +01:00
|
|
|
/* Returns the 'delegate' flag of the unit's cgroup context. Returns false if the unit type's vtable has
 * 'can_delegate' unset, or if the unit has no cgroup context. */
bool unit_cgroup_delegate(Unit *u) {
        CGroupContext *context;

        assert(u);

        if (!UNIT_VTABLE(u)->can_delegate)
                return false;

        context = unit_get_cgroup_context(u);

        return context ? context->delegate : false;
}
|
|
|
|
|
2015-09-11 18:21:53 +02:00
|
|
|
/* Invalidates the CPU, IO and BLKIO controller settings of every unit in m->startup_units. */
void manager_invalidate_startup_units(Manager *m) {
        const CGroupMask mask = CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_BLKIO;
        Unit *unit;

        assert(m);

        SET_FOREACH(unit, m->startup_units) {
                unit_invalidate_cgroup(unit, mask);
        }
}
|
|
|
|
|
2019-05-22 12:12:17 +02:00
|
|
|
/* Returns the 'nice' value of the unit's exec context, or 0 if the unit has no exec context. */
static int unit_get_nice(Unit *u) {
        ExecContext *context;

        context = unit_get_exec_context(u);
        if (!context)
                return 0;

        return context->nice;
}
|
|
|
|
|
|
|
|
/* Returns the CPU weight of the unit's cgroup context for the current manager state, or
 * CGROUP_WEIGHT_DEFAULT if the unit has no cgroup context. */
static uint64_t unit_get_cpu_weight(Unit *u) {
        ManagerState state;
        CGroupContext *context;

        state = manager_state(u->manager);

        context = unit_get_cgroup_context(u);
        if (!context)
                return CGROUP_WEIGHT_DEFAULT;

        return cgroup_context_cpu_weight(context, state);
}
|
|
|
|
|
|
|
|
int compare_job_priority(const void *a, const void *b) {
|
|
|
|
const Job *x = a, *y = b;
|
|
|
|
int nice_x, nice_y;
|
|
|
|
uint64_t weight_x, weight_y;
|
|
|
|
int ret;
|
|
|
|
|
2019-07-18 18:30:15 +02:00
|
|
|
if ((ret = CMP(x->unit->type, y->unit->type)) != 0)
|
|
|
|
return -ret;
|
|
|
|
|
2019-05-22 12:12:17 +02:00
|
|
|
weight_x = unit_get_cpu_weight(x->unit);
|
|
|
|
weight_y = unit_get_cpu_weight(y->unit);
|
|
|
|
|
2019-07-18 18:30:15 +02:00
|
|
|
if ((ret = CMP(weight_x, weight_y)) != 0)
|
|
|
|
return -ret;
|
2019-05-22 12:12:17 +02:00
|
|
|
|
|
|
|
nice_x = unit_get_nice(x->unit);
|
|
|
|
nice_y = unit_get_nice(y->unit);
|
|
|
|
|
|
|
|
if ((ret = CMP(nice_x, nice_y)) != 0)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
return strcmp(x->unit->id, y->unit->id);
|
|
|
|
}
|
|
|
|
|
2020-04-29 17:53:43 +02:00
|
|
|
/* Asks the kernel to freeze or thaw the unit's cgroup by writing "1" or "0" to its "cgroup.freeze"
 * attribute.
 *
 * Returns:
 *   0        if the cgroup freezer is not supported, or the kernel already reports the requested state
 *            (in the latter case u->freezer_state is set to the final state right away)
 *   1        if the request was written; u->freezer_state is then FREEZER_FREEZING or FREEZER_THAWING
 *   -EBUSY   if the unit's cgroup is not realized
 *   < 0      any other error from cg_get_path() or write_string_file(); u->freezer_state is left untouched
 */
int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
        _cleanup_free_ char *path = NULL;
        FreezerState target, kernel = _FREEZER_STATE_INVALID;
        int r;

        assert(u);
        assert(IN_SET(action, FREEZER_FREEZE, FREEZER_THAW));

        if (!cg_freezer_supported())
                return 0;

        if (!u->cgroup_realized)
                return -EBUSY;

        target = action == FREEZER_FREEZE ? FREEZER_FROZEN : FREEZER_RUNNING;

        /* Failing to query the kernel state is not fatal: we only log it and go on to issue the request. */
        r = unit_freezer_state_kernel(u, &kernel);
        if (r < 0)
                log_unit_debug_errno(u, r, "Failed to obtain cgroup freezer state: %m");

        if (target == kernel) {
                /* Already where we want to be, nothing to write */
                u->freezer_state = target;
                return 0;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.freeze", &path);
        if (r < 0)
                return r;

        log_unit_debug(u, "%s unit.", action == FREEZER_FREEZE ? "Freezing" : "Thawing");

        r = write_string_file(path, one_zero(action == FREEZER_FREEZE), WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r < 0)
                return r;

        /* Enter the transitional state only after the request was successfully handed to the kernel. If we
         * did this before the write and the write failed, the unit would be left marked as
         * freezing/thawing even though no state change was ever requested. */
        u->freezer_state = action == FREEZER_FREEZE ? FREEZER_FREEZING : FREEZER_THAWING;

        return 1;
}
|
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
|
2019-11-08 15:12:23 +01:00
|
|
|
[CGROUP_DEVICE_POLICY_AUTO] = "auto",
|
|
|
|
[CGROUP_DEVICE_POLICY_CLOSED] = "closed",
|
|
|
|
[CGROUP_DEVICE_POLICY_STRICT] = "strict",
|
2013-06-27 04:14:27 +02:00
|
|
|
};
|
2010-10-27 03:16:49 +02:00
|
|
|
|
2019-07-29 17:50:05 +02:00
|
|
|
/* Reads the cpuset controller attribute 'name' of the unit's cgroup and parses it into 'cpus' using
 * parse_cpu_set_full().
 *
 * Returns -ENODATA if the unit has no cgroup path, if the cpuset controller is not part of the unit's
 * realized cgroup mask, if cg_all_unified() reports 0, or if the attribute does not exist (-ENOENT from
 * cg_get_attribute()). Other negative values from cg_all_unified() or cg_get_attribute() are passed
 * through. Otherwise the return value of parse_cpu_set_full() is returned. */
int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
        _cleanup_free_ char *value = NULL;
        int r;

        assert(u);
        assert(cpus);

        if (!u->cgroup_path || (u->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)
                return -ENODATA;

        r = cg_all_unified();
        if (r <= 0)
                return r == 0 ? -ENODATA : r;

        r = cg_get_attribute("cpuset", u->cgroup_path, name, &value);
        if (r < 0)
                return r == -ENOENT ? -ENODATA : r;

        return parse_cpu_set_full(value, cpus, false, NULL, NULL, 0, NULL);
}
|
|
|
|
|
2013-06-27 04:14:27 +02:00
|
|
|
/* Instantiates the string lookup helpers for CGroupDevicePolicy on top of cgroup_device_policy_table.
 * NOTE(review): presumably this generates cgroup_device_policy_to_string()/_from_string() — confirm
 * against the macro definition in string-table.h. */
DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
|
2020-04-29 17:53:43 +02:00
|
|
|
|
|
|
|
static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {
|
|
|
|
[FREEZER_FREEZE] = "freeze",
|
|
|
|
[FREEZER_THAW] = "thaw",
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Instantiates the string lookup helpers for FreezerAction on top of freezer_action_table.
 * NOTE(review): presumably this generates freezer_action_to_string()/_from_string() — confirm against
 * the macro definition in string-table.h. */
DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction);
|