cgroup: Add DisableControllers= directive to disable controller in subtree

Some controllers (like the CPU controller) have a performance cost that
is non-trivial on certain workloads. While this can be mitigated and
improved to an extent, there will for some controllers always be some
overheads associated with the benefits gained from the controller.
Inside Facebook, the fix applied has been to disable the CPU controller
forcibly with `cgroup_disable=cpu` on the kernel command line.

This presents a problem: to disable or reenable the controller, a reboot
is required, but this is quite cumbersome and slow to do for many
thousands of machines, especially machines where disabling/enabling a
stateful service on a machine is a matter of several minutes.

Currently systemd provides some configuration knobs for these in the
form of `[Default]CPUAccounting`, `[Default]MemoryAccounting`, and the
like. The limitation of these is that Default*Accounting is overrideable
by individual services, of which any one could decide to reenable a
controller within the hierarchy at any point just by using a controller
feature implicitly (eg. `CPUWeight`), even if the use of that CPU
feature could just be opportunistic. Since many services are provided by
the distribution, or by upstream teams at a particular organisation,
it's not a sustainable solution to simply try to find and remove
offending directives from these units.

This commit presents a more direct solution -- a DisableControllers=
directive that forcibly disallows a controller from being enabled within
a subtree.
This commit is contained in:
Chris Down 2018-12-03 14:38:06 +00:00
parent 4f6f62e468
commit c72703e26d
11 changed files with 133 additions and 26 deletions

View File

@ -736,6 +736,27 @@
</listitem>
</varlistentry>
<varlistentry>
<term><varname>DisableControllers=</varname></term>
<listitem>
<para>Disables controllers from being enabled for a unit's children. If a controller listed is already in use
in its subtree, the controller will be removed from the subtree. This can be used to avoid child units being
able to implicitly or explicitly enable a controller. Defaults to not disabling any controllers.</para>
<para>It may not be possible to successfully disable a controller if the unit or any child of the unit in
question delegates controllers to its children, as any delegated subtree of the cgroup hierarchy is unmanaged
by systemd.</para>
<para>Multiple controllers may be specified, separated by spaces. You may also pass
<varname>DisableControllers=</varname> multiple times, in which case each new instance adds another controller
to disable. Passing <varname>DisableControllers=</varname> by itself with no controller name present resets
the disabled controller list.</para>
<para>Valid controllers are <option>cpu</option>, <option>cpuacct</option>, <option>io</option>,
<option>blkio</option>, <option>memory</option>, <option>devices</option>, and <option>pids</option>.</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>

View File

@ -1346,7 +1346,7 @@ CGroupMask unit_get_own_mask(Unit *u) {
if (!c)
return 0;
return cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
return (cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u);
}
CGroupMask unit_get_delegate_mask(Unit *u) {
@ -1461,6 +1461,7 @@ CGroupMask unit_get_target_mask(Unit *u) {
mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
mask &= u->manager->cgroup_supported;
mask &= ~unit_get_ancestor_disable_mask(u);
return mask;
}
@ -1475,6 +1476,7 @@ CGroupMask unit_get_enable_mask(Unit *u) {
mask = unit_get_members_mask(u);
mask &= u->manager->cgroup_supported;
mask &= ~unit_get_ancestor_disable_mask(u);
return mask;
}
@ -1914,8 +1916,8 @@ static bool unit_has_mask_enables_realized(
* we want to add is already added. */
return u->cgroup_realized &&
((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == u->cgroup_realized_mask &&
((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == u->cgroup_enabled_mask;
((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
}
void unit_add_to_cgroup_realize_queue(Unit *u) {
@ -1965,11 +1967,7 @@ static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
new_target_mask = u->cgroup_realized_mask | target_mask;
new_enable_mask = u->cgroup_enabled_mask | enable_mask;
r = unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
if (r < 0)
return r;
return 0;
return unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
}
/* Controllers can only be disabled depth-first, from the leaves of the
@ -2043,7 +2041,7 @@ static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
* h i j k l m n o
*
* 1. We want to realise cgroup "d" now.
* 2. cgroup "a" has DisableController=cpu in the associated unit.
* 2. cgroup "a" has DisableControllers=cpu in the associated unit.
* 3. cgroup "k" just started requesting the memory controller.
*
* To make this work we must do the following in order:

View File

@ -118,6 +118,8 @@ struct CGroupContext {
bool delegate;
CGroupMask delegate_controllers;
CGroupMask disable_controllers;
};
/* Used when querying IP accounting data */
@ -151,6 +153,9 @@ CGroupMask unit_get_delegate_mask(Unit *u);
CGroupMask unit_get_members_mask(Unit *u);
CGroupMask unit_get_siblings_mask(Unit *u);
CGroupMask unit_get_subtree_mask(Unit *u);
CGroupMask unit_get_disable_mask(Unit *u);
CGroupMask unit_get_ancestor_disable_mask(Unit *u);
CGroupMask unit_get_target_mask(Unit *u);
CGroupMask unit_get_enable_mask(Unit *u);

View File

@ -17,6 +17,38 @@
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy);
static int property_get_cgroup_mask(
sd_bus *bus,
const char *path,
const char *interface,
const char *property,
sd_bus_message *reply,
void *userdata,
sd_bus_error *error) {
CGroupMask *mask = userdata;
CGroupController ctrl;
int r;
assert(bus);
assert(reply);
r = sd_bus_message_open_container(reply, 'a', "s");
if (r < 0)
return r;
for (ctrl = 0; ctrl < _CGROUP_CONTROLLER_MAX; ctrl++) {
if ((*mask & CGROUP_CONTROLLER_TO_MASK(ctrl)) == 0)
continue;
r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(ctrl));
if (r < 0)
return r;
}
return sd_bus_message_close_container(reply);
}
static int property_get_delegate_controllers(
sd_bus *bus,
const char *path,
@ -27,8 +59,6 @@ static int property_get_delegate_controllers(
sd_bus_error *error) {
CGroupContext *c = userdata;
CGroupController cc;
int r;
assert(bus);
assert(reply);
@ -37,20 +67,7 @@ static int property_get_delegate_controllers(
if (!c->delegate)
return sd_bus_message_append(reply, "as", 0);
r = sd_bus_message_open_container(reply, 'a', "s");
if (r < 0)
return r;
for (cc = 0; cc < _CGROUP_CONTROLLER_MAX; cc++) {
if ((c->delegate_controllers & CGROUP_CONTROLLER_TO_MASK(cc)) == 0)
continue;
r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(cc));
if (r < 0)
return r;
}
return sd_bus_message_close_container(reply);
return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error);
}
static int property_get_io_device_weight(
@ -342,6 +359,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0),
SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0),
SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0),
SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0),
SD_BUS_VTABLE_END
};

View File

@ -192,6 +192,7 @@ $1.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0,
$1.TasksAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.tasks_accounting)
$1.TasksMax, config_parse_tasks_max, 0, offsetof($1, cgroup_context.tasks_max)
$1.Delegate, config_parse_delegate, 0, offsetof($1, cgroup_context)
$1.DisableControllers, config_parse_disable_controllers, 0, offsetof($1, cgroup_context)
$1.IPAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.ip_accounting)
$1.IPAddressAllow, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_allow)
$1.IPAddressDeny, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_deny)

View File

@ -4323,6 +4323,41 @@ int config_parse_exit_status(
return 0;
}
int config_parse_disable_controllers(
const char *unit,
const char *filename,
unsigned line,
const char *section,
unsigned section_line,
const char *lvalue,
int ltype,
const char *rvalue,
void *data,
void *userdata) {
int r;
CGroupContext *c = data;
CGroupMask disabled_mask;
/* 1. If empty, make all controllers eligible for use again.
* 2. If non-empty, merge all listed controllers, space separated. */
if (isempty(rvalue)) {
c->disable_controllers = 0;
return 0;
}
r = cg_mask_from_string(rvalue, &disabled_mask);
if (r < 0 || disabled_mask <= 0) {
log_syntax(unit, LOG_ERR, filename, line, r, "Invalid cgroup string: %s, ignoring", rvalue);
return 0;
}
c->disable_controllers |= disabled_mask;
return 0;
}
#define FOLLOW_MAX 8
static int open_follow(char **filename, FILE **_f, Set *names, char **_final) {

View File

@ -105,6 +105,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_pid_file);
CONFIG_PARSER_PROTOTYPE(config_parse_exit_status);
CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers);
/* gperf prototypes */
const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);

View File

@ -30,7 +30,7 @@ static void log_cgroup_mask(CGroupMask got, CGroupMask expected) {
static int test_cgroup_mask(void) {
_cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL;
_cleanup_(manager_freep) Manager *m = NULL;
Unit *son, *daughter, *parent, *root, *grandchild, *parent_deep;
Unit *son, *daughter, *parent, *root, *grandchild, *parent_deep, *nomem_parent, *nomem_leaf;
int r;
CGroupMask cpu_accounting_mask = get_cpu_accounting_mask();
@ -68,11 +68,15 @@ static int test_cgroup_mask(void) {
assert_se(manager_load_startable_unit_or_warn(m, "daughter.service", NULL, &daughter) >= 0);
assert_se(manager_load_startable_unit_or_warn(m, "grandchild.service", NULL, &grandchild) >= 0);
assert_se(manager_load_startable_unit_or_warn(m, "parent-deep.slice", NULL, &parent_deep) >= 0);
assert_se(manager_load_startable_unit_or_warn(m, "nomem.slice", NULL, &nomem_parent) >= 0);
assert_se(manager_load_startable_unit_or_warn(m, "nomemleaf.service", NULL, &nomem_leaf) >= 0);
assert_se(UNIT_DEREF(son->slice) == parent);
assert_se(UNIT_DEREF(daughter->slice) == parent);
assert_se(UNIT_DEREF(parent_deep->slice) == parent);
assert_se(UNIT_DEREF(grandchild->slice) == parent_deep);
assert_se(UNIT_DEREF(nomem_leaf->slice) == nomem_parent);
root = UNIT_DEREF(parent->slice);
assert_se(UNIT_DEREF(nomem_parent->slice) == root);
/* Verify per-unit cgroups settings. */
ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(son), CGROUP_MASK_CPU);
@ -80,6 +84,8 @@ static int test_cgroup_mask(void) {
ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(grandchild), 0);
ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(parent_deep), CGROUP_MASK_MEMORY);
ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(parent), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(nomem_parent), 0);
ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(nomem_leaf), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(root), 0);
/* Verify aggregation of member masks */
@ -88,6 +94,8 @@ static int test_cgroup_mask(void) {
ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(grandchild), 0);
ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(parent_deep), 0);
ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(parent), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY));
ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(nomem_parent), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(nomem_leaf), 0);
ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(root), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
/* Verify aggregation of sibling masks. */
@ -96,6 +104,8 @@ static int test_cgroup_mask(void) {
ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(grandchild), 0);
ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(parent_deep), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY));
ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(parent), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(nomem_parent), (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(nomem_leaf), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(root), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
/* Verify aggregation of target masks. */
@ -104,6 +114,8 @@ static int test_cgroup_mask(void) {
ASSERT_CGROUP_MASK(unit_get_target_mask(grandchild), 0);
ASSERT_CGROUP_MASK(unit_get_target_mask(parent_deep), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY) & m->cgroup_supported));
ASSERT_CGROUP_MASK(unit_get_target_mask(parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported));
ASSERT_CGROUP_MASK(unit_get_target_mask(nomem_parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_IO | CGROUP_MASK_BLKIO) & m->cgroup_supported));
ASSERT_CGROUP_MASK(unit_get_target_mask(nomem_leaf), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_IO | CGROUP_MASK_BLKIO) & m->cgroup_supported));
ASSERT_CGROUP_MASK(unit_get_target_mask(root), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported));
return 0;

View File

@ -18,6 +18,8 @@ test_data_files = '''
hwdb/10-bad.hwdb
journal-data/journal-1.txt
journal-data/journal-2.txt
nomem.slice
nomemleaf.service
parent-deep.slice
parent.slice
sched_idle_bad.service

5
test/nomem.slice Normal file
View File

@ -0,0 +1,5 @@
[Unit]
Description=Nomem Parent Slice
[Slice]
DisableControllers=memory

9
test/nomemleaf.service Normal file
View File

@ -0,0 +1,9 @@
[Unit]
Description=Nomem Leaf Service
[Service]
Slice=nomem.slice
Type=oneshot
ExecStart=/bin/true
IOWeight=200
MemoryAccounting=true