cgroup: introduce support for cgroup v2 CPUSET controller

Introduce support for configuring cpus and mems for processes using
cgroup v2 CPUSET controller.  This allows users to limit which cpus
and memory NUMA nodes can be used by processes to better utilize
system resources.

The cgroup v2 interfaces to control it are cpuset.cpus and cpuset.mems
where the requested configuration is written.  However, it doesn't mean
that the requested configuration will be actually used as parent cgroup
may limit the cpus or mems as well.  In order to reflect the real
configuration cgroup v2 provides read-only files cpuset.cpus.effective
and cpuset.mems.effective which are exported to users as well.
This commit is contained in:
Pavel Hrdina 2019-07-29 17:50:05 +02:00
parent 68c2b5ddb1
commit 047f5d63d7
14 changed files with 268 additions and 3 deletions

View File

@ -226,6 +226,8 @@ All cgroup/resource control settings are available for transient units
✓ StartupCPUShares=
✓ CPUQuota=
✓ CPUQuotaPeriodSec=
✓ AllowedCPUs=
✓ AllowedMemoryNodes=
✓ MemoryAccounting=
✓ DefaultMemoryMin=
✓ MemoryMin=

View File

@ -214,6 +214,36 @@
</listitem>
</varlistentry>
<varlistentry>
<term><varname>AllowedCPUs=</varname></term>
<listitem>
<para>Restrict processes to be executed on specific CPUs. Takes a list of CPU indices or ranges separated by either
whitespace or commas. CPU ranges are specified by the lower and upper CPU indices separated by a dash.</para>
<para>Setting <varname>AllowedCPUs=</varname> doesn't guarantee that all of the CPUs will be used by the processes
as it may be limited by parent units. The effective configuration is reported as <varname>EffectiveCPUs=</varname>.</para>
<para>This setting is supported only with the unified control group hierarchy.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><varname>AllowedMemoryNodes=</varname></term>
<listitem>
<para>Restrict processes to be executed on specific memory NUMA nodes. Takes a list of memory NUMA nodes indices
or ranges separated by either whitespace or commas. Memory NUMA nodes ranges are specified by the lower and upper
CPU indices separated by a dash.</para>
<para>Setting <varname>AllowedMemoryNodes=</varname> doesn't guarantee that all of the memory NUMA nodes will
be used by the processes as it may be limited by parent units. The effective configuration is reported as
<varname>EffectiveMemoryNodes=</varname>.</para>
<para>This setting is supported only with the unified control group hierarchy.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><varname>MemoryAccounting=</varname></term>

View File

@ -2905,6 +2905,7 @@ bool fd_is_cgroup_fs(int fd) {
static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
[CGROUP_CONTROLLER_CPU] = "cpu",
[CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
[CGROUP_CONTROLLER_CPUSET] = "cpuset",
[CGROUP_CONTROLLER_IO] = "io",
[CGROUP_CONTROLLER_BLKIO] = "blkio",
[CGROUP_CONTROLLER_MEMORY] = "memory",

View File

@ -20,6 +20,7 @@ typedef enum CGroupController {
/* Original cgroup controllers */
CGROUP_CONTROLLER_CPU,
CGROUP_CONTROLLER_CPUACCT, /* v1 only */
CGROUP_CONTROLLER_CPUSET, /* v2 only */
CGROUP_CONTROLLER_IO, /* v2 only */
CGROUP_CONTROLLER_BLKIO, /* v1 only */
CGROUP_CONTROLLER_MEMORY,
@ -40,6 +41,7 @@ typedef enum CGroupController {
typedef enum CGroupMask {
CGROUP_MASK_CPU = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPU),
CGROUP_MASK_CPUACCT = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPUACCT),
CGROUP_MASK_CPUSET = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPUSET),
CGROUP_MASK_IO = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_IO),
CGROUP_MASK_BLKIO = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BLKIO),
CGROUP_MASK_MEMORY = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_MEMORY),
@ -52,7 +54,7 @@ typedef enum CGroupMask {
CGROUP_MASK_V1 = CGROUP_MASK_CPU|CGROUP_MASK_CPUACCT|CGROUP_MASK_BLKIO|CGROUP_MASK_MEMORY|CGROUP_MASK_DEVICES|CGROUP_MASK_PIDS,
/* All real cgroup v2 controllers */
CGROUP_MASK_V2 = CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS,
CGROUP_MASK_V2 = CGROUP_MASK_CPU|CGROUP_MASK_CPUSET|CGROUP_MASK_IO|CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS,
/* All cgroup v2 BPF pseudo-controllers */
CGROUP_MASK_BPF = CGROUP_MASK_BPF_FIREWALL|CGROUP_MASK_BPF_DEVICES,

View File

@ -202,10 +202,15 @@ void cgroup_context_done(CGroupContext *c) {
c->ip_filters_ingress = strv_free(c->ip_filters_ingress);
c->ip_filters_egress = strv_free(c->ip_filters_egress);
cpu_set_reset(&c->cpuset_cpus);
cpu_set_reset(&c->cpuset_mems);
}
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
_cleanup_free_ char *disable_controllers_str = NULL;
_cleanup_free_ char *cpuset_cpus = NULL;
_cleanup_free_ char *cpuset_mems = NULL;
CGroupIODeviceLimit *il;
CGroupIODeviceWeight *iw;
CGroupIODeviceLatency *l;
@ -224,6 +229,9 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
(void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str);
cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus);
cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems);
fprintf(f,
"%sCPUAccounting=%s\n"
"%sIOAccounting=%s\n"
@ -237,6 +245,8 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
"%sStartupCPUShares=%" PRIu64 "\n"
"%sCPUQuotaPerSecSec=%s\n"
"%sCPUQuotaPeriodSec=%s\n"
"%sAllowedCPUs=%s\n"
"%sAllowedMemoryNodes=%s\n"
"%sIOWeight=%" PRIu64 "\n"
"%sStartupIOWeight=%" PRIu64 "\n"
"%sBlockIOWeight=%" PRIu64 "\n"
@ -265,6 +275,8 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
prefix, c->startup_cpu_shares,
prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
prefix, format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1),
prefix, cpuset_cpus,
prefix, cpuset_mems,
prefix, c->io_weight,
prefix, c->startup_io_weight,
prefix, c->blockio_weight,
@ -796,6 +808,16 @@ static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) {
CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX);
}
static void cgroup_apply_unified_cpuset(Unit *u, CPUSet cpus, const char *name) {
_cleanup_free_ char *buf = NULL;
buf = cpu_set_to_range_string(&cpus);
if (!buf)
return;
(void) set_attribute_and_warn(u, "cpuset", name, buf);
}
static bool cgroup_context_has_io_config(CGroupContext *c) {
return c->io_accounting ||
c->io_weight != CGROUP_WEIGHT_INVALID ||
@ -1036,6 +1058,11 @@ static void cgroup_context_apply(
}
}
if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) {
cgroup_apply_unified_cpuset(u, c->cpuset_cpus, "cpuset.cpus");
cgroup_apply_unified_cpuset(u, c->cpuset_mems, "cpuset.mems");
}
/* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2
* controller), and in case of containers we want to leave control of these attributes to the container manager
* (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */
@ -1408,6 +1435,9 @@ static CGroupMask unit_get_cgroup_mask(Unit *u) {
c->cpu_quota_per_sec_usec != USEC_INFINITY)
mask |= CGROUP_MASK_CPU;
if (c->cpuset_cpus.set || c->cpuset_mems.set)
mask |= CGROUP_MASK_CPUSET;
if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c))
mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
@ -3560,4 +3590,32 @@ static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] =
[CGROUP_STRICT] = "strict",
};
int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
_cleanup_free_ char *v = NULL;
int r;
assert(u);
assert(cpus);
if (!u->cgroup_path)
return -ENODATA;
if ((u->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0)
return -ENODATA;
r = cg_all_unified();
if (r < 0)
return r;
if (r == 0)
return -ENODATA;
if (r > 0)
r = cg_get_attribute("cpuset", u->cgroup_path, name, &v);
if (r == -ENOENT)
return -ENODATA;
if (r < 0)
return r;
return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL);
}
DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);

View File

@ -4,6 +4,7 @@
#include <stdbool.h>
#include "cgroup-util.h"
#include "cpu-set-util.h"
#include "ip-address-access.h"
#include "list.h"
#include "time-util.h"
@ -92,6 +93,9 @@ struct CGroupContext {
usec_t cpu_quota_per_sec_usec;
usec_t cpu_quota_period_usec;
CPUSet cpuset_cpus;
CPUSet cpuset_mems;
uint64_t io_weight;
uint64_t startup_io_weight;
LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
@ -254,3 +258,5 @@ CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;
bool unit_cgroup_delegate(Unit *u);
int compare_job_priority(const void *a, const void *b);
int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name);

View File

@ -71,6 +71,27 @@ static int property_get_delegate_controllers(
return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error);
}
static int property_get_cpuset(
sd_bus *bus,
const char *path,
const char *interface,
const char *property,
sd_bus_message *reply,
void *userdata,
sd_bus_error *error) {
CPUSet *cpus = userdata;
_cleanup_free_ uint8_t *array = NULL;
size_t allocated;
assert(bus);
assert(reply);
assert(cpus);
(void) cpu_set_to_dbus(cpus, &array, &allocated);
return sd_bus_message_append_array(reply, 'y', array, allocated);
}
static int property_get_io_device_weight(
sd_bus *bus,
const char *path,
@ -332,6 +353,8 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_PROPERTY("StartupCPUShares", "t", NULL, offsetof(CGroupContext, startup_cpu_shares), 0),
SD_BUS_PROPERTY("CPUQuotaPerSecUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_per_sec_usec), 0),
SD_BUS_PROPERTY("CPUQuotaPeriodUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_period_usec), 0),
SD_BUS_PROPERTY("AllowedCPUs", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_cpus), 0),
SD_BUS_PROPERTY("AllowedMemoryNodes", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_mems), 0),
SD_BUS_PROPERTY("IOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, io_accounting), 0),
SD_BUS_PROPERTY("IOWeight", "t", NULL, offsetof(CGroupContext, io_weight), 0),
SD_BUS_PROPERTY("StartupIOWeight", "t", NULL, offsetof(CGroupContext, startup_io_weight), 0),
@ -856,6 +879,42 @@ int bus_cgroup_set_property(
return 1;
} else if (STR_IN_SET(name, "AllowedCPUs", "AllowedMemoryNodes")) {
const void *a;
size_t n;
_cleanup_(cpu_set_reset) CPUSet new_set = {};
r = sd_bus_message_read_array(message, 'y', &a, &n);
if (r < 0)
return r;
r = cpu_set_from_dbus(a, n, &new_set);
if (r < 0)
return r;
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
_cleanup_free_ char *setstr = NULL;
_cleanup_free_ char *data = NULL;
CPUSet *set;
setstr = cpu_set_to_range_string(&new_set);
if (streq(name, "AllowedCPUs"))
set = &c->cpuset_cpus;
else
set = &c->cpuset_mems;
if (asprintf(&data, "%s=%s", name, setstr) < 0)
return -ENOMEM;
cpu_set_reset(set);
cpu_set_add_all(set, &new_set);
unit_invalidate_cgroup(u, CGROUP_MASK_CPUSET);
unit_write_setting(u, flags, name, data);
}
return 1;
} else if ((iol_type = cgroup_io_limit_type_from_string(name)) >= 0) {
const char *path;
unsigned n = 0;

View File

@ -957,6 +957,52 @@ static int property_get_cpu_usage(
return sd_bus_message_append(reply, "t", ns);
}
static int property_get_cpuset_cpus(
sd_bus *bus,
const char *path,
const char *interface,
const char *property,
sd_bus_message *reply,
void *userdata,
sd_bus_error *error) {
Unit *u = userdata;
_cleanup_(cpu_set_reset) CPUSet cpus = {};
_cleanup_free_ uint8_t *array = NULL;
size_t allocated;
assert(bus);
assert(reply);
assert(u);
(void) unit_get_cpuset(u, &cpus, "cpuset.cpus.effective");
(void) cpu_set_to_dbus(&cpus, &array, &allocated);
return sd_bus_message_append_array(reply, 'y', array, allocated);
}
static int property_get_cpuset_mems(
sd_bus *bus,
const char *path,
const char *interface,
const char *property,
sd_bus_message *reply,
void *userdata,
sd_bus_error *error) {
Unit *u = userdata;
_cleanup_(cpu_set_reset) CPUSet mems = {};
_cleanup_free_ uint8_t *array = NULL;
size_t allocated;
assert(bus);
assert(reply);
assert(u);
(void) unit_get_cpuset(u, &mems, "cpuset.mems.effective");
(void) cpu_set_to_dbus(&mems, &array, &allocated);
return sd_bus_message_append_array(reply, 'y', array, allocated);
}
static int property_get_cgroup(
sd_bus *bus,
const char *path,
@ -1306,6 +1352,8 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = {
SD_BUS_PROPERTY("ControlGroup", "s", property_get_cgroup, 0, 0),
SD_BUS_PROPERTY("MemoryCurrent", "t", property_get_current_memory, 0, 0),
SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0),
SD_BUS_PROPERTY("EffectiveCPUs", "ay", property_get_cpuset_cpus, 0, 0),
SD_BUS_PROPERTY("EffectiveMemoryNodes", "ay", property_get_cpuset_mems, 0, 0),
SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0),
SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0),
SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0),

View File

@ -173,6 +173,8 @@ $1.CPUShares, config_parse_cpu_shares, 0,
$1.StartupCPUShares, config_parse_cpu_shares, 0, offsetof($1, cgroup_context.startup_cpu_shares)
$1.CPUQuota, config_parse_cpu_quota, 0, offsetof($1, cgroup_context)
$1.CPUQuotaPeriodSec, config_parse_sec_def_infinity, 0, offsetof($1, cgroup_context.cpu_quota_period_usec)
$1.CPUSetCpus, config_parse_cpuset_cpus, 0, offsetof($1, cgroup_context)
$1.CPUSetMems, config_parse_cpuset_mems, 0, offsetof($1, cgroup_context)
$1.MemoryAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.memory_accounting)
$1.MemoryMin, config_parse_memory_limit, 0, offsetof($1, cgroup_context)
$1.DefaultMemoryMin, config_parse_memory_limit, 0, offsetof($1, cgroup_context)

View File

@ -3149,6 +3149,44 @@ int config_parse_cpu_quota(
return 0;
}
int config_parse_cpuset_cpus(
const char *unit,
const char *filename,
unsigned line,
const char *section,
unsigned section_line,
const char *lvalue,
int ltype,
const char *rvalue,
void *data,
void *userdata) {
CGroupContext *c = data;
(void) parse_cpu_set_extend(rvalue, &c->cpuset_cpus, true, unit, filename, line, lvalue);
return 0;
}
int config_parse_cpuset_mems(
const char *unit,
const char *filename,
unsigned line,
const char *section,
unsigned section_line,
const char *lvalue,
int ltype,
const char *rvalue,
void *data,
void *userdata) {
CGroupContext *c = data;
(void) parse_cpu_set_extend(rvalue, &c->cpuset_mems, true, unit, filename, line, lvalue);
return 0;
}
int config_parse_memory_limit(
const char *unit,
const char *filename,

View File

@ -92,6 +92,8 @@ CONFIG_PARSER_PROTOTYPE(config_parse_set_status);
CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv);
CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems);
CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota);
CONFIG_PARSER_PROTOTYPE(config_parse_cpuset_cpus);
CONFIG_PARSER_PROTOTYPE(config_parse_cpuset_mems);
CONFIG_PARSER_PROTOTYPE(config_parse_protect_home);
CONFIG_PARSER_PROTOTYPE(config_parse_protect_system);
CONFIG_PARSER_PROTOTYPE(config_parse_bus_name);

View File

@ -435,6 +435,22 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons
return bus_append_cg_cpu_shares_parse(m, field, eq);
if (STR_IN_SET(field, "AllowedCPUs", "AllowedMemoryNodes")) {
_cleanup_(cpu_set_reset) CPUSet cpuset = {};
_cleanup_free_ uint8_t *array = NULL;
size_t allocated;
r = parse_cpu_set(eq, &cpuset);
if (r < 0)
return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
r = cpu_set_to_dbus(&cpuset, &array, &allocated);
if (r < 0)
return log_error_errno(r, "Failed to serialize CPUSet: %m");
return bus_append_byte_array(m, field, array, allocated);
}
if (STR_IN_SET(field, "BlockIOWeight", "StartupBlockIOWeight"))
return bus_append_cg_blkio_weight_parse(m, field, eq);

View File

@ -5411,7 +5411,7 @@ static int print_property(const char *name, const char *expected_value, sd_bus_m
bus_print_property_value(name, expected_value, value, strempty(fields));
return 1;
} else if (contents[0] == SD_BUS_TYPE_BYTE && STR_IN_SET(name, "CPUAffinity", "NUMAMask")) {
} else if (contents[0] == SD_BUS_TYPE_BYTE && STR_IN_SET(name, "CPUAffinity", "NUMAMask", "AllowedCPUs", "AllowedMemoryNodes", "EffectiveCPUs", "EffectiveMemoryNodes")) {
_cleanup_free_ char *affinity = NULL;
_cleanup_(cpu_set_reset) CPUSet set = {};
const void *a;

View File

@ -129,9 +129,10 @@ static void test_cg_mask_to_string_one(CGroupMask mask, const char *t) {
static void test_cg_mask_to_string(void) {
test_cg_mask_to_string_one(0, NULL);
test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct io blkio memory devices pids bpf-firewall bpf-devices");
test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct cpuset io blkio memory devices pids bpf-firewall bpf-devices");
test_cg_mask_to_string_one(CGROUP_MASK_CPU, "cpu");
test_cg_mask_to_string_one(CGROUP_MASK_CPUACCT, "cpuacct");
test_cg_mask_to_string_one(CGROUP_MASK_CPUSET, "cpuset");
test_cg_mask_to_string_one(CGROUP_MASK_IO, "io");
test_cg_mask_to_string_one(CGROUP_MASK_BLKIO, "blkio");
test_cg_mask_to_string_one(CGROUP_MASK_MEMORY, "memory");