diff --git a/docs/TRANSIENT-SETTINGS.md b/docs/TRANSIENT-SETTINGS.md index 7ba5837e81..08d317ca41 100644 --- a/docs/TRANSIENT-SETTINGS.md +++ b/docs/TRANSIENT-SETTINGS.md @@ -226,6 +226,8 @@ All cgroup/resource control settings are available for transient units ✓ StartupCPUShares= ✓ CPUQuota= ✓ CPUQuotaPeriodSec= +✓ AllowedCPUs= +✓ AllowedMemoryNodes= ✓ MemoryAccounting= ✓ DefaultMemoryMin= ✓ MemoryMin= diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index d8fa24727a..98a0741359 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -214,6 +214,36 @@ + + AllowedCPUs= + + + Restrict processes to be executed on specific CPUs. Takes a list of CPU indices or ranges separated by either + whitespace or commas. CPU ranges are specified by the lower and upper CPU indices separated by a dash. + + Setting AllowedCPUs= doesn't guarantee that all of the CPUs will be used by the processes + as it may be limited by parent units. The effective configuration is reported as EffectiveCPUs=. + + This setting is supported only with the unified control group hierarchy. + + + + + AllowedMemoryNodes= + + + Restrict processes to be executed on specific memory NUMA nodes. Takes a list of memory NUMA nodes indices + or ranges separated by either whitespace or commas. Memory NUMA nodes ranges are specified by the lower and upper + CPU indices separated by a dash. + + Setting AllowedMemoryNodes= doesn't guarantee that all of the memory NUMA nodes will + be used by the processes as it may be limited by parent units. The effective configuration is reported as + EffectiveMemoryNodes=. + + This setting is supported only with the unified control group hierarchy. + + + MemoryAccounting= diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index d5c33014d7..80ec264261 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -2905,6 +2905,7 @@ bool fd_is_cgroup_fs(int fd) { static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = { [CGROUP_CONTROLLER_CPU] = "cpu", [CGROUP_CONTROLLER_CPUACCT] = "cpuacct", + [CGROUP_CONTROLLER_CPUSET] = "cpuset", [CGROUP_CONTROLLER_IO] = "io", [CGROUP_CONTROLLER_BLKIO] = "blkio", [CGROUP_CONTROLLER_MEMORY] = "memory", diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h index a39ab451b9..ab094fdc4f 100644 --- a/src/basic/cgroup-util.h +++ b/src/basic/cgroup-util.h @@ -20,6 +20,7 @@ typedef enum CGroupController { /* Original cgroup controllers */ CGROUP_CONTROLLER_CPU, CGROUP_CONTROLLER_CPUACCT, /* v1 only */ + CGROUP_CONTROLLER_CPUSET, /* v2 only */ CGROUP_CONTROLLER_IO, /* v2 only */ CGROUP_CONTROLLER_BLKIO, /* v1 only */ CGROUP_CONTROLLER_MEMORY, @@ -40,6 +41,7 @@ typedef enum CGroupController { typedef enum CGroupMask { CGROUP_MASK_CPU = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPU), CGROUP_MASK_CPUACCT = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPUACCT), + CGROUP_MASK_CPUSET = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPUSET), CGROUP_MASK_IO = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_IO), CGROUP_MASK_BLKIO = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BLKIO), CGROUP_MASK_MEMORY = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_MEMORY), @@ -52,7 +54,7 @@ typedef enum CGroupMask { CGROUP_MASK_V1 = CGROUP_MASK_CPU|CGROUP_MASK_CPUACCT|CGROUP_MASK_BLKIO|CGROUP_MASK_MEMORY|CGROUP_MASK_DEVICES|CGROUP_MASK_PIDS, /* All real cgroup v2 controllers */ - CGROUP_MASK_V2 = CGROUP_MASK_CPU|CGROUP_MASK_IO|CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS, + CGROUP_MASK_V2 = CGROUP_MASK_CPU|CGROUP_MASK_CPUSET|CGROUP_MASK_IO|CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS, /* All cgroup v2 BPF pseudo-controllers */ CGROUP_MASK_BPF = CGROUP_MASK_BPF_FIREWALL|CGROUP_MASK_BPF_DEVICES, diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 60a7799361..423742969e 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -202,10 +202,15 @@ void cgroup_context_done(CGroupContext *c) { c->ip_filters_ingress = strv_free(c->ip_filters_ingress); c->ip_filters_egress = strv_free(c->ip_filters_egress); + + cpu_set_reset(&c->cpuset_cpus); + cpu_set_reset(&c->cpuset_mems); } void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { _cleanup_free_ char *disable_controllers_str = NULL; + _cleanup_free_ char *cpuset_cpus = NULL; + _cleanup_free_ char *cpuset_mems = NULL; CGroupIODeviceLimit *il; CGroupIODeviceWeight *iw; CGroupIODeviceLatency *l; @@ -224,6 +229,9 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { (void) cg_mask_to_string(c->disable_controllers, &disable_controllers_str); + cpuset_cpus = cpu_set_to_range_string(&c->cpuset_cpus); + cpuset_mems = cpu_set_to_range_string(&c->cpuset_mems); + fprintf(f, "%sCPUAccounting=%s\n" "%sIOAccounting=%s\n" @@ -237,6 +245,8 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { "%sStartupCPUShares=%" PRIu64 "\n" "%sCPUQuotaPerSecSec=%s\n" "%sCPUQuotaPeriodSec=%s\n" + "%sAllowedCPUs=%s\n" + "%sAllowedMemoryNodes=%s\n" "%sIOWeight=%" PRIu64 "\n" "%sStartupIOWeight=%" PRIu64 "\n" "%sBlockIOWeight=%" PRIu64 "\n" @@ -265,6 +275,8 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { prefix, c->startup_cpu_shares, prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1), prefix, format_timespan(v, sizeof(v), c->cpu_quota_period_usec, 1), + prefix, cpuset_cpus, + prefix, cpuset_mems, prefix, c->io_weight, prefix, c->startup_io_weight, prefix, c->blockio_weight, @@ -796,6 +808,16 @@ static uint64_t cgroup_cpu_weight_to_shares(uint64_t weight) { CGROUP_CPU_SHARES_MIN, CGROUP_CPU_SHARES_MAX); } +static void cgroup_apply_unified_cpuset(Unit *u, CPUSet cpus, const char *name) { + _cleanup_free_ char *buf = NULL; + + buf = cpu_set_to_range_string(&cpus); + if (!buf) + return; + + (void) set_attribute_and_warn(u, "cpuset", name, buf); +} + static bool cgroup_context_has_io_config(CGroupContext *c) { return c->io_accounting || c->io_weight != CGROUP_WEIGHT_INVALID || @@ -1036,6 +1058,11 @@ static void cgroup_context_apply( } } + if ((apply_mask & CGROUP_MASK_CPUSET) && !is_local_root) { + cgroup_apply_unified_cpuset(u, c->cpuset_cpus, "cpuset.cpus"); + cgroup_apply_unified_cpuset(u, c->cpuset_mems, "cpuset.mems"); + } + /* The 'io' controller attributes are not exported on the host's root cgroup (being a pure cgroup v2 * controller), and in case of containers we want to leave control of these attributes to the container manager * (and we couldn't access that stuff anyway, even if we tried if proper delegation is used). */ @@ -1408,6 +1435,9 @@ static CGroupMask unit_get_cgroup_mask(Unit *u) { c->cpu_quota_per_sec_usec != USEC_INFINITY) mask |= CGROUP_MASK_CPU; + if (c->cpuset_cpus.set || c->cpuset_mems.set) + mask |= CGROUP_MASK_CPUSET; + if (cgroup_context_has_io_config(c) || cgroup_context_has_blockio_config(c)) mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO; @@ -3560,4 +3590,32 @@ static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = [CGROUP_STRICT] = "strict", }; +int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) { + _cleanup_free_ char *v = NULL; + int r; + + assert(u); + assert(cpus); + + if (!u->cgroup_path) + return -ENODATA; + + if ((u->cgroup_realized_mask & CGROUP_MASK_CPUSET) == 0) + return -ENODATA; + + r = cg_all_unified(); + if (r < 0) + return r; + if (r == 0) + return -ENODATA; + if (r > 0) + r = cg_get_attribute("cpuset", u->cgroup_path, name, &v); + if (r == -ENOENT) + return -ENODATA; + if (r < 0) + return r; + + return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL); +} + DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy); diff --git a/src/core/cgroup.h b/src/core/cgroup.h index d8ec070623..bca53fb980 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -4,6 +4,7 @@ #include #include "cgroup-util.h" +#include "cpu-set-util.h" #include "ip-address-access.h" #include "list.h" #include "time-util.h" @@ -92,6 +93,9 @@ struct CGroupContext { usec_t cpu_quota_per_sec_usec; usec_t cpu_quota_period_usec; + CPUSet cpuset_cpus; + CPUSet cpuset_mems; + uint64_t io_weight; uint64_t startup_io_weight; LIST_HEAD(CGroupIODeviceWeight, io_device_weights); @@ -254,3 +258,5 @@ CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_; bool unit_cgroup_delegate(Unit *u); int compare_job_priority(const void *a, const void *b); + +int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 2f2313c599..bd0e73befc 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -71,6 +71,27 @@ static int property_get_delegate_controllers( return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error); } +static int property_get_cpuset( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CPUSet *cpus = userdata; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + assert(cpus); + + (void) cpu_set_to_dbus(cpus, &array, &allocated); + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + static int property_get_io_device_weight( sd_bus *bus, const char *path, @@ -332,6 +353,8 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("StartupCPUShares", "t", NULL, offsetof(CGroupContext, startup_cpu_shares), 0), SD_BUS_PROPERTY("CPUQuotaPerSecUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_per_sec_usec), 0), SD_BUS_PROPERTY("CPUQuotaPeriodUSec", "t", bus_property_get_usec, offsetof(CGroupContext, cpu_quota_period_usec), 0), + SD_BUS_PROPERTY("AllowedCPUs", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_cpus), 0), + SD_BUS_PROPERTY("AllowedMemoryNodes", "ay", property_get_cpuset, offsetof(CGroupContext, cpuset_mems), 0), SD_BUS_PROPERTY("IOAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, io_accounting), 0), SD_BUS_PROPERTY("IOWeight", "t", NULL, offsetof(CGroupContext, io_weight), 0), SD_BUS_PROPERTY("StartupIOWeight", "t", NULL, offsetof(CGroupContext, startup_io_weight), 0), @@ -856,6 +879,42 @@ int bus_cgroup_set_property( return 1; + } else if (STR_IN_SET(name, "AllowedCPUs", "AllowedMemoryNodes")) { + const void *a; + size_t n; + _cleanup_(cpu_set_reset) CPUSet new_set = {}; + + r = sd_bus_message_read_array(message, 'y', &a, &n); + if (r < 0) + return r; + + r = cpu_set_from_dbus(a, n, &new_set); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *setstr = NULL; + _cleanup_free_ char *data = NULL; + CPUSet *set; + + setstr = cpu_set_to_range_string(&new_set); + + if (streq(name, "AllowedCPUs")) + set = &c->cpuset_cpus; + else + set = &c->cpuset_mems; + + if (asprintf(&data, "%s=%s", name, setstr) < 0) + return -ENOMEM; + + cpu_set_reset(set); + cpu_set_add_all(set, &new_set); + unit_invalidate_cgroup(u, CGROUP_MASK_CPUSET); + unit_write_setting(u, flags, name, data); + } + + return 1; + } else if ((iol_type = cgroup_io_limit_type_from_string(name)) >= 0) { const char *path; unsigned n = 0; diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c index 9722f53162..9477c47140 100644 --- a/src/core/dbus-unit.c +++ b/src/core/dbus-unit.c @@ -957,6 +957,52 @@ static int property_get_cpu_usage( return sd_bus_message_append(reply, "t", ns); } +static int property_get_cpuset_cpus( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + _cleanup_(cpu_set_reset) CPUSet cpus = {}; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + assert(u); + + (void) unit_get_cpuset(u, &cpus, "cpuset.cpus.effective"); + (void) cpu_set_to_dbus(&cpus, &array, &allocated); + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + +static int property_get_cpuset_mems( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + Unit *u = userdata; + _cleanup_(cpu_set_reset) CPUSet mems = {}; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + assert(bus); + assert(reply); + assert(u); + + (void) unit_get_cpuset(u, &mems, "cpuset.mems.effective"); + (void) cpu_set_to_dbus(&mems, &array, &allocated); + return sd_bus_message_append_array(reply, 'y', array, allocated); +} + static int property_get_cgroup( sd_bus *bus, const char *path, @@ -1306,6 +1352,8 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = { SD_BUS_PROPERTY("ControlGroup", "s", property_get_cgroup, 0, 0), SD_BUS_PROPERTY("MemoryCurrent", "t", property_get_current_memory, 0, 0), SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0), + SD_BUS_PROPERTY("EffectiveCPUs", "ay", property_get_cpuset_cpus, 0, 0), + SD_BUS_PROPERTY("EffectiveMemoryNodes", "ay", property_get_cpuset_mems, 0, 0), SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0), SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0), SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0), diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 765d7d503d..62dff99d86 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -173,6 +173,8 @@ $1.CPUShares, config_parse_cpu_shares, 0, $1.StartupCPUShares, config_parse_cpu_shares, 0, offsetof($1, cgroup_context.startup_cpu_shares) $1.CPUQuota, config_parse_cpu_quota, 0, offsetof($1, cgroup_context) $1.CPUQuotaPeriodSec, config_parse_sec_def_infinity, 0, offsetof($1, cgroup_context.cpu_quota_period_usec) +$1.CPUSetCpus, config_parse_cpuset_cpus, 0, offsetof($1, cgroup_context) +$1.CPUSetMems, config_parse_cpuset_mems, 0, offsetof($1, cgroup_context) $1.MemoryAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.memory_accounting) $1.MemoryMin, config_parse_memory_limit, 0, offsetof($1, cgroup_context) $1.DefaultMemoryMin, config_parse_memory_limit, 0, offsetof($1, cgroup_context) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 56cad2f7b8..5fdead22f9 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -3149,6 +3149,44 @@ int config_parse_cpu_quota( return 0; } +int config_parse_cpuset_cpus( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + + (void) parse_cpu_set_extend(rvalue, &c->cpuset_cpus, true, unit, filename, line, lvalue); + + return 0; +} + +int config_parse_cpuset_mems( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + CGroupContext *c = data; + + (void) parse_cpu_set_extend(rvalue, &c->cpuset_mems, true, unit, filename, line, lvalue); + + return 0; +} + int config_parse_memory_limit( const char *unit, const char *filename, diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 1183987d7d..e29d58f706 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -92,6 +92,8 @@ CONFIG_PARSER_PROTOTYPE(config_parse_set_status); CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv); CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems); CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota); +CONFIG_PARSER_PROTOTYPE(config_parse_cpuset_cpus); +CONFIG_PARSER_PROTOTYPE(config_parse_cpuset_mems); CONFIG_PARSER_PROTOTYPE(config_parse_protect_home); CONFIG_PARSER_PROTOTYPE(config_parse_protect_system); CONFIG_PARSER_PROTOTYPE(config_parse_bus_name); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 612ae84fff..2cd5adfd30 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -435,6 +435,22 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons return bus_append_cg_cpu_shares_parse(m, field, eq); + if (STR_IN_SET(field, "AllowedCPUs", "AllowedMemoryNodes")) { + _cleanup_(cpu_set_reset) CPUSet cpuset = {}; + _cleanup_free_ uint8_t *array = NULL; + size_t allocated; + + r = parse_cpu_set(eq, &cpuset); + if (r < 0) + return log_error_errno(r, "Failed to parse %s value: %s", field, eq); + + r = cpu_set_to_dbus(&cpuset, &array, &allocated); + if (r < 0) + return log_error_errno(r, "Failed to serialize CPUSet: %m"); + + return bus_append_byte_array(m, field, array, allocated); + } + if (STR_IN_SET(field, "BlockIOWeight", "StartupBlockIOWeight")) return bus_append_cg_blkio_weight_parse(m, field, eq); diff --git a/src/systemctl/systemctl.c b/src/systemctl/systemctl.c index 7917cbe39f..f50edbcee7 100644 --- a/src/systemctl/systemctl.c +++ b/src/systemctl/systemctl.c @@ -5411,7 +5411,7 @@ static int print_property(const char *name, const char *expected_value, sd_bus_m bus_print_property_value(name, expected_value, value, strempty(fields)); return 1; - } else if (contents[0] == SD_BUS_TYPE_BYTE && STR_IN_SET(name, "CPUAffinity", "NUMAMask")) { + } else if (contents[0] == SD_BUS_TYPE_BYTE && STR_IN_SET(name, "CPUAffinity", "NUMAMask", "AllowedCPUs", "AllowedMemoryNodes", "EffectiveCPUs", "EffectiveMemoryNodes")) { _cleanup_free_ char *affinity = NULL; _cleanup_(cpu_set_reset) CPUSet set = {}; const void *a; diff --git a/src/test/test-cgroup-mask.c b/src/test/test-cgroup-mask.c index 72a6551ffd..ef3755de41 100644 --- a/src/test/test-cgroup-mask.c +++ b/src/test/test-cgroup-mask.c @@ -129,9 +129,10 @@ static void test_cg_mask_to_string_one(CGroupMask mask, const char *t) { static void test_cg_mask_to_string(void) { test_cg_mask_to_string_one(0, NULL); - test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct io blkio memory devices pids bpf-firewall bpf-devices"); + test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct cpuset io blkio memory devices pids bpf-firewall bpf-devices"); test_cg_mask_to_string_one(CGROUP_MASK_CPU, "cpu"); test_cg_mask_to_string_one(CGROUP_MASK_CPUACCT, "cpuacct"); + test_cg_mask_to_string_one(CGROUP_MASK_CPUSET, "cpuset"); test_cg_mask_to_string_one(CGROUP_MASK_IO, "io"); test_cg_mask_to_string_one(CGROUP_MASK_BLKIO, "blkio"); test_cg_mask_to_string_one(CGROUP_MASK_MEMORY, "memory");