diff --git a/NEWS b/NEWS
index e5a6612647..d4a60c9d06 100644
--- a/NEWS
+++ b/NEWS
@@ -104,6 +104,12 @@ CHANGES WITH 243 in spe:
all" pattern instead, e.g. OriginalName=* or Name=* in case all
interfaces should really be matched.
+ * A new setting NUMAPolicy= may be used to set process memory
+ allocation policy. The setting can be specified in system.conf and
+ hence will set the default policy for PID1. The default policy can be
+ overridden on a per-service basis. The related setting NUMAMask= is
+ used to specify the NUMA node mask that should be associated with the
+ selected policy.
…
CHANGES WITH 242:
diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml
index f5d419c519..9de04a7879 100644
--- a/man/systemd-system.conf.xml
+++ b/man/systemd-system.conf.xml
@@ -106,6 +106,25 @@
systemd.exec5.
+
+ NUMAPolicy=
+
+ Configures the NUMA memory policy for the service manager and the default NUMA memory policy
+ for all forked off processes. Individual services may override the default policy with the
+ NUMAPolicy= setting in unit files, see
+ systemd.exec5.
+
+
+
+ NUMAMask=
+
+ Configures the NUMA node mask that will be associated with the selected NUMA policy. Note that
+ default and local NUMA policies don't require an explicit NUMA node mask and
+ the value of the option can be empty. Similarly to NUMAPolicy=, the value can be overridden
+ by individual services in unit files, see
+ systemd.exec5.
+
+
RuntimeWatchdogSec=
ShutdownWatchdogSec=
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 8f7d64d017..8963764bf6 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -762,6 +762,28 @@ CapabilityBoundingSet=~CAP_B CAP_C
details.
+
+ NUMAPolicy=
+
+ Controls the NUMA memory policy of the executed processes. Takes a policy type, one of:
+ default, preferred, bind, interleave and
+ local. A list of NUMA nodes that should be associated with the policy must be specified
+ in NUMAMask=. For more details on each policy, please see
+ set_mempolicy2. For an
+ overview of NUMA support in Linux, see
+ numa7.
+
+
+
+
+ NUMAMask=
+
+ Controls the NUMA node list which will be applied alongside the selected NUMA policy.
+ Takes a list of NUMA nodes and has the same syntax as a list of CPUs for the CPUAffinity=
+ option. Note that the list of NUMA nodes is not required for default and
+ local policies and for preferred policy we expect a single NUMA node.
+
+
IOSchedulingClass=
@@ -2918,6 +2940,12 @@ StandardInputData=SWNrIHNpdHplIGRhIHVuJyBlc3NlIEtsb3BzLAp1ZmYgZWVtYWwga2xvcHAncy
EXIT_CONFIGURATION_DIRECTORY
Failed to set up unit's configuration directory. See ConfigurationDirectory= above.
+
+ 242
+ EXIT_NUMA_POLICY
+ Failed to set up unit's NUMA memory policy. See NUMAPolicy= and NUMAMask= above.
+
+
diff --git a/meson.build b/meson.build
index 0a9b3d5b85..e9c44bbb94 100644
--- a/meson.build
+++ b/meson.build
@@ -496,6 +496,10 @@ foreach ident : [
#include '''],
['explicit_bzero' , '''#include '''],
['reallocarray', '''#include '''],
+ ['set_mempolicy', '''#include
+ #include '''],
+ ['get_mempolicy', '''#include
+ #include '''],
]
have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE')
diff --git a/src/basic/missing_syscall.h b/src/basic/missing_syscall.h
index d1aa32218b..cd455eb47c 100644
--- a/src/basic/missing_syscall.h
+++ b/src/basic/missing_syscall.h
@@ -444,3 +444,46 @@ static inline ssize_t missing_statx(int dfd, const char *filename, unsigned flag
# define statx missing_statx
#endif
+
+#if !HAVE_SET_MEMPOLICY
+
+/* Fallback definitions of the memory policy modes, compiled in only when the build
+ * did not detect a set_mempolicy() function. */
+enum {
+        MPOL_DEFAULT    = 0,
+        MPOL_PREFERRED  = 1,
+        MPOL_BIND       = 2,
+        MPOL_INTERLEAVE = 3,
+        MPOL_LOCAL      = 4,
+};
+
+/* Fallback for set_mempolicy(): issues the raw system call when its number is known
+ * at compile time, otherwise fails with -1 and errno set to ENOSYS. */
+static inline long missing_set_mempolicy(int mode, const unsigned long *nodemask,
+                                         unsigned long maxnode) {
+#  ifdef __NR_set_mempolicy
+        return syscall(__NR_set_mempolicy, mode, nodemask, maxnode);
+#  else
+        errno = ENOSYS;
+        return -1;
+#  endif
+}
+
+# define set_mempolicy missing_set_mempolicy
+#endif
+
+
+#if !HAVE_GET_MEMPOLICY
+/* Fallback for get_mempolicy(): issues the raw system call when its number is known
+ * at compile time, otherwise fails with -1 and errno set to ENOSYS. */
+static inline long missing_get_mempolicy(int *mode, unsigned long *nodemask,
+                                         unsigned long maxnode, void *addr,
+                                         unsigned long flags) {
+#  ifdef __NR_get_mempolicy
+        return syscall(__NR_get_mempolicy, mode, nodemask, maxnode, addr, flags);
+#  else
+        errno = ENOSYS;
+        return -1;
+#  endif
+}
+
+#define get_mempolicy missing_get_mempolicy
+#endif
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 4d5fb2eb10..c816569f2b 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -225,6 +225,48 @@ static int property_get_cpu_affinity(
return sd_bus_message_append_array(reply, 'y', array, allocated);
}
+/* D-Bus getter for the "NUMAMask" property. Serializes the NUMA node set of the
+ * ExecContext passed in userdata into a byte array and appends it to the reply as "ay".
+ * Returns the result of appending the array, or a negative error if serialization failed. */
+static int property_get_numa_mask(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        ExecContext *c = userdata;
+        _cleanup_free_ uint8_t *array = NULL;
+        size_t allocated = 0;
+        int r;
+
+        assert(bus);
+        assert(reply);
+        assert(c);
+
+        /* Do not ignore a serialization failure: the output size may be left unset in that case,
+         * and must not be handed on to sd_bus_message_append_array(). */
+        r = cpu_set_to_dbus(&c->numa_policy.nodes, &array, &allocated);
+        if (r < 0)
+                return r;
+
+        return sd_bus_message_append_array(reply, 'y', array, allocated);
+}
+
+/* D-Bus getter for the "NUMAPolicy" property. Appends the effective policy type of the
+ * ExecContext passed in userdata to the reply as a 32-bit signed integer ("i"). */
+static int property_get_numa_policy(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        ExecContext *context = userdata;
+        int32_t type;
+
+        assert(bus);
+        assert(reply);
+        assert(context);
+
+        /* Go through the accessor rather than reading the field directly */
+        type = numa_policy_get_type(&context->numa_policy);
+
+        return sd_bus_message_append_basic(reply, 'i', &type);
+}
+
static int property_get_timer_slack_nsec(
sd_bus *bus,
const char *path,
@@ -700,6 +742,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("CPUSchedulingPolicy", "i", property_get_cpu_sched_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("CPUSchedulingPriority", "i", property_get_cpu_sched_priority, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("CPUAffinity", "ay", property_get_cpu_affinity, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NUMAPolicy", "i", property_get_numa_policy, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("NUMAMask", "ay", property_get_numa_mask, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("CPUSchedulingResetOnFork", "b", bus_property_get_bool, offsetof(ExecContext, cpu_sched_reset_on_fork), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("NonBlocking", "b", bus_property_get_bool, offsetof(ExecContext, non_blocking), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1650,9 +1694,10 @@ int bus_exec_context_set_transient_property(
return 1;
}
#endif
- if (streq(name, "CPUAffinity")) {
+ if (STR_IN_SET(name, "CPUAffinity", "NUMAMask")) {
const void *a;
size_t n;
+ bool affinity = streq(name, "CPUAffinity");
_cleanup_(cpu_set_reset) CPUSet set = {};
r = sd_bus_message_read_array(message, 'y', &a, &n);
@@ -1665,7 +1710,7 @@ int bus_exec_context_set_transient_property(
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
if (n == 0) {
- cpu_set_reset(&c->cpu_set);
+ cpu_set_reset(affinity ? &c->cpu_set : &c->numa_policy.nodes);
unit_write_settingf(u, flags, name, "%s=", name);
} else {
_cleanup_free_ char *str = NULL;
@@ -1677,7 +1722,7 @@ int bus_exec_context_set_transient_property(
/* We forego any optimizations here, and always create the structure using
* cpu_set_add_all(), because we don't want to care if the existing size we
* got over dbus is appropriate. */
- r = cpu_set_add_all(&c->cpu_set, &set);
+ r = cpu_set_add_all(affinity ? &c->cpu_set : &c->numa_policy.nodes, &set);
if (r < 0)
return r;
@@ -1687,6 +1732,20 @@ int bus_exec_context_set_transient_property(
return 1;
+ } else if (streq(name, "NUMAPolicy")) {
+ int32_t type;
+
+ r = sd_bus_message_read(message, "i", &type);
+ if (r < 0)
+ return r;
+
+ if (!mpol_is_valid(type))
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid NUMAPolicy value: %i", type);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags))
+ c->numa_policy.type = type;
+
+ return 1;
} else if (streq(name, "Nice")) {
int32_t q;
diff --git a/src/core/execute.c b/src/core/execute.c
index 921449391d..426e57b8e0 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -3148,6 +3148,16 @@ static int exec_child(
return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
}
+ if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
+ r = apply_numa_policy(&context->numa_policy);
+ if (r == -EOPNOTSUPP)
+ log_unit_debug_errno(unit, SYNTHETIC_ERRNO(r), "NUMA support not available, ignoring.");
+ else if (r < 0) {
+ *exit_status = EXIT_NUMA_POLICY;
+ return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
+ }
+ }
+
if (context->ioprio_set)
if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
*exit_status = EXIT_IOPRIO;
@@ -3854,6 +3864,7 @@ void exec_context_init(ExecContext *c) {
assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
c->log_level_max = -1;
+ numa_policy_reset(&c->numa_policy);
}
void exec_context_done(ExecContext *c) {
@@ -3898,6 +3909,7 @@ void exec_context_done(ExecContext *c) {
c->n_temporary_filesystems = 0;
cpu_set_reset(&c->cpu_set);
+ numa_policy_reset(&c->numa_policy);
c->utmp_id = mfree(c->utmp_id);
c->selinux_context = mfree(c->selinux_context);
@@ -4336,6 +4348,14 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
}
+ if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
+ _cleanup_free_ char *nodes = NULL;
+
+ nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
+ fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
+ fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
+ }
+
if (c->timer_slack_nsec != NSEC_INFINITY)
fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
diff --git a/src/core/execute.h b/src/core/execute.h
index 780876826f..609e15fc07 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -167,6 +167,7 @@ struct ExecContext {
int cpu_sched_priority;
CPUSet cpu_set;
+ NUMAPolicy numa_policy;
ExecInput std_input;
ExecOutput std_output;
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index b868a367f1..5e6fb64093 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -36,6 +36,8 @@ $1.CPUSchedulingPolicy, config_parse_exec_cpu_sched_policy, 0,
$1.CPUSchedulingPriority, config_parse_exec_cpu_sched_prio, 0, offsetof($1, exec_context)
$1.CPUSchedulingResetOnFork, config_parse_bool, 0, offsetof($1, exec_context.cpu_sched_reset_on_fork)
$1.CPUAffinity, config_parse_exec_cpu_affinity, 0, offsetof($1, exec_context)
+$1.NUMAPolicy, config_parse_numa_policy, 0, offsetof($1, exec_context.numa_policy.type)
+$1.NUMAMask, config_parse_numa_mask, 0, offsetof($1, exec_context.numa_policy)
$1.UMask, config_parse_mode, 0, offsetof($1, exec_context.umask)
$1.Environment, config_parse_environ, 0, offsetof($1, exec_context.environment)
$1.EnvironmentFile, config_parse_unit_env_file, 0, offsetof($1, exec_context.environment_files)
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index 5c413be08f..274d9d2fef 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -92,6 +92,7 @@ DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint
DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight");
DEFINE_CONFIG_PARSE_PTR(config_parse_cpu_shares, cg_cpu_shares_parse, uint64_t, "Invalid CPU shares");
DEFINE_CONFIG_PARSE_PTR(config_parse_exec_mount_flags, mount_propagation_flags_from_string, unsigned long, "Failed to parse mount flag");
+DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_numa_policy, mpol, int, -1, "Invalid NUMA policy type");
int config_parse_unit_deps(
const char *unit,
@@ -1211,6 +1212,33 @@ int config_parse_exec_cpu_sched_policy(const char *unit,
return 0;
}
+/* Config parser for NUMAMask=. 'data' points to the NUMAPolicy whose node set is updated
+ * from 'rvalue'. A parse failure is logged and ignored (0 is returned); otherwise the
+ * result of parse_cpu_set_extend() is passed through. */
+int config_parse_numa_mask(const char *unit,
+                           const char *filename,
+                           unsigned line,
+                           const char *section,
+                           unsigned section_line,
+                           const char *lvalue,
+                           int ltype,
+                           const char *rvalue,
+                           void *data,
+                           void *userdata) {
+        NUMAPolicy *policy = data;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        r = parse_cpu_set_extend(rvalue, &policy->nodes, true, unit, filename, line, lvalue);
+        if (r >= 0)
+                return r;
+
+        /* Invalid masks are not fatal for loading the configuration */
+        log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse NUMA node mask, ignoring: %s", rvalue);
+        return 0;
+}
+
int config_parse_exec_cpu_sched_prio(const char *unit,
const char *filename,
unsigned line,
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
index 0891f36760..ddcc8d216d 100644
--- a/src/core/load-fragment.h
+++ b/src/core/load-fragment.h
@@ -108,6 +108,8 @@ CONFIG_PARSER_PROTOTYPE(config_parse_pid_file);
CONFIG_PARSER_PROTOTYPE(config_parse_exit_status);
CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers);
CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_policy);
+CONFIG_PARSER_PROTOTYPE(config_parse_numa_mask);
/* gperf prototypes */
const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
diff --git a/src/core/main.c b/src/core/main.c
index 3a41573ef0..d74e8737e6 100644
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -142,6 +142,7 @@ static sd_id128_t arg_machine_id;
static EmergencyAction arg_cad_burst_action;
static OOMPolicy arg_default_oom_policy;
static CPUSet arg_cpu_affinity;
+static NUMAPolicy arg_numa_policy;
static int parse_configuration(void);
@@ -720,6 +721,8 @@ static int parse_config_file(void) {
{ "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot },
{ "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status },
{ "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity },
+ { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type },
+ { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy },
{ "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
{ "Manager", "RuntimeWatchdogSec", config_parse_sec, 0, &arg_runtime_watchdog },
{ "Manager", "ShutdownWatchdogSec", config_parse_sec, 0, &arg_shutdown_watchdog },
@@ -1753,6 +1756,27 @@ static void update_cpu_affinity(bool skip_setup) {
log_warning_errno(errno, "Failed to set CPU affinity: %m");
}
+/* Applies the NUMA policy configured for the manager itself (arg_numa_policy). Does nothing
+ * if setup is to be skipped or no valid policy type is configured. Failures are only logged. */
+static void update_numa_policy(bool skip_setup) {
+        int r;
+
+        if (skip_setup)
+                return;
+
+        if (!mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
+                return;
+
+        if (DEBUG_LOGGING) {
+                const char *policy_name = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
+                _cleanup_free_ char *node_list = cpu_set_to_range_string(&arg_numa_policy.nodes);
+
+                log_debug("Setting NUMA policy to %s, with nodes %s.", strnull(policy_name), strnull(node_list));
+        }
+
+        r = apply_numa_policy(&arg_numa_policy);
+        if (r >= 0)
+                return;
+
+        /* Missing NUMA support is expected on some systems, hence only worth a debug message */
+        if (r == -EOPNOTSUPP)
+                log_debug_errno(r, "NUMA support not available, ignoring.");
+        else
+                log_warning_errno(r, "Failed to set NUMA memory policy: %m");
+}
+
static void do_reexecute(
int argc,
char *argv[],
@@ -1924,6 +1948,7 @@ static int invoke_main_loop(
set_manager_defaults(m);
update_cpu_affinity(false);
+ update_numa_policy(false);
if (saved_log_level >= 0)
manager_override_log_level(m, saved_log_level);
@@ -2084,6 +2109,7 @@ static int initialize_runtime(
return 0;
update_cpu_affinity(skip_setup);
+ update_numa_policy(skip_setup);
if (arg_system) {
/* Make sure we leave a core dump without panicking the kernel. */
@@ -2262,6 +2288,7 @@ static void reset_arguments(void) {
arg_default_oom_policy = OOM_STOP;
cpu_set_reset(&arg_cpu_affinity);
+ numa_policy_reset(&arg_numa_policy);
}
static int parse_configuration(void) {
diff --git a/src/core/system.conf.in b/src/core/system.conf.in
index 548e6dfb8c..20f56969cc 100644
--- a/src/core/system.conf.in
+++ b/src/core/system.conf.in
@@ -23,6 +23,8 @@
#CrashReboot=no
#CtrlAltDelBurstAction=reboot-force
#CPUAffinity=1 2
+#NUMAPolicy=default
+#NUMAMask=
#RuntimeWatchdogSec=0
#ShutdownWatchdogSec=10min
#WatchdogDevice=
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index fd26b86359..bb30e8f151 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -1049,6 +1049,34 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
return bus_append_byte_array(m, field, array, allocated);
}
+ if (streq(field, "NUMAPolicy")) {
+ r = mpol_from_string(eq);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
+
+ r = sd_bus_message_append(m, "(sv)", field, "i", (int32_t) r);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (streq(field, "NUMAMask")) {
+ _cleanup_(cpu_set_reset) CPUSet nodes = {};
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated;
+
+ r = parse_cpu_set(eq, &nodes);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
+
+ r = cpu_set_to_dbus(&nodes, &array, &allocated);
+ if (r < 0)
+ return log_error_errno(r, "Failed to serialize NUMAMask: %m");
+
+ return bus_append_byte_array(m, field, array, allocated);
+ }
+
if (STR_IN_SET(field, "RestrictAddressFamilies", "SystemCallFilter")) {
int whitelist = 1;
const char *p = eq;
diff --git a/src/shared/cpu-set-util.c b/src/shared/cpu-set-util.c
index b0036c7f61..f27543dfe2 100644
--- a/src/shared/cpu-set-util.c
+++ b/src/shared/cpu-set-util.c
@@ -7,12 +7,20 @@
#include "alloc-util.h"
#include "cpu-set-util.h"
+#include "dirent-util.h"
+#include "errno-util.h"
#include "extract-word.h"
+#include "fd-util.h"
#include "log.h"
#include "macro.h"
#include "memory-util.h"
+#include "missing_syscall.h"
#include "parse-util.h"
+#include "stat-util.h"
#include "string-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "util.h"
char* cpu_set_to_string(const CPUSet *a) {
_cleanup_free_ char *str = NULL;
@@ -287,3 +295,88 @@ int cpu_set_from_dbus(const uint8_t *bits, size_t size, CPUSet *set) {
s = (CPUSet) {};
return 0;
}
+
+/* Checks whether the policy type and node mask form a usable combination:
+ *  - the effective type must be a known MPOL_* value,
+ *  - without a node mask only MPOL_DEFAULT, MPOL_LOCAL and MPOL_PREFERRED are accepted,
+ *  - with a node mask, MPOL_PREFERRED requires exactly one node to be set. */
+bool numa_policy_is_valid(const NUMAPolicy *policy) {
+        int type;
+
+        assert(policy);
+
+        type = numa_policy_get_type(policy);
+        if (!mpol_is_valid(type))
+                return false;
+
+        if (!policy->nodes.set)
+                return IN_SET(type, MPOL_DEFAULT, MPOL_LOCAL, MPOL_PREFERRED);
+
+        if (type == MPOL_PREFERRED)
+                return CPU_COUNT_S(policy->nodes.allocated, policy->nodes.set) == 1;
+
+        return true;
+}
+
+/* Converts the node set of 'policy' into the (nodemask, maxnode) pair that is passed to
+ * set_mempolicy(). For MPOL_DEFAULT, MPOL_LOCAL, and MPOL_PREFERRED without a node set,
+ * a NULL mask and a maxnode of 0 are returned. On success the caller owns *ret_nodes.
+ * Returns 0 on success, -ENOMEM if the mask cannot be allocated. */
+static int numa_policy_to_mempolicy(const NUMAPolicy *policy, unsigned long *ret_maxnode, unsigned long **ret_nodes) {
+        const unsigned bits_per_word = sizeof(unsigned long) * 8;
+        _cleanup_free_ unsigned long *mask = NULL;
+        unsigned n_bits, i;
+        int type;
+
+        assert(policy);
+        assert(ret_maxnode);
+        assert(ret_nodes);
+
+        type = numa_policy_get_type(policy);
+        if (type == MPOL_DEFAULT || type == MPOL_LOCAL ||
+            (type == MPOL_PREFERRED && !policy->nodes.set)) {
+                *ret_nodes = NULL;
+                *ret_maxnode = 0;
+                return 0;
+        }
+
+        n_bits = policy->nodes.allocated * 8;
+
+        mask = new0(unsigned long, DIV_ROUND_UP(policy->nodes.allocated, sizeof(unsigned long)));
+        if (!mask)
+                return -ENOMEM;
+
+        /* The internal layout libc uses for the set is treated as opaque here, so the mask is
+         * rebuilt bit by bit in the unsigned long array form that set_mempolicy() takes. */
+        for (i = 0; i < n_bits; i++) {
+                if (!CPU_ISSET_S(i, policy->nodes.allocated, policy->nodes.set))
+                        continue;
+
+                mask[i / bits_per_word] |= 1ul << (i % bits_per_word);
+        }
+
+        *ret_nodes = TAKE_PTR(mask);
+        *ret_maxnode = n_bits + 1;
+        return 0;
+}
+
+/* Applies 'policy' to the calling process via set_mempolicy().
+ * Returns 0 on success, -EOPNOTSUPP if get_mempolicy() reports ENOSYS, -EINVAL if the
+ * policy is not valid, or another negative errno-style value on failure. */
+int apply_numa_policy(const NUMAPolicy *policy) {
+        _cleanup_free_ unsigned long *nodemask = NULL;
+        unsigned long maxnode;
+        int r;
+
+        assert(policy);
+
+        /* Probe with get_mempolicy() first to find out whether the system call exists at all */
+        if (get_mempolicy(NULL, NULL, 0, NULL, 0) < 0 && errno == ENOSYS)
+                return -EOPNOTSUPP;
+
+        if (!numa_policy_is_valid(policy))
+                return -EINVAL;
+
+        r = numa_policy_to_mempolicy(policy, &maxnode, &nodemask);
+        if (r < 0)
+                return r;
+
+        if (set_mempolicy(numa_policy_get_type(policy), nodemask, maxnode) < 0)
+                return -errno;
+
+        return 0;
+}
+
+/* Names of the memory policy modes, as accepted by NUMAPolicy= and used by
+ * mpol_to_string()/mpol_from_string(). */
+static const char* const mpol_table[] = {
+        [MPOL_DEFAULT]    = "default",
+        [MPOL_PREFERRED]  = "preferred",
+        [MPOL_BIND]       = "bind",
+        [MPOL_INTERLEAVE] = "interleave",
+        [MPOL_LOCAL]      = "local",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(mpol, int);
diff --git a/src/shared/cpu-set-util.h b/src/shared/cpu-set-util.h
index fd6a15f446..27812dfd59 100644
--- a/src/shared/cpu-set-util.h
+++ b/src/shared/cpu-set-util.h
@@ -4,6 +4,7 @@
#include
#include "macro.h"
+#include "missing_syscall.h"
/* This wraps the libc interface with a variable to keep the allocated size. */
typedef struct CPUSet {
@@ -48,3 +49,30 @@ int cpu_set_to_dbus(const CPUSet *set, uint8_t **ret, size_t *allocated);
int cpu_set_from_dbus(const uint8_t *bits, size_t size, CPUSet *set);
int cpus_in_affinity_mask(void);
+
+/* Returns true if t lies within the range MPOL_DEFAULT..MPOL_LOCAL (inclusive). */
+static inline bool mpol_is_valid(int t) {
+        if (t < MPOL_DEFAULT)
+                return false;
+
+        return t <= MPOL_LOCAL;
+}
+
+/* A NUMA memory policy: a policy type together with the set of NUMA nodes it refers to. */
+typedef struct NUMAPolicy {
+ /* Policy type (MPOL_*); negative when no type has been set explicitly.
+  * Always use numa_policy_get_type() to read the value */
+ int type;
+ /* NUMA node mask, kept in the same container type that is used for CPU sets */
+ CPUSet nodes;
+} NUMAPolicy;
+
+bool numa_policy_is_valid(const NUMAPolicy *p);
+
+/* Returns the effective policy type. If no type was set explicitly (negative value), a
+ * configured node mask implies MPOL_PREFERRED; with no node mask either, -1 is returned. */
+static inline int numa_policy_get_type(const NUMAPolicy *p) {
+        if (p->type >= 0)
+                return p->type;
+
+        return p->nodes.set ? MPOL_PREFERRED : -1;
+}
+
+/* Puts the policy back into its unset state: type -1 and a reset node mask. */
+static inline void numa_policy_reset(NUMAPolicy *p) {
+        assert(p);
+
+        p->type = -1;
+        cpu_set_reset(&p->nodes);
+}
+
+int apply_numa_policy(const NUMAPolicy *policy);
+
+const char* mpol_to_string(int i) _const_;
+int mpol_from_string(const char *s) _pure_;
diff --git a/src/shared/exit-status.c b/src/shared/exit-status.c
index 26b3060d9b..58ebc3ca4d 100644
--- a/src/shared/exit-status.c
+++ b/src/shared/exit-status.c
@@ -157,6 +157,9 @@ const char* exit_status_to_string(int status, ExitStatusLevel level) {
case EXIT_CONFIGURATION_DIRECTORY:
return "CONFIGURATION_DIRECTORY";
+ case EXIT_NUMA_POLICY:
+ return "NUMA_POLICY";
+
case EXIT_EXCEPTION:
return "EXCEPTION";
}
diff --git a/src/shared/exit-status.h b/src/shared/exit-status.h
index 510eb319cf..5637e6aa04 100644
--- a/src/shared/exit-status.h
+++ b/src/shared/exit-status.h
@@ -69,6 +69,7 @@ enum {
EXIT_CACHE_DIRECTORY,
EXIT_LOGS_DIRECTORY, /* 240 */
EXIT_CONFIGURATION_DIRECTORY,
+ EXIT_NUMA_POLICY,
EXIT_EXCEPTION = 255, /* Whenever we want to propagate an abnormal/signal exit, in line with bash */
};
diff --git a/src/systemctl/systemctl.c b/src/systemctl/systemctl.c
index 31bc776449..cf0c612923 100644
--- a/src/systemctl/systemctl.c
+++ b/src/systemctl/systemctl.c
@@ -4838,6 +4838,16 @@ static int print_property(const char *name, const char *expected_value, sd_bus_m
else if (all)
bus_print_property_value(name, expected_value, value, "[not set]");
+ return 1;
+ } else if (streq(name, "NUMAPolicy")) {
+ int32_t i;
+
+ r = sd_bus_message_read_basic(m, bus_type, &i);
+ if (r < 0)
+ return r;
+
+ bus_print_property_valuef(name, expected_value, value, "%s", strna(mpol_to_string(i)));
+
return 1;
}
break;
@@ -5451,7 +5461,7 @@ static int print_property(const char *name, const char *expected_value, sd_bus_m
bus_print_property_value(name, expected_value, value, strempty(fields));
return 1;
- } else if (contents[0] == SD_BUS_TYPE_BYTE && streq(name, "CPUAffinity")) {
+ } else if (contents[0] == SD_BUS_TYPE_BYTE && STR_IN_SET(name, "CPUAffinity", "NUMAMask")) {
_cleanup_free_ char *affinity = NULL;
_cleanup_(cpu_set_reset) CPUSet set = {};
const void *a;
@@ -5463,7 +5473,7 @@ static int print_property(const char *name, const char *expected_value, sd_bus_m
r = cpu_set_from_dbus(a, n, &set);
if (r < 0)
- return log_error_errno(r, "Failed to deserialize CPUAffinity: %m");
+ return log_error_errno(r, "Failed to deserialize %s: %m", name);
affinity = cpu_set_to_range_string(&set);
if (!affinity)