core: make TasksMax a partially dynamic property

TasksMax= and DefaultTasksMax= can be specified as percentages. We don't
actually document of what the percentage is relative to, but the implementation
uses the smallest of /proc/sys/kernel/pid_max, /proc/sys/kernel/threads-max,
and /sys/fs/cgroup/pids.max (when present). When the value is a percentage,
we immediately convert it to an absolute value. If the limit later changes
(which can happen e.g. when systemd-sysctl runs), the absolute value becomes
outdated.

So let's store either the percentage or absolute value, whatever was specified,
and only convert to an absolute value when the value is used. For example, when
starting a unit, the absolute value will be calculated when the cgroup for
the unit is created.

Fixes #13419.
This commit is contained in:
Zbigniew Jędrzejewski-Szmek 2019-11-05 13:50:28 +01:00
parent a650e19820
commit 3a0f06c41a
12 changed files with 128 additions and 29 deletions

View File

@ -130,7 +130,6 @@ static inline bool CGROUP_BLKIO_WEIGHT_IS_OK(uint64_t x) {
}
/* Default resource limits */
#define DEFAULT_TASKS_MAX_PERCENTAGE 15U /* 15% of PIDs, 4915 on default settings */
#define DEFAULT_USER_TASKS_MAX_PERCENTAGE 33U /* 33% of PIDs, 10813 on default settings */
typedef enum CGroupUnified {

View File

@ -99,7 +99,6 @@ uint64_t physical_memory_scale(uint64_t v, uint64_t max) {
}
uint64_t system_tasks_max(void) {
uint64_t a = TASKS_MAX, b = TASKS_MAX;
_cleanup_free_ char *root = NULL;
int r;

View File

@ -16,6 +16,7 @@
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "limits-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
@ -34,6 +35,13 @@
* out specific attributes from us. */
#define LOG_LEVEL_CGROUP_WRITE(r) (IN_SET(abs(r), ENOENT, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING)
uint64_t tasks_max_resolve(const TasksMax *tasks_max) {
if (tasks_max->scale == 0)
return tasks_max->value;
return system_tasks_max_scale(tasks_max->value, tasks_max->scale);
}
bool manager_owns_host_root_cgroup(Manager *m) {
assert(m);
@ -117,7 +125,7 @@ void cgroup_context_init(CGroupContext *c) {
.blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
.startup_blockio_weight = CGROUP_BLKIO_WEIGHT_INVALID,
.tasks_max = CGROUP_LIMIT_MAX,
.tasks_max = TASKS_MAX_UNSET,
};
}
@ -447,7 +455,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
prefix, c->memory_max, format_cgroup_memory_limit_comparison(cdd, sizeof(cdd), u, "MemoryMax"),
prefix, c->memory_swap_max, format_cgroup_memory_limit_comparison(cde, sizeof(cde), u, "MemorySwapMax"),
prefix, c->memory_limit,
prefix, c->tasks_max,
prefix, tasks_max_resolve(&c->tasks_max),
prefix, cgroup_device_policy_to_string(c->device_policy),
prefix, strempty(disable_controllers_str),
prefix, yes_no(c->delegate));
@ -1339,9 +1347,9 @@ static void cgroup_context_apply(
* which is desirable so that there's an official way to release control of the sysctl from
* systemd: set the limit to unbounded and reload. */
if (c->tasks_max != CGROUP_LIMIT_MAX) {
if (tasks_max_isset(&c->tasks_max)) {
u->manager->sysctl_pid_max_changed = true;
r = procfs_tasks_set_limit(c->tasks_max);
r = procfs_tasks_set_limit(tasks_max_resolve(&c->tasks_max));
} else if (u->manager->sysctl_pid_max_changed)
r = procfs_tasks_set_limit(TASKS_MAX);
else
@ -1354,10 +1362,10 @@ static void cgroup_context_apply(
/* The attribute itself is not available on the host root cgroup, and in the container case we want to
* leave it for the container manager. */
if (!is_local_root) {
if (c->tasks_max != CGROUP_LIMIT_MAX) {
char buf[DECIMAL_STR_MAX(uint64_t) + 2];
if (tasks_max_isset(&c->tasks_max)) {
char buf[DECIMAL_STR_MAX(uint64_t) + 1];
sprintf(buf, "%" PRIu64 "\n", c->tasks_max);
xsprintf(buf, "%" PRIu64 "\n", tasks_max_resolve(&c->tasks_max));
(void) set_attribute_and_warn(u, "pids", "pids.max", buf);
} else
(void) set_attribute_and_warn(u, "pids", "pids.max", "max\n");
@ -1434,7 +1442,7 @@ static CGroupMask unit_get_cgroup_mask(Unit *u) {
mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
if (c->tasks_accounting ||
c->tasks_max != CGROUP_LIMIT_MAX)
tasks_max_isset(&c->tasks_max))
mask |= CGROUP_MASK_PIDS;
return CGROUP_MASK_EXTEND_JOINED(mask);

View File

@ -9,6 +9,21 @@
#include "list.h"
#include "time-util.h"
typedef struct TasksMax {
/* If scale == 0, just use value; otherwise, value / scale.
* See tasks_max_resolve(). */
uint64_t value;
uint64_t scale;
} TasksMax;
#define TASKS_MAX_UNSET ((TasksMax) { .value = UINT64_MAX, .scale = 0 })
static inline bool tasks_max_isset(const TasksMax *tasks_max) {
return tasks_max->value != UINT64_MAX || tasks_max->scale != 0;
}
uint64_t tasks_max_resolve(const TasksMax *tasks_max);
typedef struct CGroupContext CGroupContext;
typedef struct CGroupDeviceAllow CGroupDeviceAllow;
typedef struct CGroupIODeviceWeight CGroupIODeviceWeight;
@ -135,7 +150,7 @@ struct CGroupContext {
LIST_HEAD(CGroupDeviceAllow, device_allow);
/* Common */
uint64_t tasks_max;
TasksMax tasks_max;
};
/* Used when querying IP accounting data */

View File

@ -16,6 +16,8 @@
#include "limits-util.h"
#include "path-util.h"
BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", TasksMax, tasks_max_resolve);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy);
static int property_get_cgroup_mask(
@ -382,7 +384,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0),
SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0),
SD_BUS_PROPERTY("TasksAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, tasks_accounting), 0),
SD_BUS_PROPERTY("TasksMax", "t", NULL, offsetof(CGroupContext, tasks_max), 0),
SD_BUS_PROPERTY("TasksMax", "t", bus_property_get_tasks_max, offsetof(CGroupContext, tasks_max), 0),
SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0),
SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0),
SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0),
@ -715,9 +717,76 @@ BUS_DEFINE_SET_CGROUP_WEIGHT(blockio_weight, CGROUP_MASK_BLKIO, CGROUP_BLKIO_WEI
BUS_DEFINE_SET_CGROUP_LIMIT(memory, CGROUP_MASK_MEMORY, physical_memory_scale, 1);
BUS_DEFINE_SET_CGROUP_LIMIT(memory_protection, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
BUS_DEFINE_SET_CGROUP_LIMIT(swap, CGROUP_MASK_MEMORY, physical_memory_scale, 0);
BUS_DEFINE_SET_CGROUP_LIMIT(tasks_max, CGROUP_MASK_PIDS, system_tasks_max_scale, 1);
#pragma GCC diagnostic pop
static int bus_cgroup_set_tasks_max(
Unit *u,
const char *name,
TasksMax *p,
sd_bus_message *message,
UnitWriteFlags flags,
sd_bus_error *error) {
uint64_t v;
int r;
assert(p);
r = sd_bus_message_read(message, "t", &v);
if (r < 0)
return r;
if (v < 1)
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
"Value specified in %s is out of range", name);
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
*p = (TasksMax) { .value = v, .scale = 0 }; /* When .scale==0, .value is the absolute value */
unit_invalidate_cgroup(u, CGROUP_MASK_PIDS);
if (v == CGROUP_LIMIT_MAX)
unit_write_settingf(u, flags, name,
"%s=infinity", name);
else
unit_write_settingf(u, flags, name,
"%s=%" PRIu64, name, v);
}
return 1;
}
static int bus_cgroup_set_tasks_max_scale(
Unit *u,
const char *name,
TasksMax *p,
sd_bus_message *message,
UnitWriteFlags flags,
sd_bus_error *error) {
uint32_t v;
int r;
assert(p);
r = sd_bus_message_read(message, "u", &v);
if (r < 0)
return r;
if (v < 1 || v >= UINT32_MAX)
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
"Value specified in %s is out of range", name);
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
*p = (TasksMax) { v, UINT32_MAX }; /* .scale is not 0, so this is interpreted as v/UINT32_MAX. */
unit_invalidate_cgroup(u, CGROUP_MASK_PIDS);
unit_write_settingf(u, flags, name, "%s=%" PRIu32 "%%", "TasksMax",
(uint32_t) (DIV_ROUND_UP((uint64_t) v * 100U, (uint64_t) UINT32_MAX)));
}
return 1;
}
int bus_cgroup_set_property(
Unit *u,
CGroupContext *c,

View File

@ -9,4 +9,6 @@
extern const sd_bus_vtable bus_cgroup_vtable[];
int bus_property_get_tasks_max(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
int bus_cgroup_set_property(Unit *u, CGroupContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);

View File

@ -9,6 +9,7 @@
#include "architecture.h"
#include "build.h"
#include "bus-common-errors.h"
#include "dbus-cgroup.h"
#include "dbus-execute.h"
#include "dbus-job.h"
#include "dbus-manager.h"
@ -2465,7 +2466,7 @@ const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_PROPERTY("DefaultLimitRTPRIOSoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTPRIO]), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultLimitRTTIME", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultTasksMax", "t", NULL, offsetof(Manager, default_tasks_max), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultTasksMax", "t", bus_property_get_tasks_max, offsetof(Manager, default_tasks_max), 0),
SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, default_oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),

View File

@ -3263,36 +3263,39 @@ int config_parse_tasks_max(
void *data,
void *userdata) {
uint64_t *tasks_max = data, v;
const Unit *u = userdata;
TasksMax *tasks_max = data;
uint64_t v;
int r;
if (isempty(rvalue)) {
*tasks_max = u ? u->manager->default_tasks_max : UINT64_MAX;
*tasks_max = u ? u->manager->default_tasks_max : TASKS_MAX_UNSET;
return 0;
}
if (streq(rvalue, "infinity")) {
*tasks_max = CGROUP_LIMIT_MAX;
*tasks_max = TASKS_MAX_UNSET;
return 0;
}
r = parse_permille(rvalue);
if (r < 0) {
if (r >= 0)
*tasks_max = (TasksMax) { r, 1000U }; /* r‰ */
else {
r = safe_atou64(rvalue, &v);
if (r < 0) {
log_syntax(unit, LOG_ERR, filename, line, r, "Invalid maximum tasks value '%s', ignoring: %m", rvalue);
return 0;
}
} else
v = system_tasks_max_scale(r, 1000U);
if (v <= 0 || v >= UINT64_MAX) {
log_syntax(unit, LOG_ERR, filename, line, 0, "Maximum tasks value '%s' out of range, ignoring.", rvalue);
return 0;
if (v <= 0 || v >= UINT64_MAX) {
log_syntax(unit, LOG_ERR, filename, line, 0, "Maximum tasks value '%s' out of range, ignoring.", rvalue);
return 0;
}
*tasks_max = (TasksMax) { v };
}
*tasks_max = v;
return 0;
}

View File

@ -84,6 +84,8 @@
#include <sanitizer/lsan_interface.h>
#endif
#define DEFAULT_TASKS_MAX ((TasksMax) { 15U, 100U }) /* 15% */
static enum {
ACTION_RUN,
ACTION_HELP,
@ -135,7 +137,7 @@ static bool arg_default_ip_accounting;
static bool arg_default_blockio_accounting;
static bool arg_default_memory_accounting;
static bool arg_default_tasks_accounting;
static uint64_t arg_default_tasks_max;
static TasksMax arg_default_tasks_max;
static sd_id128_t arg_machine_id;
static EmergencyAction arg_cad_burst_action;
static OOMPolicy arg_default_oom_policy;
@ -2131,7 +2133,7 @@ static void reset_arguments(void) {
arg_default_blockio_accounting = false;
arg_default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT;
arg_default_tasks_accounting = true;
arg_default_tasks_max = system_tasks_max_scale(DEFAULT_TASKS_MAX_PERCENTAGE, 100U);
arg_default_tasks_max = DEFAULT_TASKS_MAX;
arg_machine_id = (sd_id128_t) {};
arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
arg_default_oom_policy = OOM_STOP;

View File

@ -762,7 +762,7 @@ int manager_new(UnitFileScope scope, ManagerTestRunFlags test_run_flags, Manager
.default_timer_accuracy_usec = USEC_PER_MINUTE,
.default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT,
.default_tasks_accounting = true,
.default_tasks_max = UINT64_MAX,
.default_tasks_max = TASKS_MAX_UNSET,
.default_timeout_start_usec = DEFAULT_TIMEOUT_USEC,
.default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC,
.default_restart_usec = DEFAULT_RESTART_USEC,

View File

@ -9,6 +9,7 @@
#include "sd-event.h"
#include "cgroup-util.h"
#include "cgroup.h"
#include "fdset.h"
#include "hashmap.h"
#include "ip-address-access.h"
@ -349,7 +350,7 @@ struct Manager {
bool default_tasks_accounting;
bool default_ip_accounting;
uint64_t default_tasks_max;
TasksMax default_tasks_max;
usec_t default_timer_accuracy_usec;
OOMPolicy default_oom_policy;

View File

@ -56,7 +56,7 @@ static int test_cgroup_mask(void) {
m->default_blockio_accounting =
m->default_io_accounting =
m->default_tasks_accounting = false;
m->default_tasks_max = (uint64_t) -1;
m->default_tasks_max = TASKS_MAX_UNSET;
assert_se(manager_startup(m, NULL, NULL) >= 0);