core: support cgroup v2 device controller

Cgroup v2 provides the eBPF-based device controller, which isn't currently
supported by systemd. This commit aims to provide such support.

There are no user-visible changes, just the device policy and whitelist
start working if cgroup v2 is used.
This commit is contained in:
Roman Gushchin 2018-10-08 14:33:05 -07:00
parent 91cfdd8d29
commit 084c700780
9 changed files with 360 additions and 40 deletions

View File

@ -2768,6 +2768,7 @@ static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
[CGROUP_CONTROLLER_DEVICES] = "devices",
[CGROUP_CONTROLLER_PIDS] = "pids",
[CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
[CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
};
DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);

View File

@ -30,6 +30,7 @@ typedef enum CGroupController {
/* BPF-based pseudo-controllers, v2 only */
CGROUP_CONTROLLER_BPF_FIREWALL,
CGROUP_CONTROLLER_BPF_DEVICES,
_CGROUP_CONTROLLER_MAX,
_CGROUP_CONTROLLER_INVALID = -1,
@ -47,6 +48,7 @@ typedef enum CGroupMask {
CGROUP_MASK_DEVICES = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_DEVICES),
CGROUP_MASK_PIDS = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_PIDS),
CGROUP_MASK_BPF_FIREWALL = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_FIREWALL),
CGROUP_MASK_BPF_DEVICES = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_DEVICES),
_CGROUP_MASK_ALL = CGROUP_CONTROLLER_TO_MASK(_CGROUP_CONTROLLER_MAX) - 1
} CGroupMask;

247
src/core/bpf-devices.c Normal file
View File

@ -0,0 +1,247 @@
/* SPDX-License-Identifier: LGPL-2.1+ */
#include <linux/libbpf.h>
#include "bpf-devices.h"
#include "bpf-program.h"
#define PASS_JUMP_OFF 4096
static int bpf_access_type(const char *acc) {
int r = 0;
assert(acc);
for (; *acc; acc++)
switch(*acc) {
case 'r':
r |= BPF_DEVCG_ACC_READ;
break;
case 'w':
r |= BPF_DEVCG_ACC_WRITE;
break;
case 'm':
r |= BPF_DEVCG_ACC_MKNOD;
break;
default:
return -EINVAL;
}
return r;
}
int cgroup_bpf_whitelist_device(BPFProgram *prog, int type, int major, int minor, const char *acc) {
struct bpf_insn insn[] = {
BPF_JMP_IMM(BPF_JNE, BPF_REG_2, type, 6), /* compare device type */
BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), /* calculate access type */
BPF_ALU32_IMM(BPF_AND, BPF_REG_1, 0),
BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 3), /* compare access type */
BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 2), /* compare major */
BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, 1), /* compare minor */
BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
};
int r, access;
assert(prog);
assert(acc);
access = bpf_access_type(acc);
if (access <= 0)
return -EINVAL;
insn[2].imm = access;
r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
if (r < 0)
log_error_errno(r, "Extending device control BPF program failed: %m");
return r;
}
int cgroup_bpf_whitelist_major(BPFProgram *prog, int type, int major, const char *acc) {
struct bpf_insn insn[] = {
BPF_JMP_IMM(BPF_JNE, BPF_REG_2, type, 5), /* compare device type */
BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), /* calculate access type */
BPF_ALU32_IMM(BPF_AND, BPF_REG_1, 0),
BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 2), /* compare access type */
BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 1), /* compare major */
BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
};
int r, access;
assert(prog);
assert(acc);
access = bpf_access_type(acc);
if (access <= 0)
return -EINVAL;
insn[2].imm = access;
r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
if (r < 0)
log_error_errno(r, "Extending device control BPF program failed: %m");
return r;
}
int cgroup_init_device_bpf(BPFProgram **ret, CGroupDevicePolicy policy, bool whitelist) {
struct bpf_insn pre_insn[] = {
/* load device type to r2 */
BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1,
offsetof(struct bpf_cgroup_dev_ctx, access_type)),
/* load access type to r3 */
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
offsetof(struct bpf_cgroup_dev_ctx, access_type)),
BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
/* load major number to r4 */
BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
offsetof(struct bpf_cgroup_dev_ctx, major)),
/* load minor number to r5 */
BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
offsetof(struct bpf_cgroup_dev_ctx, minor)),
};
_cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
int r;
assert(ret);
if (policy == CGROUP_AUTO && !whitelist)
return 0;
r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, &prog);
if (r < 0)
return log_error_errno(r, "Loading device control BPF program failed: %m");
if (policy == CGROUP_CLOSED || whitelist) {
r = bpf_program_add_instructions(prog, pre_insn, ELEMENTSOF(pre_insn));
if (r < 0)
return log_error_errno(r, "Extending device control BPF program failed: %m");
}
*ret = TAKE_PTR(prog);
return 0;
}
int cgroup_apply_device_bpf(Unit *u, BPFProgram *prog, CGroupDevicePolicy policy, bool whitelist) {
struct bpf_insn post_insn[] = {
/* return DENY */
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_JMP_A(1),
};
struct bpf_insn exit_insn[] = {
/* else return ALLOW */
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN()
};
_cleanup_free_ char *path = NULL;
uint32_t flags;
int r;
if (!prog) {
/* Remove existing program. */
u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
return 0;
}
if (policy != CGROUP_STRICT || whitelist) {
size_t off;
r = bpf_program_add_instructions(prog, post_insn, ELEMENTSOF(post_insn));
if (r < 0)
return log_error_errno(r, "Extending device control BPF program failed: %m");
/* Fixup PASS_JUMP_OFF jump offsets. */
for (off = 0; off < prog->n_instructions; off++) {
struct bpf_insn *ins = &prog->instructions[off];
if (ins->code == (BPF_JMP | BPF_JA) && ins->off == PASS_JUMP_OFF)
ins->off = prog->n_instructions - off - 1;
}
} else
/* Explicitly forbid everything. */
exit_insn[0].imm = 0;
r = bpf_program_add_instructions(prog, exit_insn, ELEMENTSOF(exit_insn));
if (r < 0)
return log_error_errno(r, "Extending device control BPF program failed: %m");
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
if (r < 0)
return log_error_errno(r, "Failed to determine cgroup path: %m");
flags = (u->type == UNIT_SLICE || unit_cgroup_delegate(u)) ? BPF_F_ALLOW_MULTI : 0;
/* Unref the old BPF program (which will implicitly detach it) right before attaching the new program. */
u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
r = bpf_program_cgroup_attach(prog, BPF_CGROUP_DEVICE, path, flags);
if (r < 0)
return log_error_errno(r, "Attaching device control BPF program to cgroup %s failed: %m", path);
/* Remember that this BPF program is installed now. */
u->bpf_device_control_installed = bpf_program_ref(prog);
return 0;
}
int bpf_devices_supported(void) {
struct bpf_insn trivial[] = {
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN()
};
_cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
static int supported = -1;
int r;
/* Checks whether BPF device controller is supported. For this, we check five things:
*
* a) whether we are privileged
* b) whether the unified hierarchy is being used
* c) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_DEVICE programs, which we require
*/
if (supported >= 0)
return supported;
if (geteuid() != 0) {
log_debug("Not enough privileges, BPF device control is not supported.");
return supported = 0;
}
r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
if (r < 0)
return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
if (r == 0) {
log_debug("Not running with unified cgroups, BPF device control is not supported.");
return supported = 0;
}
r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, &program);
if (r < 0) {
log_debug_errno(r, "Can't allocate CGROUP DEVICE BPF program, BPF device control is not supported: %m");
return supported = 0;
}
r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
if (r < 0) {
log_debug_errno(r, "Can't add trivial instructions to CGROUP DEVICE BPF program, BPF device control is not supported: %m");
return supported = 0;
}
r = bpf_program_load_kernel(program, NULL, 0);
if (r < 0) {
log_debug_errno(r, "Can't load kernel CGROUP DEVICE BPF program, BPF device control is not supported: %m");
return supported = 0;
}
return supported;
}

16
src/core/bpf-devices.h Normal file
View File

@ -0,0 +1,16 @@
/* SPDX-License-Identifier: LGPL-2.1+ */
#pragma once
#include <inttypes.h>
#include "unit.h"
struct BPFProgram;
int bpf_devices_supported(void);
int cgroup_bpf_whitelist_device(BPFProgram *p, int type, int major, int minor, const char *acc);
int cgroup_bpf_whitelist_major(BPFProgram *p, int type, int major, const char *acc);
int cgroup_init_device_bpf(BPFProgram **ret, CGroupDevicePolicy policy, bool whitelist);
int cgroup_apply_device_bpf(Unit *u, BPFProgram *p, CGroupDevicePolicy policy, bool whitelist);

View File

@ -7,6 +7,7 @@
#include "blockdev-util.h"
#include "bpf-firewall.h"
#include "btrfs-util.h"
#include "bpf-devices.h"
#include "bus-error.h"
#include "cgroup-util.h"
#include "cgroup.h"
@ -386,8 +387,7 @@ static int lookup_block_device(const char *p, dev_t *ret) {
return 0;
}
static int whitelist_device(const char *path, const char *node, const char *acc) {
char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
static int whitelist_device(BPFProgram *prog, const char *path, const char *node, const char *acc) {
struct stat st;
bool ignore_notfound;
int r;
@ -414,23 +414,34 @@ static int whitelist_device(const char *path, const char *node, const char *acc)
return -ENODEV;
}
sprintf(buf,
"%c %u:%u %s",
S_ISCHR(st.st_mode) ? 'c' : 'b',
major(st.st_rdev), minor(st.st_rdev),
acc);
if (cg_all_unified() > 0) {
if (!prog)
return 0;
r = cg_set_attribute("devices", path, "devices.allow", buf);
if (r < 0)
log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
"Failed to set devices.allow on %s: %m", path);
cgroup_bpf_whitelist_device(prog, S_ISCHR(st.st_mode) ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
major(st.st_rdev), minor(st.st_rdev), acc);
} else {
char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
sprintf(buf,
"%c %u:%u %s",
S_ISCHR(st.st_mode) ? 'c' : 'b',
major(st.st_rdev), minor(st.st_rdev),
acc);
r = cg_set_attribute("devices", path, "devices.allow", buf);
if (r < 0)
log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING,
r, "Failed to set devices.allow on %s: %m", path);
}
return r;
}
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
static int whitelist_major(BPFProgram *prog, const char *path, const char *name, char type, const char *acc) {
_cleanup_fclose_ FILE *f = NULL;
char line[LINE_MAX];
char *p, *w;
bool good = false;
int r;
@ -443,7 +454,6 @@ static int whitelist_major(const char *path, const char *name, char type, const
return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
FOREACH_LINE(line, f, goto fail) {
char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
unsigned maj;
truncate_nl(line);
@ -485,16 +495,27 @@ static int whitelist_major(const char *path, const char *name, char type, const
if (fnmatch(name, w, 0) != 0)
continue;
sprintf(buf,
"%c %u:* %s",
type,
maj,
acc);
if (cg_all_unified() > 0) {
if (!prog)
continue;
r = cg_set_attribute("devices", path, "devices.allow", buf);
if (r < 0)
log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
"Failed to set devices.allow on %s: %m", path);
cgroup_bpf_whitelist_major(prog,
type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
maj, acc);
} else {
char buf[2+DECIMAL_STR_MAX(unsigned)+3+4];
sprintf(buf,
"%c %u:* %s",
type,
maj,
acc);
r = cg_set_attribute("devices", path, "devices.allow", buf);
if (r < 0)
log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING,
r, "Failed to set devices.allow on %s: %m", path);
}
}
return 0;
@ -1019,20 +1040,27 @@ static void cgroup_context_apply(
}
}
if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) && !is_root) {
_cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
CGroupDeviceAllow *a;
/* Changing the devices list of a populated cgroup
* might result in EINVAL, hence ignore EINVAL
* here. */
if (cg_all_unified() > 0) {
r = cgroup_init_device_bpf(&prog, c->device_policy, c->device_allow);
if (r < 0)
log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
} else {
/* Changing the devices list of a populated cgroup
* might result in EINVAL, hence ignore EINVAL
* here. */
if (c->device_allow || c->device_policy != CGROUP_AUTO)
r = cg_set_attribute("devices", path, "devices.deny", "a");
else
r = cg_set_attribute("devices", path, "devices.allow", "a");
if (r < 0)
log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
"Failed to reset devices.list: %m");
if (c->device_allow || c->device_policy != CGROUP_AUTO)
r = cg_set_attribute("devices", path, "devices.deny", "a");
else
r = cg_set_attribute("devices", path, "devices.allow", "a");
if (r < 0)
log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
"Failed to reset devices.list: %m");
}
if (c->device_policy == CGROUP_CLOSED ||
(c->device_policy == CGROUP_AUTO && c->device_allow)) {
@ -1051,10 +1079,10 @@ static void cgroup_context_apply(
const char *x, *y;
NULSTR_FOREACH_PAIR(x, y, auto_devices)
whitelist_device(path, x, y);
whitelist_device(prog, path, x, y);
/* PTS (/dev/pts) devices may not be duplicated, but accessed */
whitelist_major(path, "pts", 'c', "rw");
whitelist_major(prog, path, "pts", 'c', "rw");
}
LIST_FOREACH(device_allow, a, c->device_allow) {
@ -1074,14 +1102,26 @@ static void cgroup_context_apply(
acc[k++] = 0;
if (path_startswith(a->path, "/dev/"))
whitelist_device(path, a->path, acc);
whitelist_device(prog, path, a->path, acc);
else if ((val = startswith(a->path, "block-")))
whitelist_major(path, val, 'b', acc);
whitelist_major(prog, path, val, 'b', acc);
else if ((val = startswith(a->path, "char-")))
whitelist_major(path, val, 'c', acc);
whitelist_major(prog, path, val, 'c', acc);
else
log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
}
r = cgroup_apply_device_bpf(u, prog, c->device_policy, c->device_allow);
if (r < 0) {
static bool warned = false;
log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
"Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
"Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
"(This warning is only shown for the first loaded unit using device ACL.)", u->id);
warned = true;
}
}
if (apply_mask & CGROUP_MASK_PIDS) {
@ -1151,7 +1191,7 @@ CGroupMask cgroup_context_get_mask(CGroupContext *c) {
if (c->device_allow ||
c->device_policy != CGROUP_AUTO)
mask |= CGROUP_MASK_DEVICES;
mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
if (c->tasks_accounting ||
c->tasks_max != CGROUP_LIMIT_MAX)
@ -1942,6 +1982,8 @@ void unit_prune_cgroup(Unit *u) {
u->cgroup_realized = false;
u->cgroup_realized_mask = 0;
u->cgroup_enabled_mask = 0;
u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
}
int unit_search_main_pid(Unit *u, pid_t *ret) {
@ -2212,6 +2254,11 @@ static int cg_bpf_mask_supported(CGroupMask *ret) {
if (r > 0)
mask |= CGROUP_MASK_BPF_FIREWALL;
/* BPF-based device access control */
r = bpf_devices_supported();
if (r > 0)
mask |= CGROUP_MASK_BPF_DEVICES;
*ret = mask;
return 0;
}

View File

@ -5,6 +5,8 @@ libcore_la_sources = '''
audit-fd.h
automount.c
automount.h
bpf-devices.c
bpf-devices.h
bpf-firewall.c
bpf-firewall.h
cgroup.c

View File

@ -666,6 +666,8 @@ void unit_free(Unit *u) {
bpf_program_unref(u->ip_bpf_egress);
bpf_program_unref(u->ip_bpf_egress_installed);
bpf_program_unref(u->bpf_device_control_installed);
condition_free_list(u->conditions);
condition_free_list(u->asserts);

View File

@ -257,6 +257,9 @@ typedef struct Unit {
CGroupMask cgroup_members_mask;
int cgroup_inotify_wd;
/* Device Controller BPF program */
BPFProgram *bpf_device_control_installed;
/* IP BPF Firewalling/accounting */
int ip_accounting_ingress_map_fd;
int ip_accounting_egress_map_fd;

View File

@ -100,7 +100,7 @@ static void test_cg_mask_to_string_one(CGroupMask mask, const char *t) {
static void test_cg_mask_to_string(void) {
test_cg_mask_to_string_one(0, NULL);
test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct io blkio memory devices pids bpf-firewall");
test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct io blkio memory devices pids bpf-firewall bpf-devices");
test_cg_mask_to_string_one(CGROUP_MASK_CPU, "cpu");
test_cg_mask_to_string_one(CGROUP_MASK_CPUACCT, "cpuacct");
test_cg_mask_to_string_one(CGROUP_MASK_IO, "io");