bpf: rework how we keep track and attach cgroup bpf programs

So, the kernel's management of cgroup/BPF programs is a bit misdesigned: if you attach a BPF program to a cgroup and close the fd for it it will stay pinned to the cgroup with no chance of ever removing it again (or otherwise getting ahold of it again), because the fd is used for selecting which BPF program to detach. The only way to get rid of the program again is to destroy the cgroup itself. This is particularly bad for root the cgroup (and in fact any other cgroup that we cannot realistically remove during runtime, such as /system.slice, /init.scope or /system.slice/dbus.service) as getting rid of the program only works by rebooting the system. To counter this let's closely keep track to which cgroup a BPF program is attached and let's implicitly detach the BPF program when we are about to close the BPF fd. This hence changes the bpf_program_cgroup_attach() function to track where we attached the program and changes bpf_program_cgroup_detach() to use this information. Moreover bpf_program_unref() will now implicitly call bpf_program_cgroup_detach(). In order to simplify things, bpf_program_cgroup_attach() will now implicitly invoke bpf_program_load_kernel() when necessary, simplifying the caller's side. Finally, this adds proper reference counting to BPF programs. This is useful for working with two BPF programs in parallel: the BPF program we are preparing for installation and the BPF program we so far installed, shortening the window when we detach the old one and reattach the new one.
2018-02-20 19:28:24 +01:00 · 2018-02-20 19:28:24 +01:00 · aa2b6f1d2b
parent e0ad39fc52
commit aa2b6f1d2b
5 changed files with 118 additions and 46 deletions
--- a/src/basic/bpf-program.c
+++ b/src/basic/bpf-program.c
@ -28,6 +28,7 @@
 #include "fd-util.h"
 #include "log.h"
 #include "missing.h"
+#include "path-util.h"
 #include "util.h"

 int bpf_program_new(uint32_t prog_type, BPFProgram **ret) {
@ -37,6 +38,7 @@ int bpf_program_new(uint32_t prog_type, BPFProgram **ret) {
        if (!p)
                return log_oom();

+        p->n_ref = 1;
        p->prog_type = prog_type;
        p->kernel_fd = -1;

@ -45,12 +47,39 @@ int bpf_program_new(uint32_t prog_type, BPFProgram **ret) {
        return 0;
 }

+BPFProgram *bpf_program_ref(BPFProgram *p) {
+        if (!p)
+                return NULL;
+
+        assert(p->n_ref > 0);
+        p->n_ref++;
+
+        return p;
+}
+
 BPFProgram *bpf_program_unref(BPFProgram *p) {
        if (!p)
                return NULL;

+        assert(p->n_ref > 0);
+        p->n_ref--;
+
+        if (p->n_ref > 0)
+                return NULL;
+
+        /* Unfortunately, the kernel currently doesn't implicitly detach BPF programs from their cgroups when the last
+         * fd to the BPF program is closed. This has nasty side-effects since this means that abnormally terminated
+         * programs that attached one of their BPF programs to a cgroup will leave this programs pinned for good with
+         * zero chance of recovery, until the cgroup is removed. This is particularly problematic if the cgroup in
+         * question is the root cgroup (or any other cgroup belonging to a service that cannot be restarted during
+         * operation, such as dbus), as the memory for the BPF program can only be reclaimed through a reboot. To
+         * counter this, we track closely to which cgroup a program was attached to and will detach it on our own
+         * whenever we close the BPF fd. */
+        (void) bpf_program_cgroup_detach(p);
+
        safe_close(p->kernel_fd);
        free(p->instructions);
+        free(p->attached_path);

        return mfree(p);
 }
@ -99,13 +128,47 @@ int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) {
 }

 int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) {
+        _cleanup_free_ char *copy = NULL;
        _cleanup_close_ int fd = -1;
        union bpf_attr attr;
+        int r;

        assert(p);
        assert(type >= 0);
        assert(path);

+        if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI))
+                return -EINVAL;
+
+        /* We need to track which cgroup the program is attached to, and we can only track one attachment, hence let's
+        * refuse this early. */
+        if (p->attached_path) {
+                if (!path_equal(p->attached_path, path))
+                        return -EBUSY;
+                if (p->attached_type != type)
+                        return -EBUSY;
+                if (p->attached_flags != flags)
+                        return -EBUSY;
+
+                /* Here's a shortcut: if we previously attached this program already, then we don't have to do so
+                 * again. Well, with one exception: if we are in BPF_F_ALLOW_OVERRIDE mode then someone else might have
+                 * replaced our program since the last time, hence let's reattach it again, just to be safe. In flags
+                 * == 0 mode this is not an issue since nobody else can replace our program in that case, and in flags
+                 * == BPF_F_ALLOW_MULTI mode any other's program would be installed in addition to ours hence ours
+                 * would remain in effect. */
+                if (flags != BPF_F_ALLOW_OVERRIDE)
+                        return 0;
+        }
+
+        /* Ensure we have a kernel object for this. */
+        r = bpf_program_load_kernel(p, NULL, 0);
+        if (r < 0)
+                return r;
+
+        copy = strdup(path);
+        if (!copy)
+                return -ENOMEM;
+
        fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
        if (fd < 0)
                return -errno;
@ -120,31 +183,43 @@ int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_
        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0)
                return -errno;

+        free_and_replace(p->attached_path, copy);
+        p->attached_type = type;
+        p->attached_flags = flags;
+
        return 0;
 }

-int bpf_program_cgroup_detach(BPFProgram *p, int type, const char *path) {
+int bpf_program_cgroup_detach(BPFProgram *p) {
        _cleanup_close_ int fd = -1;
-        union bpf_attr attr;

-        assert(type >= 0);
-        assert(path);
+        assert(p);

-        /* Note that 'p' may be NULL, in which case any program is detached. However, note that if BPF_F_ALLOW_MULTI is
-         * used 'p' is not optional. */
+        if (!p->attached_path)
+                return -EUNATCH;

-        fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
-        if (fd < 0)
-                return -errno;
+        fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+        if (fd < 0) {
+                if (errno != ENOENT)
+                        return -errno;

-        attr = (union bpf_attr) {
-                .attach_type = type,
-                .target_fd = fd,
-                .attach_bpf_fd = p ? p->kernel_fd : -1,
-        };
+                /* If the cgroup does not exist anymore, then we don't have to explicitly detach, it got detached
+                 * implicitly by the removal, hence don't complain */

-        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0)
-                return -errno;
+        } else {
+                union bpf_attr attr;
+
+                attr = (union bpf_attr) {
+                        .attach_type = p->attached_type,
+                        .target_fd = fd,
+                        .attach_bpf_fd = p->kernel_fd,
+                };
+
+                if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0)
+                        return -errno;
+        }
+
+        p->attached_path = mfree(p->attached_path);

        return 0;
 }
--- a/src/basic/bpf-program.h
+++ b/src/basic/bpf-program.h
@ -32,22 +32,29 @@
 typedef struct BPFProgram BPFProgram;

 struct BPFProgram {
+        unsigned n_ref;
+
        int kernel_fd;
        uint32_t prog_type;

        size_t n_instructions;
        size_t allocated;
        struct bpf_insn *instructions;
+
+        char *attached_path;
+        int attached_type;
+        uint32_t attached_flags;
 };

 int bpf_program_new(uint32_t prog_type, BPFProgram **ret);
 BPFProgram *bpf_program_unref(BPFProgram *p);
+BPFProgram *bpf_program_ref(BPFProgram *p);

 int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count);
 int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size);

 int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags);
-int bpf_program_cgroup_detach(BPFProgram *p, int type, const char *path);
+int bpf_program_cgroup_detach(BPFProgram *p);

 int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags);
 int bpf_map_update_element(int fd, const void *key, void *value);
--- a/src/core/bpf-firewall.c
+++ b/src/core/bpf-firewall.c
@ -520,10 +520,6 @@ int bpf_firewall_compile(Unit *u) {
        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

-        cc = unit_get_cgroup_context(u);
-        if (!cc)
-                return -EINVAL;
-
        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
@ -558,17 +554,18 @@ int bpf_firewall_compile(Unit *u) {
 int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
-        uint32_t flags;
        int r, supported;
+        uint32_t flags;

        assert(u);

-        if (!u->cgroup_path)
-                return -EINVAL;
-
        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
+        if (!u->cgroup_path)
+                return -EINVAL;
+        if (!u->cgroup_realized)
+                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
@ -589,34 +586,26 @@ int bpf_firewall_install(Unit *u) {
        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;

-        if (u->ip_bpf_egress) {
-                r = bpf_program_load_kernel(u->ip_bpf_egress, NULL, 0);
-                if (r < 0)
-                        return log_error_errno(r, "Kernel upload of egress BPF program failed: %m");
+        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
+         * minimize the time window when we don't account for IP traffic. */
+        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
+        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

+        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);
-        } else {
-                r = bpf_program_cgroup_detach(NULL, BPF_CGROUP_INET_EGRESS, path);
-                if (r < 0)
-                        return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
-                                              "Detaching egress BPF program from cgroup failed: %m");
+
+                /* Remember that this BPF program is installed now. */
+                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
-                r = bpf_program_load_kernel(u->ip_bpf_ingress, NULL, 0);
-                if (r < 0)
-                        return log_error_errno(r, "Kernel upload of ingress BPF program failed: %m");
-
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
-        } else {
-                r = bpf_program_cgroup_detach(NULL, BPF_CGROUP_INET_INGRESS, path);
-                if (r < 0)
-                        return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
-                                              "Detaching ingress BPF program from cgroup failed: %m");
+
+                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        return 0;
@ -665,7 +654,6 @@ int bpf_firewall_reset_accounting(int map_fd) {
        return bpf_map_update_element(map_fd, &key, &value);
 }

-
 int bpf_firewall_supported(void) {
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
--- a/src/core/unit.c
+++ b/src/core/unit.c
@ -659,7 +659,9 @@ void unit_free(Unit *u) {
        safe_close(u->ipv6_deny_map_fd);

        bpf_program_unref(u->ip_bpf_ingress);
+        bpf_program_unref(u->ip_bpf_ingress_installed);
        bpf_program_unref(u->ip_bpf_egress);
+        bpf_program_unref(u->ip_bpf_egress_installed);

        condition_free_list(u->conditions);
        condition_free_list(u->asserts);
--- a/src/core/unit.h
+++ b/src/core/unit.h
@ -287,8 +287,8 @@ struct Unit {
        int ipv4_deny_map_fd;
        int ipv6_deny_map_fd;

-        BPFProgram *ip_bpf_ingress;
-        BPFProgram *ip_bpf_egress;
+        BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed;
+        BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed;

        uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX];