firewall-util: add nftables backend

Idea is to use a static ruleset, added when the first attempt to
add a masquerade or dnat rule is made.

The alternative would be to add the ruleset when the init function is called.
The disadvantage is that this enables connection tracking and NAT in the kernel
(as the ruleset needs this to work), which comes with some overhead that might
not be needed (no nspawn usage and no IPMasquerade option set).

There is no additional dependency on the 'nft' userspace binary or other libraries.
sd-netlink's nfnetlink backend is used to modify the nftables ruleset.

The commit message/comments still use nft syntax since that is what
users will see when they use the nft tool to list the ruleset.

The added initial skeleton (added on first fw_add_masquerade/local_dnat
call) looks like this:

table ip io.systemd.nat {
        set masq_saddr {
                type ipv4_addr
                flags interval
                elements = { 192.168.59.160/28 }
        }

        map map_port_ipport {
                type inet_proto . inet_service : ipv4_addr . inet_service
                elements = { tcp . 2222 : 192.168.59.169 . 22 }
        }

        chain prerouting {
                type nat hook prerouting priority dstnat + 1; policy accept;
                fib daddr type local dnat ip addr . port to meta l4proto . th dport map @map_port_ipport
        }

        chain output {
                type nat hook output priority -99; policy accept;
                ip daddr != 127.0.0.0/8 oif "lo" dnat ip addr . port to meta l4proto . th dport map @map_port_ipport
        }

        chain postrouting {
                type nat hook postrouting priority srcnat + 1; policy accept;
                ip saddr @masq_saddr masquerade
        }
}

Next calls to fw_add_masquerade/add_local_dnat will then only add/delete the
element/mapping to masq_saddr and map_port_ipport, i.e. the ruleset doesn't
change -- only the set/map content does.

Running test-firewall-util with this backend gives following output
on a parallel 'nft monitor':

$ nft monitor
add table ip io.systemd.nat
add chain ip io.systemd.nat prerouting { type nat hook prerouting priority dstnat + 1; policy accept; }
add chain ip io.systemd.nat output { type nat hook output priority -99; policy accept; }
add chain ip io.systemd.nat postrouting { type nat hook postrouting priority srcnat + 1; policy accept; }
add set ip io.systemd.nat masq_saddr { type ipv4_addr; flags interval; }
add map ip io.systemd.nat map_port_ipport { type inet_proto . inet_service : ipv4_addr . inet_service; }
add rule ip io.systemd.nat prerouting fib daddr type local dnat ip addr . port to meta l4proto . th dport map @map_port_ipport
add rule ip io.systemd.nat output ip daddr != 127.0.0.0/8 fib daddr type local dnat ip addr . port to meta l4proto . th dport map @map_port_ipport
add rule ip io.systemd.nat postrouting ip saddr @masq_saddr masquerade
add element ip io.systemd.nat masq_saddr { 10.1.2.3 }
add element ip io.systemd.nat masq_saddr { 10.0.2.0/28 }
delete element ip io.systemd.nat masq_saddr { 10.0.2.0/28 }
delete element ip io.systemd.nat masq_saddr { 10.1.2.3 }
add element ip io.systemd.nat map_port_ipport { tcp . 4711 : 1.2.3.4 . 815 }
delete element ip io.systemd.nat map_port_ipport { tcp . 4711 : 1.2.3.4 . 815 }
add element ip io.systemd.nat map_port_ipport { tcp . 4711 : 1.2.3.5 . 815 }
delete element ip io.systemd.nat map_port_ipport { tcp . 4711 : 1.2.3.5 . 815 }
CTRL-C

Things not implemented/supported:
1. Change monitoring.  The kernel allows userspace to learn about changes
   made by other clients (using nfnetlink notifications). It would be
   possible to detect when e.g. someone removes the systemd nat table.
   This would need more work.  It's also not clear how to react to
   external changes -- it doesn't seem like a good idea to just auto-undo
   everything.
2. 'set masq_saddr' doesn't handle overlaps.
   Example:

   fw_add_masquerade(true, AF_INET, "10.0.0.0" , 16);
   fw_add_masquerade(true, AF_INET, "10.0.0.0" , 8); /* fails */

With the iptables backend the second call works, as it adds an
independent iptables rule.

With the nftables backend, the range 10.0.0.0-10.255.255.255 clashes with
the existing range of 10.0.0.0-10.0.255.255 so 2nd add gets rejected by the
kernel.

This will generate an error message from networkd ("Could not enable IP
masquerading: File exists").

To resolve this, the backend would need to keep track of the added elements
and perform range merging when overlaps are detected.

However, the add requests are done using the configured network on a
device, so no overlaps should occur in normal setups.

IPv6 support is added in a separate changeset.

Fixes: #13307
This commit is contained in:
Florian Westphal 2020-06-19 15:53:03 +02:00
parent 4df42cd99d
commit 715a70e721
6 changed files with 952 additions and 12 deletions

View File

@ -1770,11 +1770,6 @@ static int verify_arguments(void) {
if (arg_expose_ports && !arg_private_network)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
#if ! HAVE_LIBIPTC
if (arg_expose_ports)
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
#endif
if (arg_caps_ambient) {
if (arg_caps_ambient == (uint64_t)-1)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");

View File

@ -0,0 +1,903 @@
/* SPDX-License-Identifier: LGPL-2.1+ */
#include <arpa/inet.h>
#include <endian.h>
#include <errno.h>
#include <stddef.h>
#include <string.h>
#include <linux/netfilter/nf_tables.h>
#include <linux/netfilter/nf_nat.h>
#include <linux/netfilter_ipv4.h>
#include <netinet/ip.h>
#include "sd-netlink.h"
#include "alloc-util.h"
#include "firewall-util.h"
#include "firewall-util-private.h"
#include "in-addr-util.h"
#include "macro.h"
#include "socket-util.h"
#include "time-util.h"
#define NFT_SYSTEMD_DNAT_MAP_NAME "map_port_ipport"
#define NFT_SYSTEMD_TABLE_NAME "io.systemd.nat"
#define NFT_SYSTEMD_MASQ_SET_NAME "masq_saddr"
#define NFNL_DEFAULT_TIMEOUT_USECS (1ULL * USEC_PER_SEC)
#define UDP_DPORT_OFFSET 2
/* Sends all 'msgcount' messages in one go and then reads the replies for the
 * messages strictly between the first and the last one. The callers in this
 * file place the batch begin/end messages at those two positions.
 *
 * Returns a negative error if sending fails, otherwise the first error found
 * among the replies that were read, or 0 if there was none. */
static int nfnl_netlink_sendv(sd_netlink *nfnl,
                              sd_netlink_message *messages[],
                              size_t msgcount) {
        _cleanup_free_ uint32_t *serials = NULL;
        int first_error = 0, reply;
        size_t idx;

        assert(msgcount > 0);

        reply = sd_netlink_sendv(nfnl, messages, msgcount, &serials);
        if (reply < 0)
                return reply;

        /* Index 0 and index msgcount - 1 are not read here. */
        for (idx = 1; idx + 1 < msgcount; idx++) {
                /* If message is an error, this returns embedded errno */
                reply = sd_netlink_read(nfnl, serials[idx], NFNL_DEFAULT_TIMEOUT_USECS, NULL);
                if (reply < 0 && first_error == 0)
                        first_error = reply;
        }

        return first_error;
}
/* Starts a new expression named 'name' in message 'm': opens an NFTA_LIST_ELEM
 * array entry, appends the expression name and opens the NFTA_EXPR_DATA
 * container. Both levels are left open for the caller to fill and close.
 *
 * Returns the result of the first failing step, otherwise that of the last one. */
static int nfnl_add_open_expr_container(sd_netlink_message *m, const char *name) {
        int rc;

        rc = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
        if (rc >= 0)
                rc = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, name);
        if (rc >= 0)
                rc = sd_netlink_message_open_container_union(m, NFTA_EXPR_DATA, name);

        return rc;
}
/* Appends a complete "fib" expression to 'm': the given flags, the requested
 * result kind and the destination register, each converted with htobe32(). */
static int nfnl_add_expr_fib(sd_netlink_message *m, uint32_t nft_fib_flags,
                             enum nft_fib_result result,
                             enum nft_registers dreg) {
        int rc;

        rc = nfnl_add_open_expr_container(m, "fib");
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_FIB_FLAGS, htobe32(nft_fib_flags));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_FIB_RESULT, htobe32(result));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_FIB_DREG, htobe32(dreg));
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return rc;
}
/* Appends a complete "meta" expression to 'm' that stores the meta value
 * selected by 'key' in register 'dreg'. */
static int nfnl_add_expr_meta(sd_netlink_message *m, enum nft_meta_keys key,
                              enum nft_registers dreg) {
        int rc;

        rc = nfnl_add_open_expr_container(m, "meta");
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_META_KEY, htobe32(key));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_META_DREG, htobe32(dreg));
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return rc;
}
/* Appends a complete "payload" expression to 'm': load 'len' bytes found at
 * 'offset' relative to header base 'pb' into register 'dreg'. */
static int nfnl_add_expr_payload(sd_netlink_message *m, enum nft_payload_bases pb,
                                 uint32_t offset, uint32_t len, enum nft_registers dreg) {
        int rc;

        rc = nfnl_add_open_expr_container(m, "payload");
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_DREG, htobe32(dreg));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_BASE, htobe32(pb));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_OFFSET, htobe32(offset));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_LEN, htobe32(len));
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return rc;
}
/* Opens a "lookup" expression in 'm' and appends the part shared by set and
 * map lookups: the set name and the source register. The expression is left
 * open; the caller has to close NFTA_EXPR_DATA and NFTA_LIST_ELEM. */
static int nfnl_add_expr_lookup_set_data(sd_netlink_message *m, const char *set_name,
                                         enum nft_registers sreg) {
        int rc;

        rc = nfnl_add_open_expr_container(m, "lookup");
        if (rc >= 0)
                rc = sd_netlink_message_append_string(m, NFTA_LOOKUP_SET, set_name);
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_LOOKUP_SREG, htobe32(sreg));

        return rc;
}
/* Appends a complete "lookup" expression to 'm' that looks up the content of
 * register 'sreg' in the set called 'set_name'. */
static int nfnl_add_expr_lookup_set(sd_netlink_message *m, const char *set_name,
                                    enum nft_registers sreg) {
        int rc;

        rc = nfnl_add_expr_lookup_set_data(m, set_name, sreg);
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return rc;
}
/* Appends a complete "lookup" expression to 'm' for a map: the key is taken
 * from register 'sreg' and a destination register 'dreg' is passed along in
 * addition to what a plain set lookup carries. */
static int nfnl_add_expr_lookup_map(sd_netlink_message *m, const char *set_name,
                                    enum nft_registers sreg, enum nft_registers dreg) {
        int rc;

        rc = nfnl_add_expr_lookup_set_data(m, set_name, sreg);
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_LOOKUP_DREG, htobe32(dreg));
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return rc;
}
/* Wraps 'dlen' bytes of 'data' as an NFTA_DATA_VALUE attribute inside a
 * container of type 'attr' and appends that container to 'm'. */
static int nfnl_add_expr_data(sd_netlink_message *m, int attr, const void *data, uint32_t dlen) {
        int rc;

        rc = sd_netlink_message_open_container(m, attr);
        if (rc >= 0)
                rc = sd_netlink_message_append_data(m, NFTA_DATA_VALUE, data, dlen);
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* attr */

        return rc;
}
/* Appends the comparison operand of a "cmp" expression: 'dlen' bytes of
 * 'data' wrapped in an NFTA_CMP_DATA container. */
static int nfnl_add_expr_cmp_data(sd_netlink_message *m, const void *data, uint32_t dlen) {
        int rc = nfnl_add_expr_data(m, NFTA_CMP_DATA, data, dlen);

        return rc;
}
/* Appends a complete "cmp" expression to 'm': compare register 'sreg' against
 * the 'dlen' bytes at 'data' using operator 'cmp_op'. */
static int nfnl_add_expr_cmp(sd_netlink_message *m, enum nft_cmp_ops cmp_op,
                             enum nft_registers sreg, const void *data, uint32_t dlen) {
        int rc;

        rc = nfnl_add_open_expr_container(m, "cmp");
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_CMP_OP, htobe32(cmp_op));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_CMP_SREG, htobe32(sreg));
        if (rc >= 0)
                rc = nfnl_add_expr_cmp_data(m, data, dlen);
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return rc;
}
/* Appends a complete "bitwise" expression to 'm'. It carries the source and
 * destination registers, the operand length 'len', and two operands of 'len'
 * bytes each: 'and' as NFTA_BITWISE_MASK and 'xor' as NFTA_BITWISE_XOR. */
static int nfnl_add_expr_bitwise(sd_netlink_message *m,
                                 enum nft_registers sreg,
                                 enum nft_registers dreg,
                                 const void *and,
                                 const void *xor, uint32_t len) {
        int rc;

        rc = nfnl_add_open_expr_container(m, "bitwise");
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_BITWISE_SREG, htobe32(sreg));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_BITWISE_DREG, htobe32(dreg));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_BITWISE_LEN, htobe32(len));
        if (rc >= 0)
                rc = nfnl_add_expr_data(m, NFTA_BITWISE_MASK, and, len);
        if (rc >= 0)
                rc = nfnl_add_expr_data(m, NFTA_BITWISE_XOR, xor, len);
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return rc;
}
/* Appends a complete "nat" expression of type NFT_NAT_DNAT to 'm'. 'areg' is
 * passed as NFTA_NAT_REG_ADDR_MIN and 'preg' as NFTA_NAT_REG_PROTO_MIN, i.e.
 * the registers holding the new address and port. */
static int nfnl_add_expr_dnat(sd_netlink_message *m,
                              int family,
                              enum nft_registers areg,
                              enum nft_registers preg) {
        int rc;

        rc = nfnl_add_open_expr_container(m, "nat");
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_NAT_TYPE, htobe32(NFT_NAT_DNAT));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_NAT_FAMILY, htobe32(family));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_NAT_REG_ADDR_MIN, htobe32(areg));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_NAT_REG_PROTO_MIN, htobe32(preg));
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return rc;
}
/* Appends a "masq" expression to 'm'. Only the expression name is written;
 * no NFTA_EXPR_DATA container is added for it. */
static int nfnl_add_expr_masq(sd_netlink_message *m) {
        int rc;

        rc = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
        if (rc >= 0)
                rc = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, "masq");
        if (rc >= 0)
                rc = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return rc;
}
/* -t nat -A POSTROUTING -p protocol -s source/pflen -o out_interface -d destination/pflen -j MASQUERADE
 *
 * Builds the rule 'ip saddr @masq_saddr masquerade' for 'chain' in the systemd
 * NAT table and hands the finished message to the caller through 'ret'.
 * Returns 0 on success, a negative error otherwise ('ret' is then untouched). */
static int sd_nfnl_message_new_masq_rule(sd_netlink *nfnl, sd_netlink_message **ret, int family,
                                         const char *chain) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *rule = NULL;
        int rc;

        rc = sd_nfnl_nft_message_new_rule(nfnl, &rule, family, NFT_SYSTEMD_TABLE_NAME, chain);
        if (rc >= 0)
                rc = sd_netlink_message_open_container(rule, NFTA_RULE_EXPRESSIONS);

        /* 1st statement: ip saddr @masq_saddr. Place iph->saddr in reg1. */
        if (rc >= 0)
                rc = nfnl_add_expr_payload(rule, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, saddr),
                                           sizeof(uint32_t), NFT_REG32_01);

        /* 1st statement: use reg1 content to make lookup in @masq_saddr set. */
        if (rc >= 0)
                rc = nfnl_add_expr_lookup_set(rule, NFT_SYSTEMD_MASQ_SET_NAME, NFT_REG32_01);

        /* 2nd statement: masq. Only executed by kernel if the previous lookup was successful. */
        if (rc >= 0)
                rc = nfnl_add_expr_masq(rule);

        if (rc >= 0)
                rc = sd_netlink_message_close_container(rule); /* NFTA_RULE_EXPRESSIONS */
        if (rc < 0)
                return rc;

        *ret = TAKE_PTR(rule);
        return 0;
}
/* -t nat -A PREROUTING -p protocol --dport local_port -i in_interface -s source/pflen -d destination/pflen -j DNAT --to-destination remote_addr:remote_port */
/* Builds the DNAT rule for 'chain' in the systemd NAT table:
 *
 *   fib daddr type local dnat ip addr . port to meta l4proto . th dport map @map_port_ipport
 *
 * On success the finished message is handed over through 'ret' and 0 is
 * returned; on failure a negative error is returned and 'ret' is not written.
 */
static int sd_nfnl_message_new_dnat_rule_pre(sd_netlink *nfnl, sd_netlink_message **ret, int family,
const char *chain) {
_cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
enum nft_registers proto_reg;
uint32_t local = RTN_LOCAL;
int r;
r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
if (r < 0)
return r;
r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
if (r < 0)
return r;
/* 1st statement: fib daddr type local. The address type ends up in reg1. */
r = nfnl_add_expr_fib(m, NFTA_FIB_F_DADDR, NFT_FIB_RESULT_ADDRTYPE, NFT_REG32_01);
if (r < 0)
return r;
/* 1st statement (cont.): compare reg1 with RTN_LOCAL */
r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &local, sizeof(local));
if (r < 0)
return r;
/* 2nd statement: build the map key. First part: meta l4proto goes to reg1. */
r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
if (r < 0)
return r;
/* 2nd statement (cont.): second part of the key, the 16 bit destination port
 * read from the transport header at UDP_DPORT_OFFSET, goes to reg2. */
r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
sizeof(uint16_t), NFT_REG32_02);
if (r < 0)
return r;
/* 3rd statement: lookup 'l4proto . dport', e.g. 'tcp . 22' as key and
 * store address and port for the dnat mapping in REG1/REG2.
 */
r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
if (r < 0)
return r;
/* 4th statement: dnat to the address in reg1 and the port in reg2. */
proto_reg = NFT_REG32_02;
r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);
if (r < 0)
return r;
r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
if (r < 0)
return r;
*ret = TAKE_PTR(m);
return 0;
}
/* Builds the DNAT rule for locally generated traffic for 'chain' in the
 * systemd NAT table:
 *
 *   ip daddr != 127.0.0.0/8 oif "lo" dnat ip addr . port to meta l4proto . th dport map @map_port_ipport
 *
 * On success the finished message is handed over through 'ret' and 0 is
 * returned; on failure a negative error is returned and 'ret' is not written.
 */
static int sd_nfnl_message_new_dnat_rule_out(sd_netlink *nfnl, sd_netlink_message **ret,
int family, const char *chain) {
/* 'zero' is the xor operand of the bitwise expression, 'one' is the ifindex
 * the output interface is compared with. */
static const uint32_t zero, one = 1;
/* 127.0.0.0 and the /8 netmask, both converted with htobe32() */
uint32_t lonet = htobe32(0x7F000000), lomask = htobe32(0xff000000);
_cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
enum nft_registers proto_reg;
int r;
r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
if (r < 0)
return r;
r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
if (r < 0)
return r;
/* 1st statement: exclude 127.0.0.0/8: ip daddr != 127.0.0.0/8. Place iph->daddr in reg1. */
r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, daddr),
sizeof(uint32_t), NFT_REG32_01);
if (r < 0)
return r;
/* 1st statement (cont.): bitops/prefix, reg1 = (reg1 & lomask) ^ 0 */
r = nfnl_add_expr_bitwise(m, NFT_REG32_01, NFT_REG32_01, &lomask, &zero, sizeof(lomask));
if (r < 0)
return r;
/* 1st statement (cont.): compare reg1 with 127/8 */
r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &lonet, sizeof(lonet));
if (r < 0)
return r;
/* 2nd statement: meta oif lo */
r = nfnl_add_expr_meta(m, NFT_META_OIF, NFT_REG32_01);
if (r < 0)
return r;
/* 2nd statement (cont.): compare to lo ifindex (1) */
r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &one, sizeof(one));
if (r < 0)
return r;
/* 3rd statement: meta l4proto . th dport dnat ip . port to map @map_port_ipport.
 * Start with meta l4proto in reg1. */
r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
if (r < 0)
return r;
/* 3rd statement (cont): store the port number in reg2 */
r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
sizeof(uint16_t), NFT_REG32_02);
if (r < 0)
return r;
/* 3rd statement (cont): use reg1 and reg2 and retrieve
 * the new destination ip and port number.
 *
 * reg1 and reg2 are clobbered and will then contain the new
 * address/port number.
 */
r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
if (r < 0)
return r;
/* 4th statement: dnat connection to address/port retrieved by the
 * preceding expression. */
proto_reg = NFT_REG32_02;
r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);
if (r < 0)
return r;
r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
if (r < 0)
return r;
*ret = TAKE_PTR(m);
return 0;
}
/* Creates the message that adds set 'set_name' to the systemd NAT table.
 * NFTA_SET_FLAGS is only appended when 'flags' is non-zero; 'type' is always
 * appended as NFTA_SET_KEY_TYPE. The message is handed over through 'ret' on
 * success; on failure a negative error is returned and 'ret' is not written. */
static int nft_new_set(struct sd_netlink *nfnl,
                       sd_netlink_message **ret,
                       int family, const char *set_name,
                       uint32_t set_id,
                       uint32_t flags, uint32_t type, uint32_t klen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *set = NULL;
        int rc;

        rc = sd_nfnl_nft_message_new_set(nfnl, &set, family, NFT_SYSTEMD_TABLE_NAME, set_name, set_id, klen);
        if (rc >= 0 && flags != 0)
                rc = sd_netlink_message_append_u32(set, NFTA_SET_FLAGS, htobe32(flags));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(set, NFTA_SET_KEY_TYPE, htobe32(type));
        if (rc < 0)
                return rc;

        *ret = TAKE_PTR(set);
        return rc;
}
/* Creates the message that adds map 'set_name' to the systemd NAT table. A map
 * is built like a set with NFT_SET_MAP added to the flags, plus the type and
 * length of the data each key maps to. Returns 0 and hands the message over
 * through 'ret' on success, a negative error otherwise. */
static int nft_new_map(struct sd_netlink *nfnl,
                       sd_netlink_message **ret,
                       int family, const char *set_name, uint32_t set_id,
                       uint32_t flags, uint32_t type, uint32_t klen, uint32_t dtype, uint32_t dlen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *map = NULL;
        int rc;

        rc = nft_new_set(nfnl, &map, family, set_name, set_id, flags | NFT_SET_MAP, type, klen);
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(map, NFTA_SET_DATA_TYPE, htobe32(dtype));
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(map, NFTA_SET_DATA_LEN, htobe32(dlen));
        if (rc < 0)
                return rc;

        *ret = TAKE_PTR(map);
        return 0;
}
/* Creates the message that adds one element (key plus optional data) to
 * 'set_name' in the systemd NAT table. Returns 0 and hands the message over
 * through 'ret' on success, a negative error otherwise. */
static int nft_add_element(sd_netlink *nfnl, sd_netlink_message **ret,
                           int family, const char *set_name,
                           const void *key, uint32_t klen,
                           const void *data, uint32_t dlen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *elems = NULL;
        int rc;

        /*
         * A nicer API would offer:
         *
         * 1) an init function that adds the main ruleset skeleton
         * 2) a function that fills the sets with all known address/port pairs to s/dnat for
         * 3) a function that removes address/port pairs again.
         *
         * For now the existing API is kept, which follows an
         * 'add/delete a rule' paradigm.
         *
         * That paradigm is mirrored here: every element is added to the
         * set on its own.
         */
        rc = sd_nfnl_nft_message_new_setelems_begin(nfnl, &elems, family, NFT_SYSTEMD_TABLE_NAME, set_name);
        if (rc >= 0)
                rc = sd_nfnl_nft_message_add_setelem(elems, 0, key, klen, data, dlen);

        /* could theoretically append more set elements to add here */
        if (rc >= 0)
                rc = sd_nfnl_nft_message_add_setelem_end(elems);
        if (rc < 0)
                return rc;

        *ret = TAKE_PTR(elems);
        return 0;
}
/* Creates the message that deletes one element (key plus optional data) from
 * 'set_name' in the systemd NAT table. Returns 0 and hands the message over
 * through 'ret' on success, a negative error otherwise. */
static int nft_del_element(sd_netlink *nfnl,
                           sd_netlink_message **ret, int family, const char *set_name,
                           const void *key, uint32_t klen,
                           const void *data, uint32_t dlen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *elems = NULL;
        int rc;

        rc = sd_nfnl_nft_message_del_setelems_begin(nfnl, &elems, family, NFT_SYSTEMD_TABLE_NAME, set_name);
        if (rc >= 0)
                rc = sd_nfnl_nft_message_add_setelem(elems, 0, key, klen, data, dlen);
        if (rc >= 0)
                rc = sd_nfnl_nft_message_add_setelem_end(elems);
        if (rc < 0)
                return rc;

        *ret = TAKE_PTR(elems);
        return 0;
}
/* Type identifiers understood by the 'nft' userspace tool. They are stored
 * along with a set/map so that 'nft' can format the contents properly when
 * someone inspects them.
 *
 * The values cannot be changed, they are part of the nft tool type identifier ABI.
 */
#define TYPE_BITS 6

enum nft_key_types {
        TYPE_IPADDR = 7,
        TYPE_IP6ADDR = 8,
        TYPE_INET_PROTOCOL = 12,
        TYPE_INET_SERVICE = 13,
};

/* Packs two type identifiers into one value: 'a' is shifted left by
 * TYPE_BITS and 'b' is or-ed into the low bits. */
static uint32_t concat_types2(enum nft_key_types a, enum nft_key_types b) {
        return ((uint32_t) a << TYPE_BITS) | (uint32_t) b;
}
/* enough space to hold netlink messages for table skeleton */
#define NFT_INIT_MSGS 16
/* Builds the whole table skeleton for 'family' as one batch and sends it.
 *
 * The batch holds, in this order: batch begin, the table, the three base
 * chains (prerouting, output, postrouting), the masquerade set, the dnat map,
 * the two dnat rules, the masquerade rule and batch end.
 *
 * Returns 0 on success and also when sending reports -EEXIST; any other
 * failure is returned as a negative error. All messages created so far are
 * released before returning.
 */
static int fw_nftables_init_family(sd_netlink *nfnl, int family) {
sd_netlink_message *batch[NFT_INIT_MSGS] = {};
/* key type and key size of an IPv4 address */
size_t ip_type_size = sizeof(uint32_t);
int ip_type = TYPE_IPADDR, r;
/* msgcnt is the number of valid entries in batch[] and the next free slot */
size_t msgcnt = 0, i;
uint32_t set_id = 0;
r = sd_nfnl_message_batch_begin(nfnl, &batch[msgcnt]);
if (r < 0)
goto out_unref;
msgcnt++;
assert(msgcnt < NFT_INIT_MSGS);
/* Set F_EXCL so table add fails if the table already exists. */
r = sd_nfnl_nft_message_new_table(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME, NLM_F_EXCL | NLM_F_ACK);
if (r < 0)
goto out_unref;
msgcnt++;
assert(msgcnt < NFT_INIT_MSGS);
/* chain prerouting: type nat hook prerouting, priority NF_IP_PRI_NAT_DST + 1 */
r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
"prerouting", "nat",
NF_INET_PRE_ROUTING, NF_IP_PRI_NAT_DST + 1);
if (r < 0)
goto out_unref;
msgcnt++;
assert(msgcnt < NFT_INIT_MSGS);
/* chain output: type nat hook output, priority NF_IP_PRI_NAT_DST + 1 */
r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
"output", "nat",
NF_INET_LOCAL_OUT, NF_IP_PRI_NAT_DST + 1);
if (r < 0)
goto out_unref;
msgcnt++;
assert(msgcnt < NFT_INIT_MSGS);
/* chain postrouting: type nat hook postrouting, priority NF_IP_PRI_NAT_SRC + 1 */
r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
"postrouting", "nat",
NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC + 1);
if (r < 0)
goto out_unref;
msgcnt++;
assert(msgcnt < NFT_INIT_MSGS);
/* set to store ip address ranges we should masquerade for */
r = nft_new_set(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_MASQ_SET_NAME, ++set_id, NFT_SET_INTERVAL, ip_type, ip_type_size);
if (r < 0)
goto out_unref;
/*
 * map to store ip address:port pair to dnat to. elements in concatenation
 * are rounded up to 4 bytes.
 *
 * Example: ip protocol . tcp daddr is sizeof(uint32_t) + sizeof(uint32_t), not
 * sizeof(uint8_t) + sizeof(uint16_t).
 */
msgcnt++;
assert(msgcnt < NFT_INIT_MSGS);
r = nft_new_map(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_DNAT_MAP_NAME, ++set_id, 0,
concat_types2(TYPE_INET_PROTOCOL, TYPE_INET_SERVICE), sizeof(uint32_t) * 2,
concat_types2(ip_type, TYPE_INET_SERVICE), ip_type_size + sizeof(uint32_t));
if (r < 0)
goto out_unref;
msgcnt++;
assert(msgcnt < NFT_INIT_MSGS);
r = sd_nfnl_message_new_dnat_rule_pre(nfnl, &batch[msgcnt], family, "prerouting");
if (r < 0)
goto out_unref;
msgcnt++;
assert(msgcnt < NFT_INIT_MSGS);
r = sd_nfnl_message_new_dnat_rule_out(nfnl, &batch[msgcnt], family, "output");
if (r < 0)
goto out_unref;
msgcnt++;
/* NOTE(review): unlike the other steps there is no bounds assert before this
 * slot is used; the message count here is still well below NFT_INIT_MSGS. */
r = sd_nfnl_message_new_masq_rule(nfnl, &batch[msgcnt], family, "postrouting");
if (r < 0)
goto out_unref;
msgcnt++;
assert(msgcnt < NFT_INIT_MSGS);
r = sd_nfnl_message_batch_end(nfnl, &batch[msgcnt]);
if (r < 0)
goto out_unref;
msgcnt++;
assert(msgcnt <= NFT_INIT_MSGS);
r = nfnl_netlink_sendv(nfnl, batch, msgcnt);
/* -EEXIST is not treated as a failure (the table is added with NLM_F_EXCL above) */
if (r == -EEXIST)
r = 0;
out_unref:
/* only the first msgcnt slots hold messages */
for (i = 0; i < msgcnt; i++)
sd_netlink_message_unref(batch[i]);
return r;
}
/* Opens an nfnetlink socket and installs the table skeleton for AF_INET.
 * Only when both steps succeed is the socket stored in ctx->nfnl and 0
 * returned; otherwise the negative error is returned and the socket is
 * released again. */
int fw_nftables_init(FirewallContext *ctx) {
        _cleanup_(sd_netlink_unrefp) sd_netlink *sock = NULL;
        int rc;

        rc = sd_nfnl_socket_open(&sock);
        if (rc >= 0)
                rc = fw_nftables_init_family(sock, AF_INET);
        if (rc < 0)
                return rc;

        ctx->nfnl = TAKE_PTR(sock);
        return 0;
}
/* Drops the context's reference to the nfnetlink socket and stores whatever
 * sd_netlink_unref() returns back into ctx->nfnl. */
void fw_nftables_exit(FirewallContext *ctx) {
        sd_netlink *sock = ctx->nfnl;

        ctx->nfnl = sd_netlink_unref(sock);
}
/* Appends the two set elements that describe the IPv4 range source/prefixlen
 * to 'm': element 0 is the first address of the range, element 1 is the first
 * address after the range and is flagged NFT_SET_ELEM_INTERVAL_END. If the
 * range reaches the end of the address space the second address wraps to 0.
 *
 * Returns 0 on success, a negative error otherwise. */
static int nft_message_add_setelem_iprange(sd_netlink_message *m,
                                           const union in_addr_union *source,
                                           unsigned int prefixlen) {
        uint32_t netmask, first, first_host, stop;
        unsigned int host_bits;
        int rc;

        assert(prefixlen <= 32);
        host_bits = 32 - prefixlen;

        /* clear the host part of the address; 'first' stays in network byte order */
        netmask = htobe32(~((1U << host_bits) - 1U));
        first = source->in.s_addr & netmask;

        rc = sd_nfnl_nft_message_add_setelem(m, 0, &first, sizeof(first), NULL, 0);
        if (rc >= 0)
                rc = sd_nfnl_nft_message_add_setelem_end(m);
        if (rc < 0)
                return rc;

        /* the arithmetic is done in host byte order, overflow means wrap-around */
        first_host = be32toh(first);
        stop = first_host + (1U << host_bits);
        if (stop < first_host)
                stop = 0U;
        stop = htobe32(stop);

        rc = sd_nfnl_nft_message_add_setelem(m, 1, &stop, sizeof(stop), NULL, 0);
        if (rc >= 0)
                rc = sd_netlink_message_append_u32(m, NFTA_SET_ELEM_FLAGS, htobe32(NFT_SET_ELEM_INTERVAL_END));
        if (rc >= 0)
                rc = sd_nfnl_nft_message_add_setelem_end(m);

        return rc < 0 ? rc : 0;
}
#define NFT_MASQ_MSGS 3
/* Adds (add == true) or deletes (add == false) the range source/source_prefixlen
 * to/from the masquerade set of the systemd NAT table.
 *
 * The request is sent as one batch: batch begin, the set element message,
 * batch end. Every message created here is released before returning, on
 * success as well as on every failure path.
 *
 * Returns 0 on success, -EINVAL if 'source' is NULL or 'source_prefixlen' is 0,
 * or the negative error of the step that failed.
 */
int fw_nftables_add_masquerade(
                FirewallContext *ctx,
                bool add,
                int af,
                const union in_addr_union *source,
                unsigned int source_prefixlen) {
        sd_netlink_message *transaction[NFT_MASQ_MSGS] = {};
        size_t tsize;
        int r;

        if (!source || source_prefixlen == 0)
                return -EINVAL;

        r = sd_nfnl_message_batch_begin(ctx->nfnl, &transaction[0]);
        if (r < 0)
                return r;
        tsize = 1;

        if (add)
                r = sd_nfnl_nft_message_new_setelems_begin(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME);
        else
                r = sd_nfnl_nft_message_del_setelems_begin(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME);
        if (r < 0)
                goto out_unref;

        /* The element message exists now. Count it right away so that the
         * cleanup below also releases it if filling it in fails. */
        tsize++;

        r = nft_message_add_setelem_iprange(transaction[tsize - 1], source, source_prefixlen);
        if (r < 0)
                goto out_unref;

        assert(tsize < NFT_MASQ_MSGS);
        r = sd_nfnl_message_batch_end(ctx->nfnl, &transaction[tsize]);
        if (r < 0)
                goto out_unref; /* do not return directly, the earlier messages must be released */

        ++tsize;
        r = nfnl_netlink_sendv(ctx->nfnl, transaction, tsize);

out_unref:
        while (tsize > 0)
                sd_netlink_message_unref(transaction[--tsize]);
        return r < 0 ? r : 0;
}
#define NFT_DNAT_MSGS 4
/* Adds (add == true) or deletes (add == false) the mapping
 * 'protocol . local_port : remote . remote_port' in the dnat map of the
 * systemd NAT table.
 *
 * When adding with a 'previous_remote' whose address differs from 'remote',
 * the mapping to the previous address is deleted in the same batch. The batch
 * is: batch begin, optional delete, add or delete, batch end. Every message
 * created here is released before returning.
 *
 * Returns 0 on success, otherwise:
 *   -EAFNOSUPPORT     'af' is not AF_INET
 *   -EPROTONOSUPPORT  'protocol' is neither IPPROTO_TCP nor IPPROTO_UDP
 *   -EINVAL           'local_port' or 'remote_port' is 0
 *   -EOPNOTSUPP       'remote' is NULL
 *   or the negative error of the step that failed.
 */
int fw_nftables_add_local_dnat(
                FirewallContext *ctx,
                bool add,
                int af,
                int protocol,
                uint16_t local_port,
                const union in_addr_union *remote,
                uint16_t remote_port,
                const union in_addr_union *previous_remote) {
        uint32_t data[2], key[2];
        sd_netlink_message *transaction[NFT_DNAT_MSGS] = {};
        size_t tsize;
        int r;

        assert(add || !previous_remote);

        if (af != AF_INET)
                return -EAFNOSUPPORT;

        if (!IN_SET(protocol, IPPROTO_TCP, IPPROTO_UDP))
                return -EPROTONOSUPPORT;

        if (local_port <= 0)
                return -EINVAL;

        /* map key: l4 protocol . local port, one 32 bit slot each */
        key[0] = protocol;
        key[1] = htobe16(local_port);

        if (!remote)
                return -EOPNOTSUPP;

        if (remote_port <= 0)
                return -EINVAL;

        /* map data: address . port; data[0] is filled in below */
        data[1] = htobe16(remote_port);

        r = sd_nfnl_message_batch_begin(ctx->nfnl, &transaction[0]);
        if (r < 0)
                return r;

        tsize = 1;
        /* If a previous remote is set, remove its entry */
        if (add && previous_remote && previous_remote->in.s_addr != remote->in.s_addr) {
                data[0] = previous_remote->in.s_addr;

                r = nft_del_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, sizeof(data));
                if (r < 0)
                        goto out_unref;

                tsize++;
        }

        data[0] = remote->in.s_addr;

        assert(tsize < NFT_DNAT_MSGS);
        if (add)
                r = nft_add_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, sizeof(data));
        else
                r = nft_del_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, sizeof(data));
        /* Without this check a failure would leave an empty slot that is still
         * counted and handed to the send function. */
        if (r < 0)
                goto out_unref;

        tsize++;
        assert(tsize < NFT_DNAT_MSGS);

        r = sd_nfnl_message_batch_end(ctx->nfnl, &transaction[tsize]);
        if (r < 0)
                goto out_unref;

        tsize++;
        assert(tsize <= NFT_DNAT_MSGS);
        r = nfnl_netlink_sendv(ctx->nfnl, transaction, tsize);

out_unref:
        while (tsize > 0)
                sd_netlink_message_unref(transaction[--tsize]);
        return r < 0 ? r : 0;
}

View File

@ -5,18 +5,41 @@
#include <stdint.h>
#include "in-addr-util.h"
#include "sd-netlink.h"
/* Firewall backend in use for a FirewallContext. */
enum FirewallBackend {
/* no backend selected; the fw_add_* entry points probe for one when they see this value */
FW_BACKEND_NONE,
#if HAVE_LIBIPTC
/* iptables backend, only present when built with libiptc */
FW_BACKEND_IPTABLES,
#endif
/* nftables backend, talks to the kernel through an nfnetlink socket */
FW_BACKEND_NFTABLES,
};
/* State shared by the firewall helper functions. */
struct FirewallContext {
/* backend the requests are dispatched to */
enum FirewallBackend firewall_backend;
/* nfnetlink socket of the nftables backend; stored by fw_nftables_init()
 * and released by fw_nftables_exit() */
sd_netlink *nfnl;
};
/* Opens the nfnetlink socket, installs the nftables table skeleton for AF_INET
 * and stores the socket in ctx->nfnl. Returns 0 on success, a negative error
 * otherwise. */
int fw_nftables_init(FirewallContext *ctx);
/* Releases the nfnetlink socket reference held in ctx->nfnl. */
void fw_nftables_exit(FirewallContext *ctx);
/* Adds (add == true) or deletes the range source/source_prefixlen to/from the
 * masquerade set. Returns 0 on success, -EINVAL if source is NULL or
 * source_prefixlen is 0, or another negative error. */
int fw_nftables_add_masquerade(
FirewallContext *ctx,
bool add,
int af,
const union in_addr_union *source,
unsigned source_prefixlen);
/* Adds (add == true) or deletes the dnat mapping from protocol/local_port to
 * remote/remote_port. Only AF_INET with IPPROTO_TCP or IPPROTO_UDP is
 * accepted. previous_remote may only be given when adding; if its address
 * differs from remote, its mapping is removed in the same request.
 * Returns 0 on success, a negative error otherwise. */
int fw_nftables_add_local_dnat(
FirewallContext *ctx,
bool add,
int af,
int protocol,
uint16_t local_port,
const union in_addr_union *remote,
uint16_t remote_port,
const union in_addr_union *previous_remote);
#if HAVE_LIBIPTC
int fw_iptables_add_masquerade(

View File

@ -8,7 +8,9 @@
#include "firewall-util.h"
#include "firewall-util-private.h"
static enum FirewallBackend firewall_backend_probe(void) {
static enum FirewallBackend firewall_backend_probe(FirewallContext *ctx) {
if (fw_nftables_init(ctx) == 0)
return FW_BACKEND_NFTABLES;
#if HAVE_LIBIPTC
return FW_BACKEND_IPTABLES;
#else
@ -23,11 +25,24 @@ int fw_ctx_new(FirewallContext **ret) {
if (!ctx)
return -ENOMEM;
*ret = TAKE_PTR(ctx);
return 0;
/* could probe here. However, this means that we will load
* iptable_nat or nf_tables, both will enable connection tracking.
*
* Alternative would be to probe here but only call
* fw_ctx_new when nspawn/networkd know they will call
* fw_add_masquerade/local_dnat later anyway.
*/
*ret = TAKE_PTR(ctx);
return 0;
}
FirewallContext *fw_ctx_free(FirewallContext *ctx) {
if (!ctx)
return NULL;
if (ctx->firewall_backend == FW_BACKEND_NFTABLES)
fw_nftables_exit(ctx);
return mfree(ctx);
}
@ -48,7 +63,7 @@ int fw_add_masquerade(
ctx = *fw_ctx;
if (ctx->firewall_backend == FW_BACKEND_NONE)
ctx->firewall_backend = firewall_backend_probe();
ctx->firewall_backend = firewall_backend_probe(ctx);
switch (ctx->firewall_backend) {
case FW_BACKEND_NONE:
@ -57,6 +72,8 @@ int fw_add_masquerade(
case FW_BACKEND_IPTABLES:
return fw_iptables_add_masquerade(add, af, source, source_prefixlen);
#endif
case FW_BACKEND_NFTABLES:
return fw_nftables_add_masquerade(ctx, add, af, source, source_prefixlen);
}
return -EOPNOTSUPP;
@ -81,11 +98,13 @@ int fw_add_local_dnat(
ctx = *fw_ctx;
if (ctx->firewall_backend == FW_BACKEND_NONE)
ctx->firewall_backend = firewall_backend_probe();
ctx->firewall_backend = firewall_backend_probe(ctx);
switch (ctx->firewall_backend) {
case FW_BACKEND_NONE:
return -EOPNOTSUPP;
case FW_BACKEND_NFTABLES:
return fw_nftables_add_local_dnat(ctx, add, af, protocol, local_port, remote, remote_port, previous_remote);
#if HAVE_LIBIPTC
case FW_BACKEND_IPTABLES:
return fw_iptables_add_local_dnat(add, af, protocol, local_port, remote, remote_port, previous_remote);

View File

@ -102,6 +102,7 @@ shared_sources = files('''
fileio-label.c
fileio-label.h
firewall-util.c
firewall-util-nft.c
firewall-util.h
firewall-util-private.h
format-table.c

View File

@ -568,8 +568,7 @@ tests += [
[['src/test/test-firewall-util.c'],
[libshared],
[],
'HAVE_LIBIPTC'],
[]],
[['src/test/test-netlink-manual.c'],
[],