diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index a6f64e8415..7e4989f489 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -1770,11 +1770,6 @@ static int verify_arguments(void) {
         if (arg_expose_ports && !arg_private_network)
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
 
-#if ! HAVE_LIBIPTC
-        if (arg_expose_ports)
-                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
-#endif
-
         if (arg_caps_ambient) {
                 if (arg_caps_ambient == (uint64_t)-1)
                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
diff --git a/src/shared/firewall-util-nft.c b/src/shared/firewall-util-nft.c
new file mode 100644
index 0000000000..6c72956e04
--- /dev/null
+++ b/src/shared/firewall-util-nft.c
@@ -0,0 +1,903 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+/* NOTE(review): the nine #include directives below carry no header name in this copy of the
+ * patch; presumably the <...> operands were stripped during extraction. Restore them before
+ * applying. */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "sd-netlink.h"
+
+#include "alloc-util.h"
+#include "firewall-util.h"
+#include "firewall-util-private.h"
+#include "in-addr-util.h"
+#include "macro.h"
+#include "socket-util.h"
+#include "time-util.h"
+
+/* Names of the nftables table, the dnat map and the masquerade set used throughout this file. */
+#define NFT_SYSTEMD_DNAT_MAP_NAME "map_port_ipport"
+#define NFT_SYSTEMD_TABLE_NAME    "io.systemd.nat"
+#define NFT_SYSTEMD_MASQ_SET_NAME "masq_saddr"
+
+/* Timeout handed to sd_netlink_read() for each reply. */
+#define NFNL_DEFAULT_TIMEOUT_USECS  (1ULL * USEC_PER_SEC)
+
+/* Offset passed to the payload expression when loading the destination port from the
+ * transport header. */
+#define UDP_DPORT_OFFSET 2
+
+/* Sends 'msgcount' messages with sd_netlink_sendv() and then reads the replies for the messages
+ * at indices 1 .. msgcount-2; the first and the last message are not read back (the callers in
+ * this file place the batch begin/end messages there).
+ *
+ * Returns the send error if sending failed, otherwise the first negative value returned by
+ * sd_netlink_read(), otherwise 0. */
+static int nfnl_netlink_sendv(sd_netlink *nfnl,
+                              sd_netlink_message *messages[],
+                              size_t msgcount) {
+        _cleanup_free_ uint32_t *serial = NULL;
+        size_t i;
+        int r;
+
+        assert(msgcount > 0);
+
+        r = sd_netlink_sendv(nfnl, messages, msgcount, &serial);
+        if (r < 0)
+                return r;
+
+        r = 0;
+        for (i = 1; i < msgcount - 1; i++) {
+                int tmp;
+
+                /* If message is an error, this returns embedded errno */
+                tmp = sd_netlink_read(nfnl, serial[i], NFNL_DEFAULT_TIMEOUT_USECS, NULL);
+                /* Keep reading the remaining replies, but remember only the first failure. */
+                if (tmp < 0 && r == 0)
+                        r = tmp;
+        }
+
+        return r;
+}
+
+/* Starts a new expression named 'name': opens an NFTA_LIST_ELEM array entry, appends the
+ * expression name and opens the NFTA_EXPR_DATA container. Both are left open; the caller must
+ * close NFTA_EXPR_DATA and then NFTA_LIST_ELEM. */
+static int nfnl_add_open_expr_container(sd_netlink_message *m, const char *name) {
+        int r;
+
+        r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, name);
+        if (r < 0)
+                return r;
+
+        return sd_netlink_message_open_container_union(m, NFTA_EXPR_DATA, name);
+}
+
+/* Appends a complete "fib" expression with the given flags, result type and destination
+ * register. All three values are appended as big-endian u32. */
+static int nfnl_add_expr_fib(sd_netlink_message *m, uint32_t nft_fib_flags,
+                             enum nft_fib_result result,
+                             enum nft_registers dreg) {
+        int r;
+
+        r = nfnl_add_open_expr_container(m, "fib");
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_FIB_FLAGS, htobe32(nft_fib_flags));
+        if (r < 0)
+                return r;
+        r = sd_netlink_message_append_u32(m, NFTA_FIB_RESULT, htobe32(result));
+        if (r < 0)
+                return r;
+        r = sd_netlink_message_append_u32(m, NFTA_FIB_DREG, htobe32(dreg));
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
+        if (r < 0)
+                return r;
+
+        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
+}
+
+/* Appends a complete "meta" expression that loads meta key 'key' into register 'dreg'. */
+static int nfnl_add_expr_meta(sd_netlink_message *m, enum nft_meta_keys key,
+                              enum nft_registers dreg) {
+        int r;
+
+        r = nfnl_add_open_expr_container(m, "meta");
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_META_KEY, htobe32(key));
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_META_DREG, htobe32(dreg));
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
+        if (r < 0)
+                return r;
+
+        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
+}
+
+/* Appends a complete "payload" expression: 'len' bytes at 'offset' relative to header base 'pb'
+ * are loaded into register 'dreg'. */
+static int nfnl_add_expr_payload(sd_netlink_message *m, enum nft_payload_bases pb,
+                                 uint32_t offset, uint32_t len, enum nft_registers dreg) {
+        int r;
+
+        r = nfnl_add_open_expr_container(m, "payload");
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_DREG, htobe32(dreg));
+        if (r < 0)
+                return r;
+        r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_BASE, htobe32(pb));
+        if (r < 0)
+                return r;
+        r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_OFFSET, htobe32(offset));
+        if (r < 0)
+                return r;
+        r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_LEN, htobe32(len));
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
+        if (r < 0)
+                return r;
+        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
+}
+
+/* Common part of the set and map lookups: opens a "lookup" expression and appends the set name
+ * and the source register. The containers are deliberately left open so that the caller can
+ * append further attributes before closing them. */
+static int nfnl_add_expr_lookup_set_data(sd_netlink_message *m, const char *set_name,
+                                         enum nft_registers sreg) {
+        int r;
+
+        r = nfnl_add_open_expr_container(m, "lookup");
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_string(m, NFTA_LOOKUP_SET, set_name);
+        if (r < 0)
+                return r;
+
+        return sd_netlink_message_append_u32(m, NFTA_LOOKUP_SREG, htobe32(sreg));
+}
+
+/* Appends a complete "lookup" expression against a plain set (no destination register). */
+static int nfnl_add_expr_lookup_set(sd_netlink_message *m, const char *set_name,
+                                    enum nft_registers sreg) {
+        int r;
+
+        r = nfnl_add_expr_lookup_set_data(m, set_name, sreg);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
+        if (r < 0)
+                return r;
+        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
+}
+
+/* Appends a complete "lookup" expression against a map; in addition to the set variant it
+ * appends NFTA_LOOKUP_DREG, the register that receives the mapped data. */
+static int nfnl_add_expr_lookup_map(sd_netlink_message *m, const char *set_name,
+                                    enum nft_registers sreg, enum nft_registers dreg) {
+        int r;
+
+        r = nfnl_add_expr_lookup_set_data(m, set_name, sreg);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_LOOKUP_DREG, htobe32(dreg));
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
+        if (r < 0)
+                return r;
+
+        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
+}
+
+/* Appends 'dlen' bytes of 'data' as NFTA_DATA_VALUE, wrapped in a container of type 'attr'. */
+static int nfnl_add_expr_data(sd_netlink_message *m, int attr, const void *data, uint32_t dlen) {
+        int r;
+
+        r = sd_netlink_message_open_container(m, attr);
+        if (r < 0)
+                return r;
+        r = sd_netlink_message_append_data(m, NFTA_DATA_VALUE, data, dlen);
+        if (r < 0)
+                return r;
+
+        return sd_netlink_message_close_container(m); /* attr */
+}
+
+/* Appends the comparison operand of a "cmp" expression (an NFTA_CMP_DATA container). */
+static int nfnl_add_expr_cmp_data(sd_netlink_message *m, const void *data, uint32_t dlen) {
+        return nfnl_add_expr_data(m, NFTA_CMP_DATA, data, dlen);
+}
+
+/* Appends a complete "cmp" expression: register 'sreg' is compared with operator 'cmp_op'
+ * against the 'dlen' bytes at 'data'. */
+static int nfnl_add_expr_cmp(sd_netlink_message *m, enum nft_cmp_ops cmp_op,
+                             enum nft_registers sreg, const void *data, uint32_t dlen) {
+        int r;
+
+        r = nfnl_add_open_expr_container(m, "cmp");
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_CMP_OP, htobe32(cmp_op));
+        if (r < 0)
+                return r;
+        r = sd_netlink_message_append_u32(m, NFTA_CMP_SREG, htobe32(sreg));
+        if (r < 0)
+                return r;
+
+        r = nfnl_add_expr_cmp_data(m, data, dlen);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
+        if (r < 0)
+                return r;
+        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
+}
+
+/* Appends a complete "bitwise" expression operating on 'len' bytes from 'sreg' into 'dreg';
+ * 'and' is appended as NFTA_BITWISE_MASK and 'xor' as NFTA_BITWISE_XOR, both 'len' bytes long. */
+static int nfnl_add_expr_bitwise(sd_netlink_message *m,
+                                 enum nft_registers sreg,
+                                 enum nft_registers dreg,
+                                 const void *and,
+                                 const void *xor, uint32_t len) {
+        int r;
+
+        r = nfnl_add_open_expr_container(m, "bitwise");
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_BITWISE_SREG, htobe32(sreg));
+        if (r < 0)
+                return r;
+        r = sd_netlink_message_append_u32(m, NFTA_BITWISE_DREG, htobe32(dreg));
+        if (r < 0)
+                return r;
+        r = sd_netlink_message_append_u32(m, NFTA_BITWISE_LEN, htobe32(len));
+        if (r < 0)
+                return r;
+
+        r = nfnl_add_expr_data(m, NFTA_BITWISE_MASK, and, len);
+        if (r < 0)
+                return r;
+
+        r = nfnl_add_expr_data(m, NFTA_BITWISE_XOR, xor, len);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
+        if (r < 0)
+                return r;
+        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
+}
+
+/* Appends a complete "nat" expression of type NFT_NAT_DNAT for 'family'; the new address is
+ * taken from register 'areg' and the new port from register 'preg'. */
+static int nfnl_add_expr_dnat(sd_netlink_message *m,
+                              int family,
+                              enum nft_registers areg,
+                              enum nft_registers preg) {
+        int r;
+
+        r = nfnl_add_open_expr_container(m, "nat");
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_NAT_TYPE, htobe32(NFT_NAT_DNAT));
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_NAT_FAMILY, htobe32(family));
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_ADDR_MIN, htobe32(areg));
+        if (r < 0)
+                return r;
+        r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_PROTO_MIN, htobe32(preg));
+        if (r < 0)
+                return r;
+        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
+        if (r < 0)
+                return r;
+
+        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
+}
+
+/* Appends a "masq" expression. Unlike the other helpers no NFTA_EXPR_DATA container is opened:
+ * only the expression name is emitted. */
+static int nfnl_add_expr_masq(sd_netlink_message *m) {
+        int r;
+
+        r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, "masq");
+        if (r < 0)
+                return r;
+
+        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
+}
+
+/* -t nat -A POSTROUTING -p protocol -s source/pflen -o out_interface -d destination/pflen -j MASQUERADE */
+/* Builds the masquerade rule for 'chain' in the systemd table and returns it in *ret. On failure
+ * *ret is not written and the partially built message is released. */
+static int sd_nfnl_message_new_masq_rule(sd_netlink *nfnl, sd_netlink_message **ret, int family,
+                                         const char *chain) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        int r;
+
+        r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
+        if (r < 0)
+                return r;
+
+        /* 1st statement: ip saddr @masq_saddr. Place iph->saddr in reg1. */
+        r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, saddr),
+                                  sizeof(uint32_t), NFT_REG32_01);
+        if (r < 0)
+                return r;
+
+        /* 1st statement: use reg1 content to make lookup in @masq_saddr set. */
+        r = nfnl_add_expr_lookup_set(m, NFT_SYSTEMD_MASQ_SET_NAME, NFT_REG32_01);
+        if (r < 0)
+                return r;
+
+        /* 2nd statement: masq. Only executed by kernel if the previous lookup was successful. */
+        r = nfnl_add_expr_masq(m);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
+        if (r < 0)
+                return r;
+        *ret = TAKE_PTR(m);
+        return 0;
+}
+
+/* -t nat -A PREROUTING -p protocol --dport local_port -i in_interface -s source/pflen -d destination/pflen -j DNAT --to-destination remote_addr:remote_port */
+/* Builds the dnat rule for 'chain' (used for "prerouting" by the caller in this file) and
+ * returns it in *ret. On failure *ret is not written. */
+static int sd_nfnl_message_new_dnat_rule_pre(sd_netlink *nfnl, sd_netlink_message **ret, int family,
+                                             const char *chain) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        enum nft_registers proto_reg;
+        uint32_t local = RTN_LOCAL;
+        int r;
+
+        r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
+        if (r < 0)
+                return r;
+
+        /* 1st statement: fib daddr type local */
+        r = nfnl_add_expr_fib(m, NFTA_FIB_F_DADDR, NFT_FIB_RESULT_ADDRTYPE, NFT_REG32_01);
+        if (r < 0)
+                return r;
+
+        /* 1st statement (cont.): compare RTN_LOCAL */
+        r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &local, sizeof(local));
+        if (r < 0)
+                return r;
+
+        /* 2nd statement: lookup local port in map, fetch address:dport to map to */
+        r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
+        if (r < 0)
+                return r;
+
+        r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
+                                  sizeof(uint16_t), NFT_REG32_02);
+        if (r < 0)
+                return r;
+
+        /* 3rd statement: lookup 'l4proto . dport', e.g. 'tcp . 22' as key and
+         * store address and port for the dnat mapping in REG1/REG2.
+         */
+        r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
+        if (r < 0)
+                return r;
+
+        /* The lookup above left the mapped address in reg1 and the mapped port in reg2. */
+        proto_reg = NFT_REG32_02;
+        r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
+        if (r < 0)
+                return r;
+        *ret = TAKE_PTR(m);
+        return 0;
+}
+
+/* Builds the dnat rule for 'chain' (used for "output" by the caller in this file) and returns it
+ * in *ret. Compared to the prerouting variant it matches on "daddr not in 127/8" and
+ * "oif == 1" instead of the fib address type. On failure *ret is not written. */
+static int sd_nfnl_message_new_dnat_rule_out(sd_netlink *nfnl, sd_netlink_message **ret,
+                                             int family, const char *chain) {
+        static const uint32_t zero, one = 1;
+
+        uint32_t lonet = htobe32(0x7F000000), lomask = htobe32(0xff000000);
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        enum nft_registers proto_reg;
+        int r;
+
+        r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
+        if (r < 0)
+                return r;
+
+        /* 1st statement: exclude 127.0.0.1/8: ip daddr != 127.0.0.1/8 */
+        r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, daddr),
+                                  sizeof(uint32_t), NFT_REG32_01);
+        if (r < 0)
+                return r;
+
+        /* 1st statement (cont.): bitops/prefix */
+        r = nfnl_add_expr_bitwise(m, NFT_REG32_01, NFT_REG32_01, &lomask, &zero, sizeof(lomask));
+        if (r < 0)
+                return r;
+
+        /* 1st statement (cont.): compare reg1 with 127/8 */
+        r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &lonet, sizeof(lonet));
+        if (r < 0)
+                return r;
+
+        /* 2nd statement: meta oif lo */
+        r = nfnl_add_expr_meta(m, NFT_META_OIF, NFT_REG32_01);
+        if (r < 0)
+                return r;
+
+        /* 2nd statement (cont.): compare to lo ifindex (1) */
+        r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &one, sizeof(one));
+        if (r < 0)
+                return r;
+
+        /* 3rd statement: meta l4proto . th dport dnat ip . port to map @map_port_ipport */
+        r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
+        if (r < 0)
+                return r;
+
+        /* 3rd statement (cont): store the port number in reg2 */
+        r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
+                                  sizeof(uint16_t), NFT_REG32_02);
+        if (r < 0)
+                return r;
+
+        /* 3rd statement (cont): use reg1 and reg2 and retrieve
+         * the new destination ip and port number.
+         *
+         * reg1 and reg2 are clobbered and will then contain the new
+         * address/port number.
+         */
+        r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
+        if (r < 0)
+                return r;
+
+        /* 4th statement: dnat connection to address/port retrieved by the
+         * preceding expression. */
+        proto_reg = NFT_REG32_02;
+        r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
+        if (r < 0)
+                return r;
+        *ret = TAKE_PTR(m);
+        return 0;
+}
+
+/* Builds a "new set" message for 'set_name' in the systemd table and returns it in *ret.
+ * NFTA_SET_FLAGS is only appended when 'flags' is non-zero; 'type' is appended as the key type.
+ * On failure *ret is not written. */
+static int nft_new_set(struct sd_netlink *nfnl,
+                       sd_netlink_message **ret,
+                       int family, const char *set_name,
+                       uint32_t set_id,
+                       uint32_t flags, uint32_t type, uint32_t klen) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        int r;
+
+        r = sd_nfnl_nft_message_new_set(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name, set_id, klen);
+        if (r < 0)
+                return r;
+
+        if (flags != 0) {
+                r = sd_netlink_message_append_u32(m, NFTA_SET_FLAGS, htobe32(flags));
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_netlink_message_append_u32(m, NFTA_SET_KEY_TYPE, htobe32(type));
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(m);
+        /* r is the non-negative result of the last append here. */
+        return r;
+}
+
+/* Builds a "new map" message: a set created with NFT_SET_MAP added to 'flags', plus the data
+ * type 'dtype' and data length 'dlen' of the mapped values. On failure *ret is not written. */
+static int nft_new_map(struct sd_netlink *nfnl,
+                       sd_netlink_message **ret,
+                       int family, const char *set_name, uint32_t set_id,
+                       uint32_t flags, uint32_t type, uint32_t klen, uint32_t dtype, uint32_t dlen) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        int r;
+
+        r = nft_new_set(nfnl, &m, family, set_name, set_id, flags | NFT_SET_MAP, type, klen);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_TYPE, htobe32(dtype));
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_LEN, htobe32(dlen));
+        if (r < 0)
+                return r;
+        *ret = TAKE_PTR(m);
+        return 0;
+}
+
+/* Builds a message that adds a single element (key, plus data for maps) to 'set_name' and
+ * returns it in *ret. On failure *ret is not written. */
+static int nft_add_element(sd_netlink *nfnl, sd_netlink_message **ret,
+                           int family, const char *set_name,
+                           const void *key, uint32_t klen,
+                           const void *data, uint32_t dlen) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        int r;
+
+        /*
+         * Ideally there would be an API that provides:
+         *
+         * 1) an init function to add the main ruleset skeleton
+         * 2) a function that populates the sets with all known address/port pairs to s/dnat for
+         * 3) a function that can remove address/port pairs again.
+         *
+         * At this time, the existing API is used which is built on a
+         * 'add/delete a rule' paradigm.
+         *
+         * This is replicated here and each element gets added to the set
+         * one-by-one.
+         */
+        r = sd_nfnl_nft_message_new_setelems_begin(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name);
+        if (r < 0)
+                return r;
+
+        r = sd_nfnl_nft_message_add_setelem(m, 0, key, klen, data, dlen);
+        if (r < 0)
+                return r;
+
+        /* could theoretically append more set elements to add here */
+        r = sd_nfnl_nft_message_add_setelem_end(m);
+        if (r < 0)
+                return r;
+        *ret = TAKE_PTR(m);
+        return 0;
+}
+
+/* Counterpart of nft_add_element(): builds a message that deletes a single element from
+ * 'set_name' and returns it in *ret. On failure *ret is not written. */
+static int nft_del_element(sd_netlink *nfnl,
+                           sd_netlink_message **ret, int family, const char *set_name,
+                           const void *key, uint32_t klen,
+                           const void *data, uint32_t dlen) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        int r;
+
+        r = sd_nfnl_nft_message_del_setelems_begin(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name);
+        if (r < 0)
+                return r;
+
+        r = sd_nfnl_nft_message_add_setelem(m, 0, key, klen, data, dlen);
+        if (r < 0)
+                return r;
+
+        r = sd_nfnl_nft_message_add_setelem_end(m);
+        if (r < 0)
+                return r;
+        *ret = TAKE_PTR(m);
+        return 0;
+}
+
+/* This is needed so 'nft' userspace tool can properly format the contents
+ * of the set/map when someone uses 'nft' to inspect their content.
+ *
+ * The values cannot be changed, they are part of the nft tool type identifier ABI.
+ */ +#define TYPE_BITS 6 + +enum nft_key_types { + TYPE_IPADDR = 7, + TYPE_IP6ADDR = 8, + TYPE_INET_PROTOCOL = 12, + TYPE_INET_SERVICE = 13, +}; + +static uint32_t concat_types2(enum nft_key_types a, enum nft_key_types b) { + uint32_t type = (uint32_t)a; + + type <<= TYPE_BITS; + type |= (uint32_t)b; + + return type; +} + +/* enough space to hold netlink messages for table skeleton */ +#define NFT_INIT_MSGS 16 +static int fw_nftables_init_family(sd_netlink *nfnl, int family) { + sd_netlink_message *batch[NFT_INIT_MSGS] = {}; + size_t ip_type_size = sizeof(uint32_t); + int ip_type = TYPE_IPADDR, r; + size_t msgcnt = 0, i; + uint32_t set_id = 0; + + r = sd_nfnl_message_batch_begin(nfnl, &batch[msgcnt]); + if (r < 0) + goto out_unref; + + msgcnt++; + assert(msgcnt < NFT_INIT_MSGS); + /* Set F_EXCL so table add fails if the table already exists. */ + r = sd_nfnl_nft_message_new_table(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME, NLM_F_EXCL | NLM_F_ACK); + if (r < 0) + goto out_unref; + + msgcnt++; + assert(msgcnt < NFT_INIT_MSGS); + + r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME, + "prerouting", "nat", + NF_INET_PRE_ROUTING, NF_IP_PRI_NAT_DST + 1); + if (r < 0) + goto out_unref; + + msgcnt++; + assert(msgcnt < NFT_INIT_MSGS); + r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME, + "output", "nat", + NF_INET_LOCAL_OUT, NF_IP_PRI_NAT_DST + 1); + if (r < 0) + goto out_unref; + + msgcnt++; + assert(msgcnt < NFT_INIT_MSGS); + r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME, + "postrouting", "nat", + NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC + 1); + if (r < 0) + goto out_unref; + + msgcnt++; + assert(msgcnt < NFT_INIT_MSGS); + /* set to store ip address ranges we should masquerade for */ + r = nft_new_set(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_MASQ_SET_NAME, ++set_id, NFT_SET_INTERVAL, ip_type, ip_type_size); + if (r < 0) + goto 
out_unref; + + /* + * map to store ip address:port pair to dnat to. elements in concatenation + * are rounded up to 4 bytes. + * + * Example: ip protocol . tcp daddr is sizeof(uint32_t) + sizeof(uint32_t), not + * sizeof(uint8_t) + sizeof(uint16_t). + */ + msgcnt++; + assert(msgcnt < NFT_INIT_MSGS); + r = nft_new_map(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_DNAT_MAP_NAME, ++set_id, 0, + concat_types2(TYPE_INET_PROTOCOL, TYPE_INET_SERVICE), sizeof(uint32_t) * 2, + concat_types2(ip_type, TYPE_INET_SERVICE), ip_type_size + sizeof(uint32_t)); + if (r < 0) + goto out_unref; + + msgcnt++; + assert(msgcnt < NFT_INIT_MSGS); + r = sd_nfnl_message_new_dnat_rule_pre(nfnl, &batch[msgcnt], family, "prerouting"); + if (r < 0) + goto out_unref; + + msgcnt++; + assert(msgcnt < NFT_INIT_MSGS); + r = sd_nfnl_message_new_dnat_rule_out(nfnl, &batch[msgcnt], family, "output"); + if (r < 0) + goto out_unref; + + msgcnt++; + r = sd_nfnl_message_new_masq_rule(nfnl, &batch[msgcnt], family, "postrouting"); + if (r < 0) + goto out_unref; + + msgcnt++; + assert(msgcnt < NFT_INIT_MSGS); + r = sd_nfnl_message_batch_end(nfnl, &batch[msgcnt]); + if (r < 0) + goto out_unref; + + msgcnt++; + assert(msgcnt <= NFT_INIT_MSGS); + r = nfnl_netlink_sendv(nfnl, batch, msgcnt); + if (r == -EEXIST) + r = 0; + +out_unref: + for (i = 0; i < msgcnt; i++) + sd_netlink_message_unref(batch[i]); + + return r; +} + +int fw_nftables_init(FirewallContext *ctx) { + _cleanup_(sd_netlink_unrefp) sd_netlink *nfnl = NULL; + int r; + + r = sd_nfnl_socket_open(&nfnl); + if (r < 0) + return r; + + r = fw_nftables_init_family(nfnl, AF_INET); + if (r < 0) + return r; + + ctx->nfnl = TAKE_PTR(nfnl); + return 0; +} + +void fw_nftables_exit(FirewallContext *ctx) { + ctx->nfnl = sd_netlink_unref(ctx->nfnl); +} + +static int nft_message_add_setelem_iprange(sd_netlink_message *m, + const union in_addr_union *source, + unsigned int prefixlen) { + uint32_t mask, start, end; + unsigned int nplen; + int r; + + assert(prefixlen <= 32); 
+ nplen = 32 - prefixlen; + + mask = (1U << nplen) - 1U; + mask = htobe32(~mask); + start = source->in.s_addr & mask; + + r = sd_nfnl_nft_message_add_setelem(m, 0, &start, sizeof(start), NULL, 0); + if (r < 0) + return r; + + r = sd_nfnl_nft_message_add_setelem_end(m); + if (r < 0) + return r; + + end = be32toh(start) + (1U << nplen); + if (end < be32toh(start)) + end = 0U; + end = htobe32(end); + + r = sd_nfnl_nft_message_add_setelem(m, 1, &end, sizeof(end), NULL, 0); + if (r < 0) + return r; + + r = sd_netlink_message_append_u32(m, NFTA_SET_ELEM_FLAGS, htobe32(NFT_SET_ELEM_INTERVAL_END)); + if (r < 0) + return r; + + r = sd_nfnl_nft_message_add_setelem_end(m); + if (r < 0) + return r; + + return 0; +} + +#define NFT_MASQ_MSGS 3 + +int fw_nftables_add_masquerade( + FirewallContext *ctx, + bool add, + int af, + const union in_addr_union *source, + unsigned int source_prefixlen) { + sd_netlink_message *transaction[NFT_MASQ_MSGS] = {}; + size_t tsize; + int r; + + if (!source || source_prefixlen == 0) + return -EINVAL; + + r = sd_nfnl_message_batch_begin(ctx->nfnl, &transaction[0]); + if (r < 0) + return r; + tsize = 1; + if (add) + r = sd_nfnl_nft_message_new_setelems_begin(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME); + else + r = sd_nfnl_nft_message_del_setelems_begin(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME); + + if (r < 0) + goto out_unref; + + r = nft_message_add_setelem_iprange(transaction[tsize], source, source_prefixlen); + if (r < 0) + goto out_unref; + + ++tsize; + assert(tsize < NFT_MASQ_MSGS); + r = sd_nfnl_message_batch_end(ctx->nfnl, &transaction[tsize]); + if (r < 0) + return r; + ++tsize; + r = nfnl_netlink_sendv(ctx->nfnl, transaction, tsize); + +out_unref: + while (tsize > 0) + sd_netlink_message_unref(transaction[--tsize]); + return r < 0 ? 
r : 0; +} + +#define NFT_DNAT_MSGS 4 + +int fw_nftables_add_local_dnat( + FirewallContext *ctx, + bool add, + int af, + int protocol, + uint16_t local_port, + const union in_addr_union *remote, + uint16_t remote_port, + const union in_addr_union *previous_remote) { + uint32_t data[2], key[2]; + sd_netlink_message *transaction[NFT_DNAT_MSGS] = {}; + size_t tsize; + int r; + + assert(add || !previous_remote); + + if (af != AF_INET) + return -EAFNOSUPPORT; + + if (!IN_SET(protocol, IPPROTO_TCP, IPPROTO_UDP)) + return -EPROTONOSUPPORT; + + if (local_port <= 0) + return -EINVAL; + + key[0] = protocol; + key[1] = htobe16(local_port); + + if (!remote) + return -EOPNOTSUPP; + + if (remote_port <= 0) + return -EINVAL; + + data[1] = htobe16(remote_port); + + r = sd_nfnl_message_batch_begin(ctx->nfnl, &transaction[0]); + if (r < 0) + return r; + + tsize = 1; + /* If a previous remote is set, remove its entry */ + if (add && previous_remote && previous_remote->in.s_addr != remote->in.s_addr) { + data[0] = previous_remote->in.s_addr; + + r = nft_del_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, sizeof(data)); + if (r < 0) + goto out_unref; + + tsize++; + } + + data[0] = remote->in.s_addr; + + assert(tsize < NFT_DNAT_MSGS); + if (add) + nft_add_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, sizeof(data)); + else + nft_del_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, sizeof(data)); + + tsize++; + assert(tsize < NFT_DNAT_MSGS); + + r = sd_nfnl_message_batch_end(ctx->nfnl, &transaction[tsize]); + if (r < 0) + goto out_unref; + + tsize++; + assert(tsize <= NFT_DNAT_MSGS); + r = nfnl_netlink_sendv(ctx->nfnl, transaction, tsize); + +out_unref: + while (tsize > 0) + sd_netlink_message_unref(transaction[--tsize]); + return r < 0 ? 
r : 0; +} diff --git a/src/shared/firewall-util-private.h b/src/shared/firewall-util-private.h index 7f9efbc513..f034af180e 100644 --- a/src/shared/firewall-util-private.h +++ b/src/shared/firewall-util-private.h @@ -5,18 +5,41 @@ #include #include "in-addr-util.h" +#include "sd-netlink.h" enum FirewallBackend { FW_BACKEND_NONE, #if HAVE_LIBIPTC FW_BACKEND_IPTABLES, #endif + FW_BACKEND_NFTABLES, }; struct FirewallContext { enum FirewallBackend firewall_backend; + sd_netlink *nfnl; }; +int fw_nftables_init(FirewallContext *ctx); +void fw_nftables_exit(FirewallContext *ctx); + +int fw_nftables_add_masquerade( + FirewallContext *ctx, + bool add, + int af, + const union in_addr_union *source, + unsigned source_prefixlen); + +int fw_nftables_add_local_dnat( + FirewallContext *ctx, + bool add, + int af, + int protocol, + uint16_t local_port, + const union in_addr_union *remote, + uint16_t remote_port, + const union in_addr_union *previous_remote); + #if HAVE_LIBIPTC int fw_iptables_add_masquerade( diff --git a/src/shared/firewall-util.c b/src/shared/firewall-util.c index edfe5787b1..3bed941127 100644 --- a/src/shared/firewall-util.c +++ b/src/shared/firewall-util.c @@ -8,7 +8,9 @@ #include "firewall-util.h" #include "firewall-util-private.h" -static enum FirewallBackend firewall_backend_probe(void) { +static enum FirewallBackend firewall_backend_probe(FirewallContext *ctx) { + if (fw_nftables_init(ctx) == 0) + return FW_BACKEND_NFTABLES; #if HAVE_LIBIPTC return FW_BACKEND_IPTABLES; #else @@ -23,11 +25,24 @@ int fw_ctx_new(FirewallContext **ret) { if (!ctx) return -ENOMEM; - *ret = TAKE_PTR(ctx); - return 0; + /* could probe here. However, this means that we will load + * iptable_nat or nf_tables, both will enable connection tracking. + * + * Alternative would be to probe here but only call + * fw_ctx_new when nspawn/networkd know they will call + * fw_add_masquerade/local_dnat later anyway. 
+ */ + *ret = TAKE_PTR(ctx); + return 0; } FirewallContext *fw_ctx_free(FirewallContext *ctx) { + if (!ctx) + return NULL; + + if (ctx->firewall_backend == FW_BACKEND_NFTABLES) + fw_nftables_exit(ctx); + return mfree(ctx); } @@ -48,7 +63,7 @@ int fw_add_masquerade( ctx = *fw_ctx; if (ctx->firewall_backend == FW_BACKEND_NONE) - ctx->firewall_backend = firewall_backend_probe(); + ctx->firewall_backend = firewall_backend_probe(ctx); switch (ctx->firewall_backend) { case FW_BACKEND_NONE: @@ -57,6 +72,8 @@ int fw_add_masquerade( case FW_BACKEND_IPTABLES: return fw_iptables_add_masquerade(add, af, source, source_prefixlen); #endif + case FW_BACKEND_NFTABLES: + return fw_nftables_add_masquerade(ctx, add, af, source, source_prefixlen); } return -EOPNOTSUPP; @@ -81,11 +98,13 @@ int fw_add_local_dnat( ctx = *fw_ctx; if (ctx->firewall_backend == FW_BACKEND_NONE) - ctx->firewall_backend = firewall_backend_probe(); + ctx->firewall_backend = firewall_backend_probe(ctx); switch (ctx->firewall_backend) { case FW_BACKEND_NONE: return -EOPNOTSUPP; + case FW_BACKEND_NFTABLES: + return fw_nftables_add_local_dnat(ctx, add, af, protocol, local_port, remote, remote_port, previous_remote); #if HAVE_LIBIPTC case FW_BACKEND_IPTABLES: return fw_iptables_add_local_dnat(add, af, protocol, local_port, remote, remote_port, previous_remote); diff --git a/src/shared/meson.build b/src/shared/meson.build index de916e0a4c..18a22a6389 100644 --- a/src/shared/meson.build +++ b/src/shared/meson.build @@ -102,6 +102,7 @@ shared_sources = files(''' fileio-label.c fileio-label.h firewall-util.c + firewall-util-nft.c firewall-util.h firewall-util-private.h format-table.c diff --git a/src/test/meson.build b/src/test/meson.build index 3afe5d58cb..e016f40ab1 100644 --- a/src/test/meson.build +++ b/src/test/meson.build @@ -568,8 +568,7 @@ tests += [ [['src/test/test-firewall-util.c'], [libshared], - [], - 'HAVE_LIBIPTC'], + []], [['src/test/test-netlink-manual.c'], [],