diff --git a/man/systemd.network.xml b/man/systemd.network.xml index 04cbde391e..6f08edb369 100644 --- a/man/systemd.network.xml +++ b/man/systemd.network.xml @@ -2287,6 +2287,56 @@ + + [TrafficControlQueueingDiscipline] Section Options + The [TrafficControlQueueingDiscipline] section manages the Traffic control. It can be used + to configure the kernel packet scheduler and simulate packet delay and loss for UDP or TCP applications, + or limit the bandwidth usage of a particular service to simulate internet connections. + + + + Parent= + + Specifies the parent Queueing Discipline (qdisc). Takes one of root + or clsact. Defaults to root. + + + + + NetworkEmulatorDelaySec= + + Specifies the fixed amount of delay to be added to all packets going out of the + interface. Defaults to unset. + + + + + NetworkEmulatorDelayJitterSec= + + Specifies the chosen delay to be added to the packets outgoing to the network + interface. Defaults to unset. + + + + + NetworkEmulatorPacketLimit= + + Specifies the maximum number of packets the qdisc may hold queued at a time. + An unsigned integer ranges 0 to 4294967294. Defaults to 1000. + + + + + NetworkEmulatorLossRate= + + Specifies an independent loss probability to be added to the packets outgoing from the + network interface. Takes a percentage value, suffixed with "%". Defaults to unset. + + + + + + [BridgeVLAN] Section Options The [BridgeVLAN] section manages the VLAN ID configuration of a bridge port and accepts diff --git a/src/basic/linux/pkt_sched.h b/src/basic/linux/pkt_sched.h new file mode 100644 index 0000000000..daf605760c --- /dev/null +++ b/src/basic/linux/pkt_sched.h @@ -0,0 +1,1184 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_SCHED_H +#define __LINUX_PKT_SCHED_H + +#include +#include + +/* Logical priority bands not depending on specific packet scheduler. + Every scheduler will map them to real traffic classes, if it has + no more precise mechanism to classify packets. + + These numbers have no special meaning, though their coincidence + with obsolete IPv6 values is not occasional :-). New IPv6 drafts + preferred full anarchy inspired by diffserv group. + + Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy + class, actually, as rule it will be handled with more care than + filler or even bulk. +*/ + +#define TC_PRIO_BESTEFFORT 0 +#define TC_PRIO_FILLER 1 +#define TC_PRIO_BULK 2 +#define TC_PRIO_INTERACTIVE_BULK 4 +#define TC_PRIO_INTERACTIVE 6 +#define TC_PRIO_CONTROL 7 + +#define TC_PRIO_MAX 15 + +/* Generic queue statistics, available for all the elements. + Particular schedulers may have also their private records. +*/ + +struct tc_stats { + __u64 bytes; /* Number of enqueued bytes */ + __u32 packets; /* Number of enqueued packets */ + __u32 drops; /* Packets dropped because of lack of resources */ + __u32 overlimits; /* Number of throttle events when this + * flow goes out of allocated bandwidth */ + __u32 bps; /* Current flow byte rate */ + __u32 pps; /* Current flow packet rate */ + __u32 qlen; + __u32 backlog; +}; + +struct tc_estimator { + signed char interval; + unsigned char ewma_log; +}; + +/* "Handles" + --------- + + All the traffic control objects have 32bit identifiers, or "handles". + + They can be considered as opaque numbers from user API viewpoint, + but actually they always consist of two fields: major and + minor numbers, which are interpreted by kernel specially, + that may be used by applications, though not recommended. + + F.e. qdisc handles always have minor number equal to zero, + classes (or flows) have major equal to parent qdisc major, and + minor uniquely identifying class inside qdisc. + + Macros to manipulate handles: +*/ + +#define TC_H_MAJ_MASK (0xFFFF0000U) +#define TC_H_MIN_MASK (0x0000FFFFU) +#define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK) +#define TC_H_MIN(h) ((h)&TC_H_MIN_MASK) +#define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK)) + +#define TC_H_UNSPEC (0U) +#define TC_H_ROOT (0xFFFFFFFFU) +#define TC_H_INGRESS (0xFFFFFFF1U) +#define TC_H_CLSACT TC_H_INGRESS + +#define TC_H_MIN_PRIORITY 0xFFE0U +#define TC_H_MIN_INGRESS 0xFFF2U +#define TC_H_MIN_EGRESS 0xFFF3U + +/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ +enum tc_link_layer { + TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ + TC_LINKLAYER_ETHERNET, + TC_LINKLAYER_ATM, +}; +#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ + +struct tc_ratespec { + unsigned char cell_log; + __u8 linklayer; /* lower 4 bits */ + unsigned short overhead; + short cell_align; + unsigned short mpu; + __u32 rate; +}; + +#define TC_RTAB_SIZE 1024 + +struct tc_sizespec { + unsigned char cell_log; + unsigned char size_log; + short cell_align; + int overhead; + unsigned int linklayer; + unsigned int mpu; + unsigned int mtu; + unsigned int tsize; +}; + +enum { + TCA_STAB_UNSPEC, + TCA_STAB_BASE, + TCA_STAB_DATA, + __TCA_STAB_MAX +}; + +#define TCA_STAB_MAX (__TCA_STAB_MAX - 1) + +/* FIFO section */ + +struct tc_fifo_qopt { + __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ +}; + +/* SKBPRIO section */ + +/* + * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1). + * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able + * to map one to one the DS field of IPV4 and IPV6 headers. + * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY. + */ + +#define SKBPRIO_MAX_PRIORITY 64 + +struct tc_skbprio_qopt { + __u32 limit; /* Queue length in packets. */ +}; + +/* PRIO section */ + +#define TCQ_PRIO_BANDS 16 +#define TCQ_MIN_PRIO_BANDS 2 + +struct tc_prio_qopt { + int bands; /* Number of bands */ + __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */ +}; + +/* MULTIQ section */ + +struct tc_multiq_qopt { + __u16 bands; /* Number of bands */ + __u16 max_bands; /* Maximum number of queues */ +}; + +/* PLUG section */ + +#define TCQ_PLUG_BUFFER 0 +#define TCQ_PLUG_RELEASE_ONE 1 +#define TCQ_PLUG_RELEASE_INDEFINITE 2 +#define TCQ_PLUG_LIMIT 3 + +struct tc_plug_qopt { + /* TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). + * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ + int action; + __u32 limit; +}; + +/* TBF section */ + +struct tc_tbf_qopt { + struct tc_ratespec rate; + struct tc_ratespec peakrate; + __u32 limit; + __u32 buffer; + __u32 mtu; +}; + +enum { + TCA_TBF_UNSPEC, + TCA_TBF_PARMS, + TCA_TBF_RTAB, + TCA_TBF_PTAB, + TCA_TBF_RATE64, + TCA_TBF_PRATE64, + TCA_TBF_BURST, + TCA_TBF_PBURST, + TCA_TBF_PAD, + __TCA_TBF_MAX, +}; + +#define TCA_TBF_MAX (__TCA_TBF_MAX - 1) + + +/* TEQL section */ + +/* TEQL does not require any parameters */ + +/* SFQ section */ + +struct tc_sfq_qopt { + unsigned quantum; /* Bytes per round allocated to flow */ + int perturb_period; /* Period of hash perturbation */ + __u32 limit; /* Maximal packets in queue */ + unsigned divisor; /* Hash divisor */ + unsigned flows; /* Maximal number of flows */ +}; + +struct tc_sfqred_stats { + __u32 prob_drop; /* Early drops, below max threshold */ + __u32 forced_drop; /* Early drops, after max threshold */ + __u32 prob_mark; /* Marked packets, below max threshold */ + __u32 forced_mark; /* Marked packets, after max threshold */ + __u32 prob_mark_head; /* Marked packets, below max threshold */ + __u32 forced_mark_head;/* Marked packets, after max threshold */ +}; + +struct tc_sfq_qopt_v1 { + struct tc_sfq_qopt v0; + unsigned int depth; /* max number of packets per flow */ + unsigned int headdrop; + /* SFQRED parameters */ + __u32 limit; /* HARD maximal flow queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; + __u32 max_P; /* probability, high resolution */ + /* SFQRED stats */ + struct tc_sfqred_stats stats; +}; + + +struct tc_sfq_xstats { + __s32 allot; +}; + +/* RED section */ + +enum { + TCA_RED_UNSPEC, + TCA_RED_PARMS, + TCA_RED_STAB, + TCA_RED_MAX_P, + __TCA_RED_MAX, +}; + +#define TCA_RED_MAX (__TCA_RED_MAX - 1) + +struct tc_red_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; +#define TC_RED_ECN 1 +#define TC_RED_HARDDROP 2 +#define TC_RED_ADAPTATIVE 4 +}; + +struct tc_red_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ +}; + +/* GRED section */ + +#define MAX_DPs 16 + +enum { + TCA_GRED_UNSPEC, + TCA_GRED_PARMS, + TCA_GRED_STAB, + TCA_GRED_DPS, + TCA_GRED_MAX_P, + TCA_GRED_LIMIT, + TCA_GRED_VQ_LIST, /* nested TCA_GRED_VQ_ENTRY */ + __TCA_GRED_MAX, +}; + +#define TCA_GRED_MAX (__TCA_GRED_MAX - 1) + +enum { + TCA_GRED_VQ_ENTRY_UNSPEC, + TCA_GRED_VQ_ENTRY, /* nested TCA_GRED_VQ_* */ + __TCA_GRED_VQ_ENTRY_MAX, +}; +#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1) + +enum { + TCA_GRED_VQ_UNSPEC, + TCA_GRED_VQ_PAD, + TCA_GRED_VQ_DP, /* u32 */ + TCA_GRED_VQ_STAT_BYTES, /* u64 */ + TCA_GRED_VQ_STAT_PACKETS, /* u32 */ + TCA_GRED_VQ_STAT_BACKLOG, /* u32 */ + TCA_GRED_VQ_STAT_PROB_DROP, /* u32 */ + TCA_GRED_VQ_STAT_PROB_MARK, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_DROP, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_MARK, /* u32 */ + TCA_GRED_VQ_STAT_PDROP, /* u32 */ + TCA_GRED_VQ_STAT_OTHER, /* u32 */ + TCA_GRED_VQ_FLAGS, /* u32 */ + __TCA_GRED_VQ_MAX +}; + +#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1) + +struct tc_gred_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + __u32 DP; /* up to 2^32 DPs */ + __u32 backlog; + __u32 qave; + __u32 forced; + __u32 early; + __u32 other; + __u32 pdrop; + __u8 Wlog; /* log(W) */ + __u8 Plog; /* log(P_max/(qth_max-qth_min)) */ + __u8 Scell_log; /* cell size for idle damping */ + __u8 prio; /* prio of this VQ */ + __u32 packets; + __u32 bytesin; +}; + +/* gred setup */ +struct tc_gred_sopt { + __u32 DPs; + __u32 def_DP; + __u8 grio; + __u8 flags; + __u16 pad1; +}; + +/* CHOKe section */ + +enum { + TCA_CHOKE_UNSPEC, + TCA_CHOKE_PARMS, + TCA_CHOKE_STAB, + TCA_CHOKE_MAX_P, + __TCA_CHOKE_MAX, +}; + +#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1) + +struct tc_choke_qopt { + __u32 limit; /* Hard queue length (packets) */ + __u32 qth_min; /* Min average threshold (packets) */ + __u32 qth_max; /* Max average threshold (packets) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; /* see RED flags */ +}; + +struct tc_choke_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ + __u32 matched; /* Drops due to flow match */ +}; + +/* HTB section */ +#define TC_HTB_NUMPRIO 8 +#define TC_HTB_MAXDEPTH 8 +#define TC_HTB_PROTOVER 3 /* the same as HTB and TC's major */ + +struct tc_htb_opt { + struct tc_ratespec rate; + struct tc_ratespec ceil; + __u32 buffer; + __u32 cbuffer; + __u32 quantum; + __u32 level; /* out only */ + __u32 prio; +}; +struct tc_htb_glob { + __u32 version; /* to match HTB/TC */ + __u32 rate2quantum; /* bps->quantum divisor */ + __u32 defcls; /* default class number */ + __u32 debug; /* debug flags */ + + /* stats */ + __u32 direct_pkts; /* count of non shaped packets */ +}; +enum { + TCA_HTB_UNSPEC, + TCA_HTB_PARMS, + TCA_HTB_INIT, + TCA_HTB_CTAB, + TCA_HTB_RTAB, + TCA_HTB_DIRECT_QLEN, + TCA_HTB_RATE64, + TCA_HTB_CEIL64, + TCA_HTB_PAD, + __TCA_HTB_MAX, +}; + +#define TCA_HTB_MAX (__TCA_HTB_MAX - 1) + +struct tc_htb_xstats { + __u32 lends; + __u32 borrows; + __u32 giants; /* unused since 'Make HTB scheduler work with TSO.' */ + __s32 tokens; + __s32 ctokens; +}; + +/* HFSC section */ + +struct tc_hfsc_qopt { + __u16 defcls; /* default class */ +}; + +struct tc_service_curve { + __u32 m1; /* slope of the first segment in bps */ + __u32 d; /* x-projection of the first segment in us */ + __u32 m2; /* slope of the second segment in bps */ +}; + +struct tc_hfsc_stats { + __u64 work; /* total work done */ + __u64 rtwork; /* work done by real-time criteria */ + __u32 period; /* current period */ + __u32 level; /* class level in hierarchy */ +}; + +enum { + TCA_HFSC_UNSPEC, + TCA_HFSC_RSC, + TCA_HFSC_FSC, + TCA_HFSC_USC, + __TCA_HFSC_MAX, +}; + +#define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1) + + +/* CBQ section */ + +#define TC_CBQ_MAXPRIO 8 +#define TC_CBQ_MAXLEVEL 8 +#define TC_CBQ_DEF_EWMA 5 + +struct tc_cbq_lssopt { + unsigned char change; + unsigned char flags; +#define TCF_CBQ_LSS_BOUNDED 1 +#define TCF_CBQ_LSS_ISOLATED 2 + unsigned char ewma_log; + unsigned char level; +#define TCF_CBQ_LSS_FLAGS 1 +#define TCF_CBQ_LSS_EWMA 2 +#define TCF_CBQ_LSS_MAXIDLE 4 +#define TCF_CBQ_LSS_MINIDLE 8 +#define TCF_CBQ_LSS_OFFTIME 0x10 +#define TCF_CBQ_LSS_AVPKT 0x20 + __u32 maxidle; + __u32 minidle; + __u32 offtime; + __u32 avpkt; +}; + +struct tc_cbq_wrropt { + unsigned char flags; + unsigned char priority; + unsigned char cpriority; + unsigned char __reserved; + __u32 allot; + __u32 weight; +}; + +struct tc_cbq_ovl { + unsigned char strategy; +#define TC_CBQ_OVL_CLASSIC 0 +#define TC_CBQ_OVL_DELAY 1 +#define TC_CBQ_OVL_LOWPRIO 2 +#define TC_CBQ_OVL_DROP 3 +#define TC_CBQ_OVL_RCLASSIC 4 + unsigned char priority2; + __u16 pad; + __u32 penalty; +}; + +struct tc_cbq_police { + unsigned char police; + unsigned char __res1; + unsigned short __res2; +}; + +struct tc_cbq_fopt { + __u32 split; + __u32 defmap; + __u32 defchange; +}; + +struct tc_cbq_xstats { + __u32 borrows; + __u32 overactions; + __s32 avgidle; + __s32 undertime; +}; + +enum { + TCA_CBQ_UNSPEC, + TCA_CBQ_LSSOPT, + TCA_CBQ_WRROPT, + TCA_CBQ_FOPT, + TCA_CBQ_OVL_STRATEGY, + TCA_CBQ_RATE, + TCA_CBQ_RTAB, + TCA_CBQ_POLICE, + __TCA_CBQ_MAX, +}; + +#define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1) + +/* dsmark section */ + +enum { + TCA_DSMARK_UNSPEC, + TCA_DSMARK_INDICES, + TCA_DSMARK_DEFAULT_INDEX, + TCA_DSMARK_SET_TC_INDEX, + TCA_DSMARK_MASK, + TCA_DSMARK_VALUE, + __TCA_DSMARK_MAX, +}; + +#define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1) + +/* ATM section */ + +enum { + TCA_ATM_UNSPEC, + TCA_ATM_FD, /* file/socket descriptor */ + TCA_ATM_PTR, /* pointer to descriptor - later */ + TCA_ATM_HDR, /* LL header */ + TCA_ATM_EXCESS, /* excess traffic class (0 for CLP) */ + TCA_ATM_ADDR, /* PVC address (for output only) */ + TCA_ATM_STATE, /* VC state (ATM_VS_*; for output only) */ + __TCA_ATM_MAX, +}; + +#define TCA_ATM_MAX (__TCA_ATM_MAX - 1) + +/* Network emulator */ + +enum { + TCA_NETEM_UNSPEC, + TCA_NETEM_CORR, + TCA_NETEM_DELAY_DIST, + TCA_NETEM_REORDER, + TCA_NETEM_CORRUPT, + TCA_NETEM_LOSS, + TCA_NETEM_RATE, + TCA_NETEM_ECN, + TCA_NETEM_RATE64, + TCA_NETEM_PAD, + TCA_NETEM_LATENCY64, + TCA_NETEM_JITTER64, + TCA_NETEM_SLOT, + TCA_NETEM_SLOT_DIST, + __TCA_NETEM_MAX, +}; + +#define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1) + +struct tc_netem_qopt { + __u32 latency; /* added delay (us) */ + __u32 limit; /* fifo limit (packets) */ + __u32 loss; /* random packet loss (0=none ~0=100%) */ + __u32 gap; /* re-ordering gap (0 for none) */ + __u32 duplicate; /* random packet dup (0=none ~0=100%) */ + __u32 jitter; /* random jitter in latency (us) */ +}; + +struct tc_netem_corr { + __u32 delay_corr; /* delay correlation */ + __u32 loss_corr; /* packet loss correlation */ + __u32 dup_corr; /* duplicate correlation */ +}; + +struct tc_netem_reorder { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_corrupt { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_rate { + __u32 rate; /* byte/s */ + __s32 packet_overhead; + __u32 cell_size; + __s32 cell_overhead; +}; + +struct tc_netem_slot { + __s64 min_delay; /* nsec */ + __s64 max_delay; + __s32 max_packets; + __s32 max_bytes; + __s64 dist_delay; /* nsec */ + __s64 dist_jitter; /* nsec */ +}; + +enum { + NETEM_LOSS_UNSPEC, + NETEM_LOSS_GI, /* General Intuitive - 4 state model */ + NETEM_LOSS_GE, /* Gilbert Elliot models */ + __NETEM_LOSS_MAX +}; +#define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1) + +/* State transition probabilities for 4 state model */ +struct tc_netem_gimodel { + __u32 p13; + __u32 p31; + __u32 p32; + __u32 p14; + __u32 p23; +}; + +/* Gilbert-Elliot models */ +struct tc_netem_gemodel { + __u32 p; + __u32 r; + __u32 h; + __u32 k1; +}; + +#define NETEM_DIST_SCALE 8192 +#define NETEM_DIST_MAX 16384 + +/* DRR */ + +enum { + TCA_DRR_UNSPEC, + TCA_DRR_QUANTUM, + __TCA_DRR_MAX +}; + +#define TCA_DRR_MAX (__TCA_DRR_MAX - 1) + +struct tc_drr_stats { + __u32 deficit; +}; + +/* MQPRIO */ +#define TC_QOPT_BITMASK 15 +#define TC_QOPT_MAX_QUEUE 16 + +enum { + TC_MQPRIO_HW_OFFLOAD_NONE, /* no offload requested */ + TC_MQPRIO_HW_OFFLOAD_TCS, /* offload TCs, no queue counts */ + __TC_MQPRIO_HW_OFFLOAD_MAX +}; + +#define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) + +enum { + TC_MQPRIO_MODE_DCB, + TC_MQPRIO_MODE_CHANNEL, + __TC_MQPRIO_MODE_MAX +}; + +#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1) + +enum { + TC_MQPRIO_SHAPER_DCB, + TC_MQPRIO_SHAPER_BW_RATE, /* Add new shapers below */ + __TC_MQPRIO_SHAPER_MAX +}; + +#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) + +struct tc_mqprio_qopt { + __u8 num_tc; + __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; + __u8 hw; + __u16 count[TC_QOPT_MAX_QUEUE]; + __u16 offset[TC_QOPT_MAX_QUEUE]; +}; + +#define TC_MQPRIO_F_MODE 0x1 +#define TC_MQPRIO_F_SHAPER 0x2 +#define TC_MQPRIO_F_MIN_RATE 0x4 +#define TC_MQPRIO_F_MAX_RATE 0x8 + +enum { + TCA_MQPRIO_UNSPEC, + TCA_MQPRIO_MODE, + TCA_MQPRIO_SHAPER, + TCA_MQPRIO_MIN_RATE64, + TCA_MQPRIO_MAX_RATE64, + __TCA_MQPRIO_MAX, +}; + +#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1) + +/* SFB */ + +enum { + TCA_SFB_UNSPEC, + TCA_SFB_PARMS, + __TCA_SFB_MAX, +}; + +#define TCA_SFB_MAX (__TCA_SFB_MAX - 1) + +/* + * Note: increment, decrement are Q0.16 fixed-point values. + */ +struct tc_sfb_qopt { + __u32 rehash_interval; /* delay between hash move, in ms */ + __u32 warmup_time; /* double buffering warmup time in ms (warmup_time < rehash_interval) */ + __u32 max; /* max len of qlen_min */ + __u32 bin_size; /* maximum queue length per bin */ + __u32 increment; /* probability increment, (d1 in Blue) */ + __u32 decrement; /* probability decrement, (d2 in Blue) */ + __u32 limit; /* max SFB queue length */ + __u32 penalty_rate; /* inelastic flows are rate limited to 'rate' pps */ + __u32 penalty_burst; +}; + +struct tc_sfb_xstats { + __u32 earlydrop; + __u32 penaltydrop; + __u32 bucketdrop; + __u32 queuedrop; + __u32 childdrop; /* drops in child qdisc */ + __u32 marked; + __u32 maxqlen; + __u32 maxprob; + __u32 avgprob; +}; + +#define SFB_MAX_PROB 0xFFFF + +/* QFQ */ +enum { + TCA_QFQ_UNSPEC, + TCA_QFQ_WEIGHT, + TCA_QFQ_LMAX, + __TCA_QFQ_MAX +}; + +#define TCA_QFQ_MAX (__TCA_QFQ_MAX - 1) + +struct tc_qfq_stats { + __u32 weight; + __u32 lmax; +}; + +/* CODEL */ + +enum { + TCA_CODEL_UNSPEC, + TCA_CODEL_TARGET, + TCA_CODEL_LIMIT, + TCA_CODEL_INTERVAL, + TCA_CODEL_ECN, + TCA_CODEL_CE_THRESHOLD, + __TCA_CODEL_MAX +}; + +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) + +struct tc_codel_xstats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 count; /* how many drops we've done since the last time we + * entered dropping state + */ + __u32 lastcount; /* count at entry to dropping state */ + __u32 ldelay; /* in-queue delay seen by most recently dequeued packet */ + __s32 drop_next; /* time to drop next packet */ + __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ + __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ + __u32 dropping; /* are we in dropping state ? */ + __u32 ce_mark; /* number of CE marked packets because of ce_threshold */ +}; + +/* FQ_CODEL */ + +enum { + TCA_FQ_CODEL_UNSPEC, + TCA_FQ_CODEL_TARGET, + TCA_FQ_CODEL_LIMIT, + TCA_FQ_CODEL_INTERVAL, + TCA_FQ_CODEL_ECN, + TCA_FQ_CODEL_FLOWS, + TCA_FQ_CODEL_QUANTUM, + TCA_FQ_CODEL_CE_THRESHOLD, + TCA_FQ_CODEL_DROP_BATCH_SIZE, + TCA_FQ_CODEL_MEMORY_LIMIT, + __TCA_FQ_CODEL_MAX +}; + +#define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1) + +enum { + TCA_FQ_CODEL_XSTATS_QDISC, + TCA_FQ_CODEL_XSTATS_CLASS, +}; + +struct tc_fq_codel_qd_stats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 drop_overlimit; /* number of time max qdisc + * packet limit was hit + */ + __u32 ecn_mark; /* number of packets we ECN marked + * instead of being dropped + */ + __u32 new_flow_count; /* number of time packets + * created a 'new flow' + */ + __u32 new_flows_len; /* count of flows in new list */ + __u32 old_flows_len; /* count of flows in old list */ + __u32 ce_mark; /* packets above ce_threshold */ + __u32 memory_usage; /* in bytes */ + __u32 drop_overmemory; +}; + +struct tc_fq_codel_cl_stats { + __s32 deficit; + __u32 ldelay; /* in-queue delay seen by most recently + * dequeued packet + */ + __u32 count; + __u32 lastcount; + __u32 dropping; + __s32 drop_next; +}; + +struct tc_fq_codel_xstats { + __u32 type; + union { + struct tc_fq_codel_qd_stats qdisc_stats; + struct tc_fq_codel_cl_stats class_stats; + }; +}; + +/* FQ */ + +enum { + TCA_FQ_UNSPEC, + + TCA_FQ_PLIMIT, /* limit of total number of packets in queue */ + + TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */ + + TCA_FQ_QUANTUM, /* RR quantum */ + + TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */ + + TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */ + + TCA_FQ_FLOW_DEFAULT_RATE,/* obsolete, do not use */ + + TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */ + + TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */ + + TCA_FQ_FLOW_REFILL_DELAY, /* flow credit refill delay in usec */ + + TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ + + TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + + TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */ + + __TCA_FQ_MAX +}; + +#define TCA_FQ_MAX (__TCA_FQ_MAX - 1) + +struct tc_fq_qd_stats { + __u64 gc_flows; + __u64 highprio_packets; + __u64 tcp_retrans; + __u64 throttled; + __u64 flows_plimit; + __u64 pkts_too_long; + __u64 allocation_errors; + __s64 time_next_delayed_flow; + __u32 flows; + __u32 inactive_flows; + __u32 throttled_flows; + __u32 unthrottle_latency_ns; + __u64 ce_mark; /* packets above ce_threshold */ +}; + +/* Heavy-Hitter Filter */ + +enum { + TCA_HHF_UNSPEC, + TCA_HHF_BACKLOG_LIMIT, + TCA_HHF_QUANTUM, + TCA_HHF_HH_FLOWS_LIMIT, + TCA_HHF_RESET_TIMEOUT, + TCA_HHF_ADMIT_BYTES, + TCA_HHF_EVICT_TIMEOUT, + TCA_HHF_NON_HH_WEIGHT, + __TCA_HHF_MAX +}; + +#define TCA_HHF_MAX (__TCA_HHF_MAX - 1) + +struct tc_hhf_xstats { + __u32 drop_overlimit; /* number of times max qdisc packet limit + * was hit + */ + __u32 hh_overlimit; /* number of times max heavy-hitters was hit */ + __u32 hh_tot_count; /* number of captured heavy-hitters so far */ + __u32 hh_cur_count; /* number of current heavy-hitters */ +}; + +/* PIE */ +enum { + TCA_PIE_UNSPEC, + TCA_PIE_TARGET, + TCA_PIE_LIMIT, + TCA_PIE_TUPDATE, + TCA_PIE_ALPHA, + TCA_PIE_BETA, + TCA_PIE_ECN, + TCA_PIE_BYTEMODE, + __TCA_PIE_MAX +}; +#define TCA_PIE_MAX (__TCA_PIE_MAX - 1) + +struct tc_pie_xstats { + __u64 prob; /* current probability */ + __u32 delay; /* current delay in ms */ + __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to pie_action */ + __u32 overlimit; /* dropped due to lack of space in queue */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ +}; + +/* CBS */ +struct tc_cbs_qopt { + __u8 offload; + __u8 _pad[3]; + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON _BITUL(0) +#define TC_ETF_OFFLOAD_ON _BITUL(1) +#define TC_ETF_SKIP_SOCK_CHECK _BITUL(2) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + + +/* CAKE */ +enum { + TCA_CAKE_UNSPEC, + TCA_CAKE_PAD, + TCA_CAKE_BASE_RATE64, + TCA_CAKE_DIFFSERV_MODE, + TCA_CAKE_ATM, + TCA_CAKE_FLOW_MODE, + TCA_CAKE_OVERHEAD, + TCA_CAKE_RTT, + TCA_CAKE_TARGET, + TCA_CAKE_AUTORATE, + TCA_CAKE_MEMORY, + TCA_CAKE_NAT, + TCA_CAKE_RAW, + TCA_CAKE_WASH, + TCA_CAKE_MPU, + TCA_CAKE_INGRESS, + TCA_CAKE_ACK_FILTER, + TCA_CAKE_SPLIT_GSO, + TCA_CAKE_FWMARK, + __TCA_CAKE_MAX +}; +#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) + +enum { + __TCA_CAKE_STATS_INVALID, + TCA_CAKE_STATS_PAD, + TCA_CAKE_STATS_CAPACITY_ESTIMATE64, + TCA_CAKE_STATS_MEMORY_LIMIT, + TCA_CAKE_STATS_MEMORY_USED, + TCA_CAKE_STATS_AVG_NETOFF, + TCA_CAKE_STATS_MIN_NETLEN, + TCA_CAKE_STATS_MAX_NETLEN, + TCA_CAKE_STATS_MIN_ADJLEN, + TCA_CAKE_STATS_MAX_ADJLEN, + TCA_CAKE_STATS_TIN_STATS, + TCA_CAKE_STATS_DEFICIT, + TCA_CAKE_STATS_COBALT_COUNT, + TCA_CAKE_STATS_DROPPING, + TCA_CAKE_STATS_DROP_NEXT_US, + TCA_CAKE_STATS_P_DROP, + TCA_CAKE_STATS_BLUE_TIMER_US, + __TCA_CAKE_STATS_MAX +}; +#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) + +enum { + __TCA_CAKE_TIN_STATS_INVALID, + TCA_CAKE_TIN_STATS_PAD, + TCA_CAKE_TIN_STATS_SENT_PACKETS, + TCA_CAKE_TIN_STATS_SENT_BYTES64, + TCA_CAKE_TIN_STATS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS, + TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64, + TCA_CAKE_TIN_STATS_BACKLOG_PACKETS, + TCA_CAKE_TIN_STATS_BACKLOG_BYTES, + TCA_CAKE_TIN_STATS_THRESHOLD_RATE64, + TCA_CAKE_TIN_STATS_TARGET_US, + TCA_CAKE_TIN_STATS_INTERVAL_US, + TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS, + TCA_CAKE_TIN_STATS_WAY_MISSES, + TCA_CAKE_TIN_STATS_WAY_COLLISIONS, + TCA_CAKE_TIN_STATS_PEAK_DELAY_US, + TCA_CAKE_TIN_STATS_AVG_DELAY_US, + TCA_CAKE_TIN_STATS_BASE_DELAY_US, + TCA_CAKE_TIN_STATS_SPARSE_FLOWS, + TCA_CAKE_TIN_STATS_BULK_FLOWS, + TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS, + TCA_CAKE_TIN_STATS_MAX_SKBLEN, + TCA_CAKE_TIN_STATS_FLOW_QUANTUM, + __TCA_CAKE_TIN_STATS_MAX +}; +#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1) +#define TC_CAKE_MAX_TINS (8) + +enum { + CAKE_FLOW_NONE = 0, + CAKE_FLOW_SRC_IP, + CAKE_FLOW_DST_IP, + CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */ + CAKE_FLOW_FLOWS, + CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */ + CAKE_FLOW_MAX, +}; + +enum { + CAKE_DIFFSERV_DIFFSERV3 = 0, + CAKE_DIFFSERV_DIFFSERV4, + CAKE_DIFFSERV_DIFFSERV8, + CAKE_DIFFSERV_BESTEFFORT, + CAKE_DIFFSERV_PRECEDENCE, + CAKE_DIFFSERV_MAX +}; + +enum { + CAKE_ACK_NONE = 0, + CAKE_ACK_FILTER, + CAKE_ACK_AGGRESSIVE, + CAKE_ACK_MAX +}; + +enum { + CAKE_ATM_NONE = 0, + CAKE_ATM_ATM, + CAKE_ATM_PTM, + CAKE_ATM_MAX +}; + + +/* TAPRIO */ +enum { + TC_TAPRIO_CMD_SET_GATES = 0x00, + TC_TAPRIO_CMD_SET_AND_HOLD = 0x01, + TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02, +}; + +enum { + TCA_TAPRIO_SCHED_ENTRY_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */ + TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */ + __TCA_TAPRIO_SCHED_ENTRY_MAX, +}; +#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1) + +/* The format for schedule entry list is: + * [TCA_TAPRIO_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_SCHED_ENTRY] + * [TCA_TAPRIO_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] + */ +enum { + TCA_TAPRIO_SCHED_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY, + __TCA_TAPRIO_SCHED_MAX, +}; + +#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) + +/* The format for the admin sched (dump only): + * [TCA_TAPRIO_SCHED_ADMIN_SCHED] + * [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL] + */ + +#define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST BIT(0) +#define TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD BIT(1) + +enum { + TCA_TAPRIO_ATTR_UNSPEC, + TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ + TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */ + TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ + TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ + TCA_TAPRIO_PAD, + TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ + TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ + TCA_TAPRIO_ATTR_FLAGS, /* u32 */ + TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */ + __TCA_TAPRIO_ATTR_MAX, +}; + +#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1) + +#endif diff --git a/src/libsystemd/sd-netlink/netlink-types.c b/src/libsystemd/sd-netlink/netlink-types.c index ac85b8b18c..cd6e5c87d5 100644 --- a/src/libsystemd/sd-netlink/netlink-types.c +++ b/src/libsystemd/sd-netlink/netlink-types.c @@ -18,9 +18,10 @@ #include #include #include -#include #include +#include #include +#include #include #include @@ -733,6 +734,18 @@ static const NLTypeSystem rtnl_nexthop_type_system = { .types = rtnl_nexthop_types, }; +static const NLType rtnl_qdisc_types[] = { + [TCA_KIND] = { .type = NETLINK_TYPE_STRING }, + [TCA_OPTIONS] = { .size = sizeof(struct tc_netem_qopt) }, + [TCA_INGRESS_BLOCK] = { .type = NETLINK_TYPE_U32 }, + [TCA_EGRESS_BLOCK] = { .type = NETLINK_TYPE_U32 }, +}; + +static const NLTypeSystem rtnl_qdisc_type_system = { + .count = ELEMENTSOF(rtnl_qdisc_types), + .types = rtnl_qdisc_types, +}; + static const NLType rtnl_types[] = { [NLMSG_DONE] = { .type = NETLINK_TYPE_NESTED, .type_system = &empty_type_system, .size = 0 }, [NLMSG_ERROR] = { .type = NETLINK_TYPE_NESTED, .type_system = &empty_type_system, .size = sizeof(struct nlmsgerr) }, @@ -758,6 +771,9 @@ static const NLType rtnl_types[] = { [RTM_NEWNEXTHOP] = { .type = NETLINK_TYPE_NESTED, .type_system = &rtnl_nexthop_type_system, .size = sizeof(struct nhmsg) }, [RTM_DELNEXTHOP] = { .type = NETLINK_TYPE_NESTED, .type_system = &rtnl_nexthop_type_system, .size = sizeof(struct nhmsg) }, [RTM_GETNEXTHOP] = { .type = NETLINK_TYPE_NESTED, .type_system = &rtnl_nexthop_type_system, .size = sizeof(struct nhmsg) }, + [RTM_NEWQDISC] = { .type = NETLINK_TYPE_NESTED, .type_system = &rtnl_qdisc_type_system, .size = sizeof(struct tcmsg) }, + [RTM_DELQDISC] = { .type = NETLINK_TYPE_NESTED, .type_system = &rtnl_qdisc_type_system, .size = sizeof(struct tcmsg) }, + [RTM_GETQDISC] = { .type = NETLINK_TYPE_NESTED, .type_system = &rtnl_qdisc_type_system, .size = sizeof(struct tcmsg) }, }; const NLTypeSystem rtnl_type_system_root = { diff --git a/src/libsystemd/sd-netlink/netlink-util.h b/src/libsystemd/sd-netlink/netlink-util.h index 923accb7e3..94410e520a 100644 --- a/src/libsystemd/sd-netlink/netlink-util.h +++ b/src/libsystemd/sd-netlink/netlink-util.h @@ -41,6 +41,10 @@ static inline bool rtnl_message_type_is_routing_policy_rule(uint16_t type) { return IN_SET(type, RTM_NEWRULE, RTM_DELRULE, RTM_GETRULE); } +static inline bool rtnl_message_type_is_qdisc(uint16_t type) { + return IN_SET(type, RTM_NEWQDISC, RTM_DELQDISC, RTM_GETQDISC); +} + int rtnl_set_link_name(sd_netlink **rtnl, int ifindex, const char *name); int rtnl_set_link_properties(sd_netlink **rtnl, int ifindex, const char *alias, const struct ether_addr *mac, uint32_t mtu); diff --git a/src/libsystemd/sd-netlink/rtnl-message.c b/src/libsystemd/sd-netlink/rtnl-message.c index 429b21b149..194676a6e5 100644 --- a/src/libsystemd/sd-netlink/rtnl-message.c +++ b/src/libsystemd/sd-netlink/rtnl-message.c @@ -1033,3 +1033,46 @@ int sd_rtnl_message_routing_policy_rule_get_rtm_src_prefixlen(const sd_netlink_m return 0; } + +int sd_rtnl_message_new_qdisc(sd_netlink *rtnl, sd_netlink_message **ret, uint16_t nlmsg_type, int tcm_family, int tcm_ifindex) { + struct tcmsg *tcm; + int r; + + assert_return(rtnl_message_type_is_qdisc(nlmsg_type), -EINVAL); + assert_return(ret, -EINVAL); + + r = message_new(rtnl, ret, nlmsg_type); + if (r < 0) + return r; + + if (nlmsg_type == RTM_NEWQDISC) + (*ret)->hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; + + tcm = NLMSG_DATA((*ret)->hdr); + tcm->tcm_family = tcm_family; + tcm->tcm_ifindex = tcm_ifindex; + + return 0; +} + +int sd_rtnl_message_set_qdisc_parent(sd_netlink_message *m, uint32_t parent) { + struct tcmsg *tcm; + + assert_return(rtnl_message_type_is_qdisc(m->hdr->nlmsg_type), -EINVAL); + + tcm = NLMSG_DATA(m->hdr); + tcm->tcm_parent = parent; + + return 0; +} + +int sd_rtnl_message_set_qdisc_handle(sd_netlink_message *m, uint32_t handle) { + struct tcmsg *tcm; + + assert_return(rtnl_message_type_is_qdisc(m->hdr->nlmsg_type), -EINVAL); + + tcm = NLMSG_DATA(m->hdr); + tcm->tcm_handle = handle; + + return 0; +} diff --git a/src/network/meson.build b/src/network/meson.build index fd21008d10..06b6ec64a2 100644 --- a/src/network/meson.build +++ b/src/network/meson.build @@ -105,6 +105,12 @@ sources = files(''' networkd-util.h networkd-wifi.c networkd-wifi.h + tc/netem.c + tc/netem.h + tc/qdisc.c + tc/qdisc.h + tc/tc-util.c + tc/tc-util.h '''.split()) systemd_networkd_sources = files('networkd.c') diff --git a/src/network/networkd-link.c b/src/network/networkd-link.c index 906267764e..ccac6004ba 100644 --- a/src/network/networkd-link.c +++ b/src/network/networkd-link.c @@ -43,6 +43,7 @@ #include "tmpfile-util.h" #include "udev-util.h" #include "util.h" +#include "tc/qdisc.h" #include "virt.h" uint32_t link_get_vrf_table(Link *link) { @@ -2581,6 +2582,8 @@ static int link_drop_config(Link *link) { } static int link_configure(Link *link) { + QDiscs *qdisc; + Iterator i; int r; assert(link); @@ -2590,6 +2593,9 @@ static int link_configure(Link *link) { if (link->iftype == ARPHRD_CAN) return link_configure_can(link); + ORDERED_HASHMAP_FOREACH(qdisc, link->network->qdiscs_by_section, i) + (void) qdisc_configure(link, qdisc); + /* Drop foreign config, but ignore loopback or critical devices. * We do not want to remove loopback address or addresses used for root NFS. */ if (!(link->flags & IFF_LOOPBACK) && diff --git a/src/network/networkd-link.h b/src/network/networkd-link.h index 1e4510b277..e6a9c41ca5 100644 --- a/src/network/networkd-link.h +++ b/src/network/networkd-link.h @@ -78,6 +78,7 @@ typedef struct Link { unsigned nexthop_messages; unsigned routing_policy_rule_messages; unsigned routing_policy_rule_remove_messages; + unsigned qdisc_messages; unsigned enslaving; Set *addresses; diff --git a/src/network/networkd-network-gperf.gperf b/src/network/networkd-network-gperf.gperf index 68199ac45f..1375626e86 100644 --- a/src/network/networkd-network-gperf.gperf +++ b/src/network/networkd-network-gperf.gperf @@ -13,6 +13,8 @@ _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") #include "networkd-ndisc.h" #include "networkd-network.h" #include "vlan-util.h" +#include "tc/qdisc.h" +#include "tc/netem.h" %} struct ConfigPerfItem; %null_strings @@ -241,6 +243,11 @@ CAN.BitRate, config_parse_si_size, CAN.SamplePoint, config_parse_permille, 0, offsetof(Network, can_sample_point) CAN.RestartSec, config_parse_sec, 0, offsetof(Network, can_restart_us) CAN.TripleSampling, config_parse_tristate, 0, offsetof(Network, can_triple_sampling) +TrafficControlQueueingDiscipline.Parent, config_parse_tc_qdiscs_parent, 0, 0 +TrafficControlQueueingDiscipline.NetworkEmulatorDelaySec, config_parse_tc_network_emulator_delay, 0, 0 +TrafficControlQueueingDiscipline.NetworkEmulatorDelayJitterSec, config_parse_tc_network_emulator_delay, 0, 0 +TrafficControlQueueingDiscipline.NetworkEmulatorLossRate, config_parse_tc_network_emulator_loss_rate, 0, 0 +TrafficControlQueueingDiscipline.NetworkEmulatorPacketLimit, config_parse_tc_network_emulator_packet_limit, 0, 0 /* backwards compatibility: do not add new entries to this section */ Network.IPv4LL, config_parse_ipv4ll, 0, offsetof(Network, link_local) DHCP.ClientIdentifier, config_parse_dhcp_client_identifier, 0, offsetof(Network, dhcp_client_identifier) diff --git a/src/network/networkd-network.c b/src/network/networkd-network.c index 8cdcb57306..0956d2a9b7 100644 --- a/src/network/networkd-network.c +++ b/src/network/networkd-network.c @@ -471,6 +471,7 @@ int network_load_one(Manager *manager, OrderedHashmap **networks, const char *fi "IPv6PrefixDelegation\0" "IPv6Prefix\0" "IPv6RoutePrefix\0" + "TrafficControlQueueingDiscipline\0" "CAN\0", config_item_perf_lookup, network_network_gperf_lookup, CONFIG_PARSE_WARN, network); @@ -663,6 +664,7 @@ static Network *network_free(Network *network) { hashmap_free(network->address_labels_by_section); hashmap_free(network->prefixes_by_section); hashmap_free(network->rules_by_section); + ordered_hashmap_free_with_destructor(network->qdiscs_by_section, qdisc_free); if (network->manager && network->manager->duids_requesting_uuid) diff --git a/src/network/networkd-network.h b/src/network/networkd-network.h index 8b4b5d042d..9820d6af29 100644 --- a/src/network/networkd-network.h +++ b/src/network/networkd-network.h @@ -28,6 +28,7 @@ #include "networkd-util.h" #include "ordered-set.h" #include "resolve-util.h" +#include "tc/qdisc.h" typedef enum IPv6PrivacyExtensions { /* The values map to the kernel's /proc/sys/net/ipv6/conf/xxx/use_tempaddr values */ @@ -265,6 +266,7 @@ struct Network { Hashmap *prefixes_by_section; Hashmap *route_prefixes_by_section; Hashmap *rules_by_section; + OrderedHashmap *qdiscs_by_section; /* All kinds of DNS configuration */ struct in_addr_data *dns; diff --git a/src/network/tc/netem.c b/src/network/tc/netem.c new file mode 100644 index 0000000000..e0e3e9a48e --- /dev/null +++ b/src/network/tc/netem.c @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: LGPL-2.1+ + * Copyright © 2019 VMware, Inc. */ + +#include +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "hashmap.h" +#include "in-addr-util.h" +#include "netem.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "parse-util.h" +#include "qdisc.h" +#include "string-util.h" +#include "tc-util.h" +#include "util.h" + +int network_emulator_new(NetworkEmulator **ret) { + NetworkEmulator *ne = NULL; + + ne = new(NetworkEmulator, 1); + if (!ne) + return -ENOMEM; + + *ne = (NetworkEmulator) { + .delay = USEC_INFINITY, + .jitter = USEC_INFINITY, + }; + + *ret = TAKE_PTR(ne); + + return 0; +} + +int network_emulator_fill_message(Link *link, QDiscs *qdisc, sd_netlink_message *req) { + struct tc_netem_qopt opt = { + .limit = 1000, + }; + int r; + + assert(link); + assert(qdisc); + assert(req); + + if (qdisc->ne.limit > 0) + opt.limit = qdisc->ne.limit; + + if (qdisc->ne.loss > 0) + opt.loss = qdisc->ne.loss; + + if (qdisc->ne.delay != USEC_INFINITY) { + r = tc_time_to_tick(qdisc->ne.delay, &opt.latency); + if (r < 0) + return log_link_error_errno(link, r, "Failed to calculate latency in TCA_OPTION: %m"); + } + + if (qdisc->ne.jitter != USEC_INFINITY) { + r = tc_time_to_tick(qdisc->ne.jitter, &opt.jitter); + if (r < 0) + return log_link_error_errno(link, r, "Failed to calculate jitter in TCA_OPTION: %m"); + } + + r = sd_netlink_message_append_data(req, TCA_OPTIONS, &opt, sizeof(struct tc_netem_qopt)); + if (r < 0) + return log_link_error_errno(link, r, "Could not append TCA_OPTION attribute: %m"); + + return 0; +} + +int config_parse_tc_network_emulator_delay( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDiscs *qdisc = NULL; + Network *network = data; + usec_t u; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + r = qdisc_new_static(network, filename, section_line, &qdisc); + if (r < 0) + return r; + + if (isempty(rvalue)) { + if (streq(lvalue, "NetworkEmulatorDelaySec")) + qdisc->ne.delay = USEC_INFINITY; + else if (streq(lvalue, "NetworkEmulatorDelayJitterSec")) + qdisc->ne.jitter = USEC_INFINITY; + + qdisc = NULL; + return 0; + } + + r = parse_sec(rvalue, &u); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to parse '%s=', ignoring assignment: %s", + lvalue, rvalue); + return 0; + } + + if (streq(lvalue, "NetworkEmulatorDelaySec")) + qdisc->ne.delay = u; + else if (streq(lvalue, "NetworkEmulatorDelayJitterSec")) + qdisc->ne.jitter = u; + + qdisc->has_network_emulator = true; + qdisc = NULL; + + return 0; +} + +int config_parse_tc_network_emulator_loss_rate( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDiscs *qdisc = NULL; + Network *network = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + r = qdisc_new_static(network, filename, section_line, &qdisc); + if (r < 0) + return r; + + if (isempty(rvalue)) { + qdisc->ne.loss = 0; + + qdisc = NULL; + return 0; + } + + r = parse_tc_percent(rvalue, &qdisc->ne.loss); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to parse 'NetworkEmularorLossRate=', ignoring assignment: %s", + rvalue); + return 0; + } + + qdisc = NULL; + return 0; +} + +int config_parse_tc_network_emulator_packet_limit( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDiscs *qdisc = NULL; + Network *network = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + r = qdisc_new_static(network, filename, section_line, &qdisc); + if (r < 0) + return r; + + if (isempty(rvalue)) { + qdisc->ne.limit = 0; + qdisc = NULL; + + return 0; + } + + r = safe_atou(rvalue, &qdisc->ne.limit); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to parse 'NetworkEmulatorPacketLimit=', ignoring assignment: %s", + rvalue); + return 0; + } + + qdisc = NULL; + return 0; +} diff --git a/src/network/tc/netem.h b/src/network/tc/netem.h new file mode 100644 index 0000000000..33dfdd8588 --- /dev/null +++ b/src/network/tc/netem.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1+ + * Copyright © 2019 VMware, Inc. */ +#pragma once + +#include "sd-netlink.h" + +#include "conf-parser.h" +#include "macro.h" +#include "../networkd-link.h" +#include "time-util.h" + +typedef struct NetworkEmulator NetworkEmulator; +typedef struct QDiscs QDiscs; + +struct NetworkEmulator { + usec_t delay; + usec_t jitter; + + uint32_t limit; + uint32_t loss; +}; + +int network_emulator_new(NetworkEmulator **ret); +int network_emulator_fill_message(Link *link, QDiscs *qdisc, sd_netlink_message *req); + +CONFIG_PARSER_PROTOTYPE(config_parse_tc_network_emulator_delay); +CONFIG_PARSER_PROTOTYPE(config_parse_tc_network_emulator_loss_rate); +CONFIG_PARSER_PROTOTYPE(config_parse_tc_network_emulator_packet_limit); diff --git a/src/network/tc/qdisc.c b/src/network/tc/qdisc.c new file mode 100644 index 0000000000..2a4724ea9f --- /dev/null +++ b/src/network/tc/qdisc.c @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: LGPL-2.1+ + * Copyright © 2019 VMware, Inc. */ + +#include + +#include "alloc-util.h" +#include "conf-parser.h" +#include "in-addr-util.h" +#include "netlink-util.h" +#include "networkd-manager.h" +#include "parse-util.h" +#include "qdisc.h" +#include "set.h" +#include "string-util.h" +#include "util.h" + +static int qdisc_new(QDiscs **ret) { + QDiscs *qdisc; + + qdisc = new(QDiscs, 1); + if (!qdisc) + return -ENOMEM; + + *qdisc = (QDiscs) { + .family = AF_UNSPEC, + .parent = TC_H_ROOT, + }; + + *ret = TAKE_PTR(qdisc); + + return 0; +} + +int qdisc_new_static(Network *network, const char *filename, unsigned section_line, QDiscs **ret) { + _cleanup_(network_config_section_freep) NetworkConfigSection *n = NULL; + _cleanup_(qdisc_freep) QDiscs *qdisc = NULL; + int r; + + assert(network); + assert(ret); + assert(!!filename == (section_line > 0)); + + if (filename) { + r = network_config_section_new(filename, section_line, &n); + if (r < 0) + return r; + + qdisc = ordered_hashmap_get(network->qdiscs_by_section, n); + if (qdisc) { + *ret = TAKE_PTR(qdisc); + + return 0; + } + } + + r = qdisc_new(&qdisc); + if (r < 0) + return r; + + qdisc->network = network; + + if (filename) { + qdisc->section = TAKE_PTR(n); + + r = ordered_hashmap_ensure_allocated(&network->qdiscs_by_section, &network_config_hash_ops); + if (r < 0) + return r; + + r = ordered_hashmap_put(network->qdiscs_by_section, qdisc->section, qdisc); + if (r < 0) + return r; + } + + *ret = TAKE_PTR(qdisc); + + return 0; +} + +void qdisc_free(QDiscs *qdisc) { + if (!qdisc) + return; + + if (qdisc->network && qdisc->section) + ordered_hashmap_remove(qdisc->network->qdiscs_by_section, qdisc->section); + + network_config_section_free(qdisc->section); + + free(qdisc); +} + +static int qdisc_handler(sd_netlink *rtnl, sd_netlink_message *m, Link *link) { + int r; + + assert(link); + assert(link->qdisc_messages > 0); + link->qdisc_messages--; + + if (IN_SET(link->state, LINK_STATE_FAILED, LINK_STATE_LINGER)) + return 1; + + r = sd_netlink_message_get_errno(m); + if (r < 0 && r != -EEXIST) { + log_link_error_errno(link, r, "Could not set QDisc: %m"); + return 1; + } + + return 1; +} + +int qdisc_configure(Link *link, QDiscs *qdisc) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL; + int r; + + assert(link); + assert(link->manager); + assert(link->manager->rtnl); + assert(link->ifindex > 0); + + r = sd_rtnl_message_new_qdisc(link->manager->rtnl, &req, RTM_NEWQDISC, qdisc->family, link->ifindex); + if (r < 0) + return log_link_error_errno(link, r, "Could not create RTM_NEWQDISC message: %m"); + + r = sd_rtnl_message_set_qdisc_parent(req, qdisc->parent); + if (r < 0) + return log_link_error_errno(link, r, "Could not create tcm_parent message: %m"); + + if (qdisc->parent == TC_H_CLSACT) { + r = sd_rtnl_message_set_qdisc_handle(req, TC_H_MAKE(TC_H_CLSACT, 0)); + if (r < 0) + return log_link_error_errno(link, r, "Could not set tcm_handle message: %m"); + + r = sd_netlink_message_append_string(req, TCA_KIND, "clsact"); + if (r < 0) + return log_link_error_errno(link, r, "Could not append TCA_KIND attribute: %m"); + } + + if (qdisc->has_network_emulator) { + r = sd_netlink_message_append_string(req, TCA_KIND, "netem"); + if (r < 0) + return log_link_error_errno(link, r, "Could not append TCA_KIND attribute: %m"); + + r = network_emulator_fill_message(link, qdisc, req); + if (r < 0) + return r; + } + + r = netlink_call_async(link->manager->rtnl, NULL, req, qdisc_handler, link_netlink_destroy_callback, link); + if (r < 0) + return log_link_error_errno(link, r, "Could not send rtnetlink message: %m"); + + link_ref(link); + link->qdisc_messages++; + + return 0; +} + +int config_parse_tc_qdiscs_parent( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + _cleanup_(qdisc_free_or_set_invalidp) QDiscs *qdisc = NULL; + Network *network = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + r = qdisc_new_static(network, filename, section_line, &qdisc); + if (r < 0) + return r; + + if (streq(rvalue, "root")) + qdisc->parent = TC_H_ROOT; + else if (streq(rvalue, "clsact")) + qdisc->parent = TC_H_CLSACT; + else { + log_syntax(unit, LOG_ERR, filename, line, r, + "Failed to parse [QueueDiscs] 'Parent=', ignoring assignment: %s", + rvalue); + return 0; + } + + qdisc = NULL; + + return 0; +} diff --git a/src/network/tc/qdisc.h b/src/network/tc/qdisc.h new file mode 100644 index 0000000000..80b893b837 --- /dev/null +++ b/src/network/tc/qdisc.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1+ + * Copyright © 2019 VMware, Inc. */ +#pragma once + +#include "conf-parser.h" +#include "macro.h" +#include "netem.h" +#include "../networkd-util.h" + +typedef struct QDiscs QDiscs; + +struct QDiscs { + NetworkConfigSection *section; + Network *network; + + Link *link; + + int family; + + uint32_t handle; + uint32_t parent; + + bool has_network_emulator:1; + + NetworkEmulator ne; +}; + +void qdisc_free(QDiscs *qdisc); +int qdisc_new_static(Network *network, const char *filename, unsigned section_line, QDiscs **ret); + +int qdisc_configure(Link *link, QDiscs *qdisc); + +DEFINE_NETWORK_SECTION_FUNCTIONS(QDiscs, qdisc_free); + +CONFIG_PARSER_PROTOTYPE(config_parse_tc_qdiscs_parent); diff --git a/src/network/tc/tc-util.c b/src/network/tc/tc-util.c new file mode 100644 index 0000000000..7e1cf53e11 --- /dev/null +++ b/src/network/tc/tc-util.c @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: LGPL-2.1+ + * Copyright © 2019 VMware, Inc. */ + +#include "alloc-util.h" +#include "fileio.h" +#include "parse-util.h" +#include "tc-util.h" +#include "time-util.h" + +static int tc_init(double *ticks_in_usec) { + uint32_t clock_resolution, ticks_to_usec, usec_to_ticks; + _cleanup_free_ char *line = NULL; + double clock_factor; + int r; + + r = read_one_line_file("/proc/net/psched", &line); + if (r < 0) + return r; + + r = sscanf(line, "%08x%08x%08x", &ticks_to_usec, &usec_to_ticks, &clock_resolution); + if (r < 3) + return -EIO; + + clock_factor = (double) clock_resolution / USEC_PER_SEC; + *ticks_in_usec = (double) ticks_to_usec / usec_to_ticks * clock_factor; + + return 0; +} + +int tc_time_to_tick(usec_t t, uint32_t *ret) { + static double ticks_in_usec = -1; + usec_t a; + int r; + + assert(ret); + + if (ticks_in_usec < 0) { + r = tc_init(&ticks_in_usec); + if (r < 0) + return r; + } + + a = t * ticks_in_usec; + if (a > UINT32_MAX) + return -ERANGE; + + *ret = a; + return 0; +} + +int parse_tc_percent(const char *s, uint32_t *percent) { + int r; + + assert(s); + assert(percent); + + r = parse_permille(s); + if (r < 0) + return r; + + *percent = (double) r / 1000 * UINT32_MAX; + return 0; +} diff --git a/src/network/tc/tc-util.h b/src/network/tc/tc-util.h new file mode 100644 index 0000000000..ce7ab40538 --- /dev/null +++ b/src/network/tc/tc-util.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: LGPL-2.1+ + * Copyright © 2019 VMware, Inc. */ +#pragma once + +#include "time-util.h" + +int tc_time_to_tick(usec_t t, uint32_t *ret); +int parse_tc_percent(const char *s, uint32_t *percent); diff --git a/src/systemd/sd-netlink.h b/src/systemd/sd-netlink.h index 099d76d3c7..b34befebc8 100644 --- a/src/systemd/sd-netlink.h +++ b/src/systemd/sd-netlink.h @@ -202,6 +202,10 @@ int sd_rtnl_message_routing_policy_rule_get_rtm_type(const sd_netlink_message *m int sd_rtnl_message_routing_policy_rule_set_flags(sd_netlink_message *m, unsigned flags); int sd_rtnl_message_routing_policy_rule_get_flags(const sd_netlink_message *m, unsigned *flags); +int sd_rtnl_message_new_qdisc(sd_netlink *rtnl, sd_netlink_message **ret, uint16_t nlmsg_type, int tcm_family, int tcm_ifindex); +int sd_rtnl_message_set_qdisc_parent(sd_netlink_message *m, uint32_t parent); +int sd_rtnl_message_set_qdisc_handle(sd_netlink_message *m, uint32_t handle); + /* genl */ int sd_genl_socket_open(sd_netlink **nl); int sd_genl_message_new(sd_netlink *nl, sd_genl_family family, uint8_t cmd, sd_netlink_message **m); diff --git a/test/fuzz/fuzz-network-parser/directives.network b/test/fuzz/fuzz-network-parser/directives.network index 7a25561336..0d0892fd3a 100644 --- a/test/fuzz/fuzz-network-parser/directives.network +++ b/test/fuzz/fuzz-network-parser/directives.network @@ -262,3 +262,9 @@ DNS= [NextHop] Id= Gateway= +[TrafficControlQueueingDiscipline] +Parent= +NetworkEmulatorDelaySec= +NetworkEmulatorDelayJitterSec= +NetworkEmulatorLossRate= +NetworkEmulatorPacketLimit=