1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __BPF_TCP_HELPERS_H
3#define __BPF_TCP_HELPERS_H
4
5#include <stdbool.h>
6#include <linux/types.h>
7#include <bpf/bpf_helpers.h>
8#include <bpf/bpf_core_read.h>
9#include <bpf/bpf_tracing.h>
10
/* Declare a struct_ops BPF program: place it in the "struct_ops/<name>"
 * ELF section and expand through BPF_PROG so the handler gets typed args. */
#define BPF_STRUCT_OPS(name, args...) \
SEC("struct_ops/"#name) \
BPF_PROG(name, args)

/* Socket-option level for TCP; guarded because UAPI headers may define it. */
#ifndef SOL_TCP
#define SOL_TCP 6
#endif

/* Max congestion-control algorithm name length (matches the kernel value). */
#ifndef TCP_CA_NAME_MAX
#define TCP_CA_NAME_MAX 16
#endif

/* BPF stand-in for the kernel's 32-bit jiffies TCP clock. */
#define tcp_jiffies32 ((__u32)bpf_jiffies64())
24
/* Minimal CO-RE mirror of the kernel's struct sock_common: only the fields
 * these helpers read are declared; preserve_access_index relocates each
 * access against the running kernel's real layout. */
struct sock_common {
	unsigned char skc_state;	/* TCP state machine state */
	__u16 skc_num;			/* local port (kernel skc_num) */
} __attribute__((preserve_access_index));
29
/* Mirror of the kernel's enum sk_pacing; values stored in sk_pacing_status. */
enum sk_pacing {
	SK_PACING_NONE = 0,
	SK_PACING_NEEDED = 1,
	SK_PACING_FQ = 2,
};
35
/* CO-RE mirror of struct sock. sock_common is the first member, matching
 * the kernel layout so the inet_csk()/tcp_sk() downcasts below are valid. */
struct sock {
	struct sock_common __sk_common;
/* Kernel-style accessor alias: sk->sk_state reads __sk_common.skc_state. */
#define sk_state __sk_common.skc_state
	unsigned long sk_pacing_rate;	/* pacing rate in bytes/sec */
	__u32 sk_pacing_status; /* see enum sk_pacing */
} __attribute__((preserve_access_index));
42
/* CO-RE mirror of struct inet_sock; struct sock first, as in the kernel. */
struct inet_sock {
	struct sock sk;
} __attribute__((preserve_access_index));
46
/* CO-RE mirror of struct inet_connection_sock (connection-level state). */
struct inet_connection_sock {
	struct inet_sock icsk_inet;
	__u8 icsk_ca_state:6,		/* congestion-avoidance state */
	     icsk_ca_setsockopt:1,
	     icsk_ca_dst_locked:1;
	struct {
		__u8 pending;		/* ICSK_ACK_* flags (enum below) */
	} icsk_ack;
	/* Private per-socket CA scratch area; 104 bytes matches the kernel's
	 * ICSK_CA_PRIV_SIZE, handed out via inet_csk_ca(). */
	__u64 icsk_ca_priv[104 / sizeof(__u64)];
} __attribute__((preserve_access_index));
57
/* CO-RE mirror of struct request_sock (connection-request minisock). */
struct request_sock {
	struct sock_common __req_common;
} __attribute__((preserve_access_index));
61
/* CO-RE mirror of the struct tcp_sock fields used by the BPF congestion
 * control algorithms in these selftests. Field meanings follow the kernel's
 * own struct tcp_sock documentation. */
struct tcp_sock {
	struct inet_connection_sock inet_conn;

	__u32 rcv_nxt;		/* next sequence number expected */
	__u32 snd_nxt;		/* next sequence number to send */
	__u32 snd_una;		/* oldest unacknowledged sequence number */
	__u32 window_clamp;
	__u8 ecn_flags;		/* TCP_ECN_* bits, defined below */
	__u32 delivered;	/* total packets delivered */
	__u32 delivered_ce;	/* of which arrived CE-marked */
	__u32 snd_cwnd;		/* congestion window, in segments */
	__u32 snd_cwnd_cnt;	/* additive-increase counter (see tcp_cong_avoid_ai) */
	__u32 snd_cwnd_clamp;	/* upper bound on snd_cwnd */
	__u32 snd_ssthresh;	/* slow-start threshold */
	__u8 syn_data:1,	/* SYN includes data */
	     syn_fastopen:1,	/* SYN includes Fast Open option */
	     syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
	     syn_fastopen_ch:1, /* Active TFO re-enabling probe */
	     syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
	     save_syn:1,	/* Save headers of SYN packet */
	     is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
	     syn_smc:1;	/* SYN includes SMC */
	__u32 max_packets_out;
	__u32 lsndtime;
	__u32 prior_cwnd;
	__u64 tcp_mstamp;	/* most recent packet received/sent */
	bool is_mptcp;
} __attribute__((preserve_access_index));
90
91static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
92{
93 return (struct inet_connection_sock *)sk;
94}
95
96static __always_inline void *inet_csk_ca(const struct sock *sk)
97{
98 return (void *)inet_csk(sk)->icsk_ca_priv;
99}
100
101static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
102{
103 return (struct tcp_sock *)sk;
104}
105
/* Serial-number comparison of TCP sequence numbers: true when seq1 precedes
 * seq2, correct across 32-bit wraparound (the signed difference trick). */
static __always_inline bool before(__u32 seq1, __u32 seq2)
{
	__s32 delta = (__s32)(seq1 - seq2);

	return delta < 0;
}
/* after(a, b): true when sequence a comes later than b. */
#define after(seq2, seq1) 	before(seq1, seq2)
111
/* Bits stored in tcp_sock.ecn_flags (mirror of the kernel's TCP_ECN_*). */
#define TCP_ECN_OK 1
#define TCP_ECN_QUEUE_CWR 2
#define TCP_ECN_DEMAND_CWR 4
#define TCP_ECN_SEEN 8
116
/* Flag bits for icsk_ack.pending (mirror of the kernel enum). */
enum inet_csk_ack_state_t {
	ICSK_ACK_SCHED = 1,
	ICSK_ACK_TIMER = 2,
	ICSK_ACK_PUSHED = 4,
	ICSK_ACK_PUSHED2 = 8,
	ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */
};
124
/* Events delivered to a congestion-control algorithm's cwnd_event() hook. */
enum tcp_ca_event {
	CA_EVENT_TX_START = 0,
	CA_EVENT_CWND_RESTART = 1,
	CA_EVENT_COMPLETE_CWR = 2,
	CA_EVENT_LOSS = 3,
	CA_EVENT_ECN_NO_CE = 4,
	CA_EVENT_ECN_IS_CE = 5,
};
133
/* Per-ACK accounting sample handed to the pkts_acked() hook. */
struct ack_sample {
	__u32 pkts_acked;	/* packets newly acked by this ACK */
	__s32 rtt_us;		/* measured RTT in usec, or negative if none */
	__u32 in_flight;	/* packets in flight before this ACK */
} __attribute__((preserve_access_index));
139
/* Delivery-rate sample handed to the cong_control() hook (CO-RE mirror of
 * the kernel's struct rate_sample). */
struct rate_sample {
	__u64  prior_mstamp; /* starting timestamp for interval */
	__u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
	__s32  delivered;		/* number of packets delivered over interval */
	long interval_us;	/* time for tp->delivered to incr "delivered" */
	__u32 snd_interval_us;	/* snd interval for delivered packets */
	__u32 rcv_interval_us;	/* rcv interval for delivered packets */
	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
	int  losses;		/* number of packets marked lost upon ACK */
	__u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
	__u32  prior_in_flight;	/* in flight before this ACK */
	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
	bool is_retrans;	/* is sample from retransmission? */
	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
} __attribute__((preserve_access_index));
155
/* NOTE(review): TCP_CA_NAME_MAX is already (guard-)defined above; this
 * re-#define is legal only because the replacement list is identical. */
#define TCP_CA_NAME_MAX 16
#define TCP_CONG_NEEDS_ECN 0x2	/* tcp_congestion_ops.flags bit */
158
/* Mirror of the kernel's struct tcp_congestion_ops; BPF programs register
 * an instance of this via struct_ops to implement a congestion control. */
struct tcp_congestion_ops {
	char name[TCP_CA_NAME_MAX];
	__u32 flags;		/* e.g. TCP_CONG_NEEDS_ECN */

	/* initialize private data (optional) */
	void (*init)(struct sock *sk);
	/* cleanup private data  (optional) */
	void (*release)(struct sock *sk);

	/* return slow start threshold (required) */
	__u32 (*ssthresh)(struct sock *sk);
	/* do new cwnd calculation (required) */
	void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
	/* call before changing ca_state (optional) */
	void (*set_state)(struct sock *sk, __u8 new_state);
	/* call when cwnd event occurs (optional) */
	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
	/* call when ack arrives (optional) */
	void (*in_ack_event)(struct sock *sk, __u32 flags);
	/* new value of cwnd after loss (required) */
	__u32  (*undo_cwnd)(struct sock *sk);
	/* hook for packet ack accounting (optional) */
	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
	/* override sysctl_tcp_min_tso_segs */
	__u32 (*min_tso_segs)(struct sock *sk);
	/* returns the multiplier used in tcp_sndbuf_expand (optional) */
	__u32 (*sndbuf_expand)(struct sock *sk);
	/* call when packets are delivered to update cwnd and pacing rate,
	 * after all the ca_state processing. (optional)
	 */
	void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
	void *owner;	/* kernel-side module owner; not set from BPF */
};
192
/*
 * Single-evaluation min()/max().  The naive ternary forms evaluate the
 * winning argument twice, which is wrong for arguments with side effects
 * (e.g. min(i++, n)).  Statement expressions are already relied on below
 * (min_not_zero), so use them here too; __typeof__ is used instead of
 * typeof so the macros also work under strict -std=c11.
 */
#define min(a, b) ({			\
	__typeof__(a) _a = (a);		\
	__typeof__(b) _b = (b);		\
	_a < _b ? _a : _b; })
#define max(a, b) ({			\
	__typeof__(a) _a = (a);		\
	__typeof__(b) _b = (b);		\
	_a > _b ? _a : _b; })
/* Smaller of x and y, except a zero argument yields the other one. */
#define min_not_zero(x, y) ({ \
	__typeof__(x) __x = (x); \
	__typeof__(y) __y = (y); \
	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
199
200static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp)
201{
202 return tp->snd_cwnd < tp->snd_ssthresh;
203}
204
/* True when the congestion window, not the application, limits sending.
 * Bitfields cannot be read directly through a CO-RE pointer, hence
 * BPF_CORE_READ_BITFIELD for is_cwnd_limited. */
static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
	if (tcp_in_slow_start(tp))
		return tp->snd_cwnd < 2 * tp->max_packets_out;

	return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited);
}
215
216static __always_inline bool tcp_cc_eq(const char *a, const char *b)
217{
218 int i;
219
220 for (i = 0; i < TCP_CA_NAME_MAX; i++) {
221 if (a[i] != b[i])
222 return false;
223 if (!a[i])
224 break;
225 }
226
227 return true;
228}
229
/* Kernel kfuncs (__ksym): call the kernel's own slow-start and
 * additive-increase implementations directly from BPF. */
extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym;
extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
232
/* CO-RE mirror of the MPTCP socket fields used by these selftests. */
struct mptcp_sock {
	struct inet_connection_sock sk;

	__u32 token;		/* connection token of this MPTCP socket */
	struct sock *first;	/* first subflow */
	char ca_name[TCP_CA_NAME_MAX];
} __attribute__((preserve_access_index));
240
241#endif
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __BPF_TCP_HELPERS_H
3#define __BPF_TCP_HELPERS_H
4
5#include <stdbool.h>
6#include <linux/types.h>
7#include <bpf/bpf_helpers.h>
8#include <bpf/bpf_core_read.h>
9#include <bpf/bpf_tracing.h>
10
/* Declare a struct_ops BPF program: place it in the "struct_ops/<name>"
 * ELF section and expand through BPF_PROG so the handler gets typed args. */
#define BPF_STRUCT_OPS(name, args...) \
SEC("struct_ops/"#name) \
BPF_PROG(name, args)

/* BPF stand-in for the kernel's 32-bit jiffies TCP clock. */
#define tcp_jiffies32 ((__u32)bpf_jiffies64())
16
/* Minimal CO-RE mirror of the kernel's struct sock_common; only skc_state
 * is read, and preserve_access_index relocates it to the real layout. */
struct sock_common {
	unsigned char skc_state;	/* TCP state machine state */
} __attribute__((preserve_access_index));
20
/* Mirror of the kernel's enum sk_pacing; values stored in sk_pacing_status. */
enum sk_pacing {
	SK_PACING_NONE = 0,
	SK_PACING_NEEDED = 1,
	SK_PACING_FQ = 2,
};
26
/* CO-RE mirror of struct sock; sock_common first, matching kernel layout. */
struct sock {
	struct sock_common __sk_common;
	unsigned long sk_pacing_rate;	/* pacing rate in bytes/sec */
	__u32 sk_pacing_status; /* see enum sk_pacing */
} __attribute__((preserve_access_index));
32
/* CO-RE mirror of struct inet_sock; struct sock first, as in the kernel. */
struct inet_sock {
	struct sock sk;
} __attribute__((preserve_access_index));
36
/* CO-RE mirror of struct inet_connection_sock (connection-level state). */
struct inet_connection_sock {
	struct inet_sock icsk_inet;
	__u8 icsk_ca_state:6,		/* congestion-avoidance state */
	     icsk_ca_setsockopt:1,
	     icsk_ca_dst_locked:1;
	struct {
		__u8 pending;		/* ICSK_ACK_* flags (enum below) */
	} icsk_ack;
	/* Private per-socket CA scratch area; 104 bytes matches the kernel's
	 * ICSK_CA_PRIV_SIZE, handed out via inet_csk_ca(). */
	__u64 icsk_ca_priv[104 / sizeof(__u64)];
} __attribute__((preserve_access_index));
47
/* CO-RE mirror of the struct tcp_sock fields used by the BPF congestion
 * control algorithms in these selftests. */
struct tcp_sock {
	struct inet_connection_sock inet_conn;

	__u32 rcv_nxt;		/* next sequence number expected */
	__u32 snd_nxt;		/* next sequence number to send */
	__u32 snd_una;		/* oldest unacknowledged sequence number */
	__u8 ecn_flags;		/* TCP_ECN_* bits, defined below */
	__u32 delivered;	/* total packets delivered */
	__u32 delivered_ce;	/* of which arrived CE-marked */
	__u32 snd_cwnd;		/* congestion window, in segments */
	__u32 snd_cwnd_cnt;	/* additive-increase counter (see tcp_cong_avoid_ai) */
	__u32 snd_cwnd_clamp;	/* upper bound on snd_cwnd */
	__u32 snd_ssthresh;	/* slow-start threshold */
	__u8 syn_data:1,	/* SYN includes data */
	     syn_fastopen:1,	/* SYN includes Fast Open option */
	     syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
	     syn_fastopen_ch:1, /* Active TFO re-enabling probe */
	     syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
	     save_syn:1,	/* Save headers of SYN packet */
	     is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
	     syn_smc:1;	/* SYN includes SMC */
	__u32 max_packets_out;
	__u32 lsndtime;
	__u32 prior_cwnd;
	__u64 tcp_mstamp;	/* most recent packet received/sent */
} __attribute__((preserve_access_index));
74
75static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
76{
77 return (struct inet_connection_sock *)sk;
78}
79
80static __always_inline void *inet_csk_ca(const struct sock *sk)
81{
82 return (void *)inet_csk(sk)->icsk_ca_priv;
83}
84
85static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
86{
87 return (struct tcp_sock *)sk;
88}
89
/* Serial-number comparison of TCP sequence numbers: true when seq1 precedes
 * seq2, correct across 32-bit wraparound (the signed difference trick). */
static __always_inline bool before(__u32 seq1, __u32 seq2)
{
	__s32 delta = (__s32)(seq1 - seq2);

	return delta < 0;
}
/* after(a, b): true when sequence a comes later than b. */
#define after(seq2, seq1) 	before(seq1, seq2)
95
/* Bits stored in tcp_sock.ecn_flags (mirror of the kernel's TCP_ECN_*). */
#define TCP_ECN_OK 1
#define TCP_ECN_QUEUE_CWR 2
#define TCP_ECN_DEMAND_CWR 4
#define TCP_ECN_SEEN 8
100
/* Flag bits for icsk_ack.pending (mirror of the kernel enum). */
enum inet_csk_ack_state_t {
	ICSK_ACK_SCHED = 1,
	ICSK_ACK_TIMER = 2,
	ICSK_ACK_PUSHED = 4,
	ICSK_ACK_PUSHED2 = 8,
	ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */
};
108
/* Events delivered to a congestion-control algorithm's cwnd_event() hook. */
enum tcp_ca_event {
	CA_EVENT_TX_START = 0,
	CA_EVENT_CWND_RESTART = 1,
	CA_EVENT_COMPLETE_CWR = 2,
	CA_EVENT_LOSS = 3,
	CA_EVENT_ECN_NO_CE = 4,
	CA_EVENT_ECN_IS_CE = 5,
};
117
/* Congestion-avoidance machine states, as stored in icsk_ca_state. */
enum tcp_ca_state {
	TCP_CA_Open = 0,
	TCP_CA_Disorder = 1,
	TCP_CA_CWR = 2,
	TCP_CA_Recovery = 3,
	TCP_CA_Loss = 4
};
125
/* Per-ACK accounting sample handed to the pkts_acked() hook. */
struct ack_sample {
	__u32 pkts_acked;	/* packets newly acked by this ACK */
	__s32 rtt_us;		/* measured RTT in usec, or negative if none */
	__u32 in_flight;	/* packets in flight before this ACK */
} __attribute__((preserve_access_index));
131
/* Delivery-rate sample handed to the cong_control() hook (CO-RE mirror of
 * the kernel's struct rate_sample). */
struct rate_sample {
	__u64  prior_mstamp; /* starting timestamp for interval */
	__u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
	__s32  delivered;		/* number of packets delivered over interval */
	long interval_us;	/* time for tp->delivered to incr "delivered" */
	__u32 snd_interval_us;	/* snd interval for delivered packets */
	__u32 rcv_interval_us;	/* rcv interval for delivered packets */
	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
	int  losses;		/* number of packets marked lost upon ACK */
	__u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
	__u32  prior_in_flight;	/* in flight before this ACK */
	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
	bool is_retrans;	/* is sample from retransmission? */
	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
} __attribute__((preserve_access_index));
147
/* Max congestion-control name length and tcp_congestion_ops flag bits. */
#define TCP_CA_NAME_MAX 16
#define TCP_CONG_NEEDS_ECN 0x2	/* tcp_congestion_ops.flags bit */
150
/* Mirror of the kernel's struct tcp_congestion_ops; BPF programs register
 * an instance of this via struct_ops to implement a congestion control. */
struct tcp_congestion_ops {
	char name[TCP_CA_NAME_MAX];
	__u32 flags;		/* e.g. TCP_CONG_NEEDS_ECN */

	/* initialize private data (optional) */
	void (*init)(struct sock *sk);
	/* cleanup private data  (optional) */
	void (*release)(struct sock *sk);

	/* return slow start threshold (required) */
	__u32 (*ssthresh)(struct sock *sk);
	/* do new cwnd calculation (required) */
	void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
	/* call before changing ca_state (optional) */
	void (*set_state)(struct sock *sk, __u8 new_state);
	/* call when cwnd event occurs (optional) */
	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
	/* call when ack arrives (optional) */
	void (*in_ack_event)(struct sock *sk, __u32 flags);
	/* new value of cwnd after loss (required) */
	__u32  (*undo_cwnd)(struct sock *sk);
	/* hook for packet ack accounting (optional) */
	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
	/* override sysctl_tcp_min_tso_segs */
	__u32 (*min_tso_segs)(struct sock *sk);
	/* returns the multiplier used in tcp_sndbuf_expand (optional) */
	__u32 (*sndbuf_expand)(struct sock *sk);
	/* call when packets are delivered to update cwnd and pacing rate,
	 * after all the ca_state processing. (optional)
	 */
	void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
};
183
/*
 * Single-evaluation min()/max().  The naive ternary forms evaluate the
 * winning argument twice, which is wrong for arguments with side effects
 * (e.g. min(i++, n)).  Statement expressions are already relied on below
 * (min_not_zero), so use them here too; __typeof__ is used instead of
 * typeof so the macros also work under strict -std=c11.
 */
#define min(a, b) ({			\
	__typeof__(a) _a = (a);		\
	__typeof__(b) _b = (b);		\
	_a < _b ? _a : _b; })
#define max(a, b) ({			\
	__typeof__(a) _a = (a);		\
	__typeof__(b) _b = (b);		\
	_a > _b ? _a : _b; })
/* Smaller of x and y, except a zero argument yields the other one. */
#define min_not_zero(x, y) ({ \
	__typeof__(x) __x = (x); \
	__typeof__(y) __y = (y); \
	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
190
191static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked)
192{
193 __u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
194
195 acked -= cwnd - tp->snd_cwnd;
196 tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
197
198 return acked;
199}
200
201static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp)
202{
203 return tp->snd_cwnd < tp->snd_ssthresh;
204}
205
/* True when the congestion window, not the application, limits sending.
 * Bitfields cannot be read directly through a CO-RE pointer, hence
 * BPF_CORE_READ_BITFIELD for is_cwnd_limited. */
static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
	if (tcp_in_slow_start(tp))
		return tp->snd_cwnd < 2 * tp->max_packets_out;

	return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited);
}
216
/* Additive-increase phase of congestion avoidance: grow snd_cwnd by one
 * segment for every w ACKed segments, mirroring the kernel's
 * tcp_cong_avoid_ai().  snd_cwnd_cnt accumulates ACK credits across calls. */
static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked)
{
	/* If credits accumulated at a higher w, apply them gently now. */
	if (tp->snd_cwnd_cnt >= w) {
		tp->snd_cwnd_cnt = 0;
		tp->snd_cwnd++;
	}

	tp->snd_cwnd_cnt += acked;
	if (tp->snd_cwnd_cnt >= w) {
		/* May grow by more than one when acked spans several windows. */
		__u32 delta = tp->snd_cwnd_cnt / w;

		tp->snd_cwnd_cnt -= delta * w;
		tp->snd_cwnd += delta;
	}
	/* Never exceed the per-socket clamp. */
	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
}
234
235#endif