Linux Audio

Check our new training course

In-person Linux kernel drivers training

Jun 16-20, 2025
Register
Loading...
v6.13.7
  1// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
  2/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
  3
  4#define BPF_NO_KFUNC_PROTOTYPES
  5#include "vmlinux.h"
  6
  7#include <bpf/bpf_helpers.h>
  8#include <bpf/bpf_endian.h>
  9#include <asm/errno.h>
 10
 11#include "bpf_compiler.h"
 12
 13#define TC_ACT_OK 0
 14#define TC_ACT_SHOT 2
 15
 16#define NSEC_PER_SEC 1000000000L
 17
 18#define ETH_ALEN 6
 19#define ETH_P_IP 0x0800
 20#define ETH_P_IPV6 0x86DD
 21
 22#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])
 23
 
 24#define IP_MF 0x2000
 25#define IP_OFFSET 0x1fff
 26
 27#define NEXTHDR_TCP 6
 28
 29#define TCPOPT_NOP 1
 30#define TCPOPT_EOL 0
 31#define TCPOPT_MSS 2
 32#define TCPOPT_WINDOW 3
 33#define TCPOPT_SACK_PERM 4
 34#define TCPOPT_TIMESTAMP 8
 35
 36#define TCPOLEN_MSS 4
 37#define TCPOLEN_WINDOW 3
 38#define TCPOLEN_SACK_PERM 2
 39#define TCPOLEN_TIMESTAMP 10
 40
 41#define TCP_TS_HZ 1000
 42#define TS_OPT_WSCALE_MASK 0xf
 43#define TS_OPT_SACK (1 << 4)
 44#define TS_OPT_ECN (1 << 5)
 45#define TSBITS 6
 46#define TSMASK (((__u32)1 << TSBITS) - 1)
 47#define TCP_MAX_WSCALE 14U
 48
 49#define IPV4_MAXLEN 60
 50#define TCP_MAXLEN 60
 51
 52#define DEFAULT_MSS4 1460
 53#define DEFAULT_MSS6 1440
 54#define DEFAULT_WSCALE 7
 55#define DEFAULT_TTL 64
 56#define MAX_ALLOWED_PORTS 8
 57
 58#define MAX_PACKET_OFF 0xffff
 59
 60#define swap(a, b) \
 61	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
 62
 63#define __get_unaligned_t(type, ptr) ({						\
 64	const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
 65	__pptr->x;								\
 66})
 67
 68#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
 69
 70struct {
 71	__uint(type, BPF_MAP_TYPE_ARRAY);
 72	__type(key, __u32);
 73	__type(value, __u64);
 74	__uint(max_entries, 2);
 75} values SEC(".maps");
 76
 77struct {
 78	__uint(type, BPF_MAP_TYPE_ARRAY);
 79	__type(key, __u32);
 80	__type(value, __u16);
 81	__uint(max_entries, MAX_ALLOWED_PORTS);
 82} allowed_ports SEC(".maps");
 83
 84/* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
 85 * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
 86 */
 87
 88struct bpf_ct_opts___local {
 89	s32 netns_id;
 90	s32 error;
 91	u8 l4proto;
 92	u8 dir;
 93	u8 reserved[2];
 94} __attribute__((preserve_access_index));
 95
 96#define BPF_F_CURRENT_NETNS (-1)
 97
 98extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
 99					 struct bpf_sock_tuple *bpf_tuple,
100					 __u32 len_tuple,
101					 struct bpf_ct_opts___local *opts,
102					 __u32 len_opts) __ksym;
103
104extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
105					 struct bpf_sock_tuple *bpf_tuple,
106					 u32 len_tuple,
107					 struct bpf_ct_opts___local *opts,
108					 u32 len_opts) __ksym;
109
110extern void bpf_ct_release(struct nf_conn *ct) __ksym;
111
112static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
113{
114	__u8 tmp[ETH_ALEN];
115
116	__builtin_memcpy(tmp, a, ETH_ALEN);
117	__builtin_memcpy(a, b, ETH_ALEN);
118	__builtin_memcpy(b, tmp, ETH_ALEN);
119}
120
/* Fold a 32-bit one's-complement sum into 16 bits and return its complement.
 * The fold is done twice because the first addition can itself carry into
 * bit 16.
 */
static __always_inline __u16 csum_fold(__u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (__u16)~csum;
}
127
128static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
129					       __u32 len, __u8 proto,
130					       __u32 csum)
131{
132	__u64 s = csum;
133
134	s += (__u32)saddr;
135	s += (__u32)daddr;
136#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
137	s += proto + len;
138#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
139	s += (proto + len) << 8;
140#else
141#error Unknown endian
142#endif
143	s = (s & 0xffffffff) + (s >> 32);
144	s = (s & 0xffffffff) + (s >> 32);
145
146	return csum_fold((__u32)s);
147}
148
149static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
150					     const struct in6_addr *daddr,
151					     __u32 len, __u8 proto, __u32 csum)
152{
153	__u64 sum = csum;
154	int i;
155
156	__pragma_loop_unroll
157	for (i = 0; i < 4; i++)
158		sum += (__u32)saddr->in6_u.u6_addr32[i];
159
160	__pragma_loop_unroll
161	for (i = 0; i < 4; i++)
162		sum += (__u32)daddr->in6_u.u6_addr32[i];
163
164	/* Don't combine additions to avoid 32-bit overflow. */
165	sum += bpf_htonl(len);
166	sum += bpf_htonl(proto);
167
168	sum = (sum & 0xffffffff) + (sum >> 32);
169	sum = (sum & 0xffffffff) + (sum >> 32);
170
171	return csum_fold((__u32)sum);
172}
173
174static __always_inline __u64 tcp_clock_ns(void)
175{
176	return bpf_ktime_get_ns();
177}
178
179static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
180{
181	return ns / (NSEC_PER_SEC / TCP_TS_HZ);
182}
183
184static __always_inline __u32 tcp_clock_ms(void)
185{
186	return tcp_ns_to_ts(tcp_clock_ns());
187}
188
189struct tcpopt_context {
190	void *data;
191	void *data_end;
192	__be32 *tsecr;
193	__u8 wscale;
194	bool option_timestamp;
195	bool option_sack;
196	__u32 off;
197};
198
199static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
200{
201	__u64 off = ctx->off;
202	__u8 *data;
203
204	/* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
205	if (off > MAX_PACKET_OFF - sz)
206		return NULL;
207
208	data = ctx->data + off;
209	barrier_var(data);
210	if (data + sz >= ctx->data_end)
211		return NULL;
212
213	ctx->off += sz;
214	return data;
215}
216
217static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
218{
219	__u8 *opcode, *opsize, *wscale, *tsecr;
220	__u32 off = ctx->off;
221
222	opcode = next(ctx, 1);
223	if (!opcode)
224		return 1;
225
226	if (*opcode == TCPOPT_EOL)
227		return 1;
228	if (*opcode == TCPOPT_NOP)
229		return 0;
230
231	opsize = next(ctx, 1);
232	if (!opsize || *opsize < 2)
233		return 1;
234
235	switch (*opcode) {
236	case TCPOPT_WINDOW:
237		wscale = next(ctx, 1);
238		if (!wscale)
239			return 1;
240		if (*opsize == TCPOLEN_WINDOW)
241			ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
242		break;
243	case TCPOPT_TIMESTAMP:
244		tsecr = next(ctx, 4);
245		if (!tsecr)
246			return 1;
247		if (*opsize == TCPOLEN_TIMESTAMP) {
248			ctx->option_timestamp = true;
249			/* Client's tsval becomes our tsecr. */
250			*ctx->tsecr = get_unaligned((__be32 *)tsecr);
251		}
252		break;
253	case TCPOPT_SACK_PERM:
254		if (*opsize == TCPOLEN_SACK_PERM)
255			ctx->option_sack = true;
256		break;
257	}
258
259	ctx->off = off + *opsize;
260
261	return 0;
262}
263
264static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
265{
266	int i;
267
268	for (i = 0; i < 7; i++)
269		if (tscookie_tcpopt_parse(context))
270			return 1;
271	return 0;
272}
273
274static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
275					  __u16 tcp_len, __be32 *tsval,
276					  __be32 *tsecr, void *data, void *data_end)
277{
278	struct tcpopt_context loop_ctx = {
279		.data = data,
280		.data_end = data_end,
281		.tsecr = tsecr,
282		.wscale = TS_OPT_WSCALE_MASK,
283		.option_timestamp = false,
284		.option_sack = false,
285		/* Note: currently verifier would track .off as unbound scalar.
286		 *       In case if verifier would at some point get smarter and
287		 *       compute bounded value for this var, beware that it might
288		 *       hinder bpf_loop() convergence validation.
289		 */
290		.off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
291	};
292	u32 cookie;
293
294	bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);
295
296	if (!loop_ctx.option_timestamp)
297		return false;
298
299	cookie = tcp_clock_ms() & ~TSMASK;
300	cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
301	if (loop_ctx.option_sack)
302		cookie |= TS_OPT_SACK;
303	if (tcp_header->ece && tcp_header->cwr)
304		cookie |= TS_OPT_ECN;
305	*tsval = bpf_htonl(cookie);
306
307	return true;
308}
309
310static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
311						 __u8 *ttl, bool ipv6)
312{
313	__u32 key = 0;
314	__u64 *value;
315
316	value = bpf_map_lookup_elem(&values, &key);
317	if (value && *value != 0) {
318		if (ipv6)
319			*mss = (*value >> 32) & 0xffff;
320		else
321			*mss = *value & 0xffff;
322		*wscale = (*value >> 16) & 0xf;
323		*ttl = (*value >> 24) & 0xff;
324		return;
325	}
326
327	*mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
328	*wscale = DEFAULT_WSCALE;
329	*ttl = DEFAULT_TTL;
330}
331
332static __always_inline void values_inc_synacks(void)
333{
334	__u32 key = 1;
335	__u64 *value;
336
337	value = bpf_map_lookup_elem(&values, &key);
338	if (value)
339		__sync_fetch_and_add(value, 1);
340}
341
342static __always_inline bool check_port_allowed(__u16 port)
343{
344	__u32 i;
345
346	for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
347		__u32 key = i;
348		__u16 *value;
349
350		value = bpf_map_lookup_elem(&allowed_ports, &key);
351
352		if (!value)
353			break;
354		/* 0 is a terminator value. Check it first to avoid matching on
355		 * a forbidden port == 0 and returning true.
356		 */
357		if (*value == 0)
358			break;
359
360		if (*value == port)
361			return true;
362	}
363
364	return false;
365}
366
367struct header_pointers {
368	struct ethhdr *eth;
369	struct iphdr *ipv4;
370	struct ipv6hdr *ipv6;
371	struct tcphdr *tcp;
372	__u16 tcp_len;
373};
374
375static __always_inline int tcp_dissect(void *data, void *data_end,
376				       struct header_pointers *hdr)
377{
378	hdr->eth = data;
379	if (hdr->eth + 1 > data_end)
380		return XDP_DROP;
381
382	switch (bpf_ntohs(hdr->eth->h_proto)) {
383	case ETH_P_IP:
384		hdr->ipv6 = NULL;
385
386		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
387		if (hdr->ipv4 + 1 > data_end)
388			return XDP_DROP;
389		if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
390			return XDP_DROP;
391		if (hdr->ipv4->version != 4)
392			return XDP_DROP;
393
394		if (hdr->ipv4->protocol != IPPROTO_TCP)
395			return XDP_PASS;
396
397		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
398		break;
399	case ETH_P_IPV6:
400		hdr->ipv4 = NULL;
401
402		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
403		if (hdr->ipv6 + 1 > data_end)
404			return XDP_DROP;
405		if (hdr->ipv6->version != 6)
406			return XDP_DROP;
407
408		/* XXX: Extension headers are not supported and could circumvent
409		 * XDP SYN flood protection.
410		 */
411		if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
412			return XDP_PASS;
413
414		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
415		break;
416	default:
417		/* XXX: VLANs will circumvent XDP SYN flood protection. */
418		return XDP_PASS;
419	}
420
421	if (hdr->tcp + 1 > data_end)
422		return XDP_DROP;
423	hdr->tcp_len = hdr->tcp->doff * 4;
424	if (hdr->tcp_len < sizeof(*hdr->tcp))
425		return XDP_DROP;
426
427	return XDP_TX;
428}
429
430static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
431{
432	struct bpf_ct_opts___local ct_lookup_opts = {
433		.netns_id = BPF_F_CURRENT_NETNS,
434		.l4proto = IPPROTO_TCP,
435	};
436	struct bpf_sock_tuple tup = {};
437	struct nf_conn *ct;
438	__u32 tup_size;
439
440	if (hdr->ipv4) {
441		/* TCP doesn't normally use fragments, and XDP can't reassemble
442		 * them.
443		 */
444		if ((hdr->ipv4->frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0)
445			return XDP_DROP;
446
447		tup.ipv4.saddr = hdr->ipv4->saddr;
448		tup.ipv4.daddr = hdr->ipv4->daddr;
449		tup.ipv4.sport = hdr->tcp->source;
450		tup.ipv4.dport = hdr->tcp->dest;
451		tup_size = sizeof(tup.ipv4);
452	} else if (hdr->ipv6) {
453		__builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
454		__builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
455		tup.ipv6.sport = hdr->tcp->source;
456		tup.ipv6.dport = hdr->tcp->dest;
457		tup_size = sizeof(tup.ipv6);
458	} else {
459		/* The verifier can't track that either ipv4 or ipv6 is not
460		 * NULL.
461		 */
462		return XDP_ABORTED;
463	}
464	if (xdp)
465		ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
466	else
467		ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
468	if (ct) {
469		unsigned long status = ct->status;
470
471		bpf_ct_release(ct);
472		if (status & IPS_CONFIRMED)
473			return XDP_PASS;
474	} else if (ct_lookup_opts.error != -ENOENT) {
475		return XDP_ABORTED;
476	}
477
478	/* error == -ENOENT || !(status & IPS_CONFIRMED) */
479	return XDP_TX;
480}
481
482static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
483					  __u8 wscale)
484{
485	__be32 *start = buf;
486
487	*buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
488
489	if (!tsopt)
490		return buf - start;
491
492	if (tsopt[0] & bpf_htonl(1 << 4))
493		*buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
494				   (TCPOLEN_SACK_PERM << 16) |
495				   (TCPOPT_TIMESTAMP << 8) |
496				   TCPOLEN_TIMESTAMP);
497	else
498		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
499				   (TCPOPT_NOP << 16) |
500				   (TCPOPT_TIMESTAMP << 8) |
501				   TCPOLEN_TIMESTAMP);
502	*buf++ = tsopt[0];
503	*buf++ = tsopt[1];
504
505	if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
506		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
507				   (TCPOPT_WINDOW << 16) |
508				   (TCPOLEN_WINDOW << 8) |
509				   wscale);
510
511	return buf - start;
512}
513
514static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
515					   __u32 cookie, __be32 *tsopt,
516					   __u16 mss, __u8 wscale)
517{
518	void *tcp_options;
519
520	tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
521	if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
522		tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
523	tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
524	swap(tcp_header->source, tcp_header->dest);
525	tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
526	tcp_header->seq = bpf_htonl(cookie);
527	tcp_header->window = 0;
528	tcp_header->urg_ptr = 0;
529	tcp_header->check = 0; /* Calculate checksum later. */
530
531	tcp_options = (void *)(tcp_header + 1);
532	tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
533}
534
535static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
536					     __u32 cookie, __be32 *tsopt)
537{
538	__u8 wscale;
539	__u16 mss;
540	__u8 ttl;
541
542	values_get_tcpipopts(&mss, &wscale, &ttl, false);
543
544	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
545
546	swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
547	hdr->ipv4->check = 0; /* Calculate checksum later. */
548	hdr->ipv4->tos = 0;
549	hdr->ipv4->id = 0;
550	hdr->ipv4->ttl = ttl;
551
552	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
553
554	hdr->tcp_len = hdr->tcp->doff * 4;
555	hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
556}
557
558static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
559					     __u32 cookie, __be32 *tsopt)
560{
561	__u8 wscale;
562	__u16 mss;
563	__u8 ttl;
564
565	values_get_tcpipopts(&mss, &wscale, &ttl, true);
566
567	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
568
569	swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
570	*(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
571	hdr->ipv6->hop_limit = ttl;
572
573	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
574
575	hdr->tcp_len = hdr->tcp->doff * 4;
576	hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
577}
578
579static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
580						void *ctx,
581						void *data, void *data_end,
582						bool xdp)
583{
584	__u32 old_pkt_size, new_pkt_size;
585	/* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
586	 * BPF verifier if tsopt is not volatile. Volatile forces it to store
587	 * the pointer value and use it directly, otherwise tcp_mkoptions is
588	 * (mis)compiled like this:
589	 *   if (!tsopt)
590	 *       return buf - start;
591	 *   reg = stored_return_value_of_tscookie_init;
592	 *   if (reg)
593	 *       tsopt = tsopt_buf;
594	 *   else
595	 *       tsopt = NULL;
596	 *   ...
597	 *   *buf++ = tsopt[1];
598	 * It creates a dead branch where tsopt is assigned NULL, but the
599	 * verifier can't prove it's dead and blocks the program.
600	 */
601	__be32 * volatile tsopt = NULL;
602	__be32 tsopt_buf[2] = {};
603	__u16 ip_len;
604	__u32 cookie;
605	__s64 value;
606
607	/* Checksum is not yet verified, but both checksum failure and TCP
608	 * header checks return XDP_DROP, so the order doesn't matter.
609	 */
610	if (hdr->tcp->fin || hdr->tcp->rst)
611		return XDP_DROP;
612
613	/* Issue SYN cookies on allowed ports, drop SYN packets on blocked
614	 * ports.
615	 */
616	if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
617		return XDP_DROP;
618
619	if (hdr->ipv4) {
620		/* Check the IPv4 and TCP checksums before creating a SYNACK. */
621		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
622		if (value < 0)
623			return XDP_ABORTED;
624		if (csum_fold(value) != 0)
625			return XDP_DROP; /* Bad IPv4 checksum. */
626
627		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
628		if (value < 0)
629			return XDP_ABORTED;
630		if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
631				      hdr->tcp_len, IPPROTO_TCP, value) != 0)
632			return XDP_DROP; /* Bad TCP checksum. */
633
634		ip_len = sizeof(*hdr->ipv4);
635
636		value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
637						       hdr->tcp_len);
638	} else if (hdr->ipv6) {
639		/* Check the TCP checksum before creating a SYNACK. */
640		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
641		if (value < 0)
642			return XDP_ABORTED;
643		if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
644				    hdr->tcp_len, IPPROTO_TCP, value) != 0)
645			return XDP_DROP; /* Bad TCP checksum. */
646
647		ip_len = sizeof(*hdr->ipv6);
648
649		value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
650						       hdr->tcp_len);
651	} else {
652		return XDP_ABORTED;
653	}
654
655	if (value < 0)
656		return XDP_ABORTED;
657	cookie = (__u32)value;
658
659	if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
660			  &tsopt_buf[0], &tsopt_buf[1], data, data_end))
661		tsopt = tsopt_buf;
662
663	/* Check that there is enough space for a SYNACK. It also covers
664	 * the check that the destination of the __builtin_memmove below
665	 * doesn't overflow.
666	 */
667	if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
668		return XDP_ABORTED;
669
670	if (hdr->ipv4) {
671		if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
672			struct tcphdr *new_tcp_header;
673
674			new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
675			__builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
676			hdr->tcp = new_tcp_header;
677
678			hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
679		}
680
681		tcpv4_gen_synack(hdr, cookie, tsopt);
682	} else if (hdr->ipv6) {
683		tcpv6_gen_synack(hdr, cookie, tsopt);
684	} else {
685		return XDP_ABORTED;
686	}
687
688	/* Recalculate checksums. */
689	hdr->tcp->check = 0;
690	value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
691	if (value < 0)
692		return XDP_ABORTED;
693	if (hdr->ipv4) {
694		hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
695						    hdr->ipv4->daddr,
696						    hdr->tcp_len,
697						    IPPROTO_TCP,
698						    value);
699
700		hdr->ipv4->check = 0;
701		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
702		if (value < 0)
703			return XDP_ABORTED;
704		hdr->ipv4->check = csum_fold(value);
705	} else if (hdr->ipv6) {
706		hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
707						  &hdr->ipv6->daddr,
708						  hdr->tcp_len,
709						  IPPROTO_TCP,
710						  value);
711	} else {
712		return XDP_ABORTED;
713	}
714
715	/* Set the new packet size. */
716	old_pkt_size = data_end - data;
717	new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
718	if (xdp) {
719		if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
720			return XDP_ABORTED;
721	} else {
722		if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
723			return XDP_ABORTED;
724	}
725
726	values_inc_synacks();
727
728	return XDP_TX;
729}
730
731static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
732{
733	int err;
734
735	if (hdr->tcp->rst)
736		return XDP_DROP;
737
738	if (hdr->ipv4)
739		err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
740	else if (hdr->ipv6)
741		err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
742	else
743		return XDP_ABORTED;
744	if (err)
745		return XDP_DROP;
746
747	return XDP_PASS;
748}
749
750static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
751					   struct header_pointers *hdr, bool xdp)
752{
753	int ret;
754
755	ret = tcp_dissect(data, data_end, hdr);
756	if (ret != XDP_TX)
757		return ret;
758
759	ret = tcp_lookup(ctx, hdr, xdp);
760	if (ret != XDP_TX)
761		return ret;
762
763	/* Packet is TCP and doesn't belong to an established connection. */
764
765	if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
766		return XDP_DROP;
767
768	/* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
769	 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
770	 */
771	if (xdp) {
772		if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
773			return XDP_ABORTED;
774	} else {
775		/* Without volatile the verifier throws this error:
776		 * R9 32-bit pointer arithmetic prohibited
777		 */
778		volatile u64 old_len = data_end - data;
779
780		if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
781			return XDP_ABORTED;
782	}
783
784	return XDP_TX;
785}
786
787static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
788					   struct header_pointers *hdr, bool xdp)
789{
790	if (hdr->ipv4) {
791		hdr->eth = data;
792		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
793		/* IPV4_MAXLEN is needed when calculating checksum.
794		 * At least sizeof(struct iphdr) is needed here to access ihl.
795		 */
796		if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
797			return XDP_ABORTED;
798		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
799	} else if (hdr->ipv6) {
800		hdr->eth = data;
801		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
802		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
803	} else {
804		return XDP_ABORTED;
805	}
806
807	if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
808		return XDP_ABORTED;
809
810	/* We run out of registers, tcp_len gets spilled to the stack, and the
811	 * verifier forgets its min and max values checked above in tcp_dissect.
812	 */
813	hdr->tcp_len = hdr->tcp->doff * 4;
814	if (hdr->tcp_len < sizeof(*hdr->tcp))
815		return XDP_ABORTED;
816
817	return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
818			       syncookie_handle_ack(hdr);
819}
820
821SEC("xdp")
822int syncookie_xdp(struct xdp_md *ctx)
823{
824	void *data_end = (void *)(long)ctx->data_end;
825	void *data = (void *)(long)ctx->data;
826	struct header_pointers hdr;
827	int ret;
828
829	ret = syncookie_part1(ctx, data, data_end, &hdr, true);
830	if (ret != XDP_TX)
831		return ret;
832
833	data_end = (void *)(long)ctx->data_end;
834	data = (void *)(long)ctx->data;
835
836	return syncookie_part2(ctx, data, data_end, &hdr, true);
837}
838
839SEC("tc")
840int syncookie_tc(struct __sk_buff *skb)
841{
842	void *data_end = (void *)(long)skb->data_end;
843	void *data = (void *)(long)skb->data;
844	struct header_pointers hdr;
845	int ret;
846
847	ret = syncookie_part1(skb, data, data_end, &hdr, false);
848	if (ret != XDP_TX)
849		return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
850
851	data_end = (void *)(long)skb->data_end;
852	data = (void *)(long)skb->data;
853
854	ret = syncookie_part2(skb, data, data_end, &hdr, false);
855	switch (ret) {
856	case XDP_PASS:
857		return TC_ACT_OK;
858	case XDP_TX:
859		return bpf_redirect(skb->ifindex, 0);
860	default:
861		return TC_ACT_SHOT;
862	}
863}
864
865char _license[] SEC("license") = "GPL";
v6.8
  1// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
  2/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
  3
 
  4#include "vmlinux.h"
  5
  6#include <bpf/bpf_helpers.h>
  7#include <bpf/bpf_endian.h>
  8#include <asm/errno.h>
  9
 
 
 10#define TC_ACT_OK 0
 11#define TC_ACT_SHOT 2
 12
 13#define NSEC_PER_SEC 1000000000L
 14
 15#define ETH_ALEN 6
 16#define ETH_P_IP 0x0800
 17#define ETH_P_IPV6 0x86DD
 18
 19#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])
 20
 21#define IP_DF 0x4000
 22#define IP_MF 0x2000
 23#define IP_OFFSET 0x1fff
 24
 25#define NEXTHDR_TCP 6
 26
 27#define TCPOPT_NOP 1
 28#define TCPOPT_EOL 0
 29#define TCPOPT_MSS 2
 30#define TCPOPT_WINDOW 3
 31#define TCPOPT_SACK_PERM 4
 32#define TCPOPT_TIMESTAMP 8
 33
 34#define TCPOLEN_MSS 4
 35#define TCPOLEN_WINDOW 3
 36#define TCPOLEN_SACK_PERM 2
 37#define TCPOLEN_TIMESTAMP 10
 38
 39#define TCP_TS_HZ 1000
 40#define TS_OPT_WSCALE_MASK 0xf
 41#define TS_OPT_SACK (1 << 4)
 42#define TS_OPT_ECN (1 << 5)
 43#define TSBITS 6
 44#define TSMASK (((__u32)1 << TSBITS) - 1)
 45#define TCP_MAX_WSCALE 14U
 46
 47#define IPV4_MAXLEN 60
 48#define TCP_MAXLEN 60
 49
 50#define DEFAULT_MSS4 1460
 51#define DEFAULT_MSS6 1440
 52#define DEFAULT_WSCALE 7
 53#define DEFAULT_TTL 64
 54#define MAX_ALLOWED_PORTS 8
 55
 56#define MAX_PACKET_OFF 0xffff
 57
 58#define swap(a, b) \
 59	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
 60
 61#define __get_unaligned_t(type, ptr) ({						\
 62	const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
 63	__pptr->x;								\
 64})
 65
 66#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
 67
 68struct {
 69	__uint(type, BPF_MAP_TYPE_ARRAY);
 70	__type(key, __u32);
 71	__type(value, __u64);
 72	__uint(max_entries, 2);
 73} values SEC(".maps");
 74
 75struct {
 76	__uint(type, BPF_MAP_TYPE_ARRAY);
 77	__type(key, __u32);
 78	__type(value, __u16);
 79	__uint(max_entries, MAX_ALLOWED_PORTS);
 80} allowed_ports SEC(".maps");
 81
 82/* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
 83 * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
 84 */
 85
 86struct bpf_ct_opts___local {
 87	s32 netns_id;
 88	s32 error;
 89	u8 l4proto;
 90	u8 dir;
 91	u8 reserved[2];
 92} __attribute__((preserve_access_index));
 93
 94#define BPF_F_CURRENT_NETNS (-1)
 95
 96extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
 97					 struct bpf_sock_tuple *bpf_tuple,
 98					 __u32 len_tuple,
 99					 struct bpf_ct_opts___local *opts,
100					 __u32 len_opts) __ksym;
101
102extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
103					 struct bpf_sock_tuple *bpf_tuple,
104					 u32 len_tuple,
105					 struct bpf_ct_opts___local *opts,
106					 u32 len_opts) __ksym;
107
108extern void bpf_ct_release(struct nf_conn *ct) __ksym;
109
110static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
111{
112	__u8 tmp[ETH_ALEN];
113
114	__builtin_memcpy(tmp, a, ETH_ALEN);
115	__builtin_memcpy(a, b, ETH_ALEN);
116	__builtin_memcpy(b, tmp, ETH_ALEN);
117}
118
/* Fold a 32-bit one's-complement sum into 16 bits and return its complement.
 * The fold is done twice because the first addition can itself carry into
 * bit 16.
 */
static __always_inline __u16 csum_fold(__u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (__u16)~csum;
}
125
126static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
127					       __u32 len, __u8 proto,
128					       __u32 csum)
129{
130	__u64 s = csum;
131
132	s += (__u32)saddr;
133	s += (__u32)daddr;
134#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
135	s += proto + len;
136#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
137	s += (proto + len) << 8;
138#else
139#error Unknown endian
140#endif
141	s = (s & 0xffffffff) + (s >> 32);
142	s = (s & 0xffffffff) + (s >> 32);
143
144	return csum_fold((__u32)s);
145}
146
147static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
148					     const struct in6_addr *daddr,
149					     __u32 len, __u8 proto, __u32 csum)
150{
151	__u64 sum = csum;
152	int i;
153
154#pragma unroll
155	for (i = 0; i < 4; i++)
156		sum += (__u32)saddr->in6_u.u6_addr32[i];
157
158#pragma unroll
159	for (i = 0; i < 4; i++)
160		sum += (__u32)daddr->in6_u.u6_addr32[i];
161
162	/* Don't combine additions to avoid 32-bit overflow. */
163	sum += bpf_htonl(len);
164	sum += bpf_htonl(proto);
165
166	sum = (sum & 0xffffffff) + (sum >> 32);
167	sum = (sum & 0xffffffff) + (sum >> 32);
168
169	return csum_fold((__u32)sum);
170}
171
172static __always_inline __u64 tcp_clock_ns(void)
173{
174	return bpf_ktime_get_ns();
175}
176
177static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
178{
179	return ns / (NSEC_PER_SEC / TCP_TS_HZ);
180}
181
182static __always_inline __u32 tcp_clock_ms(void)
183{
184	return tcp_ns_to_ts(tcp_clock_ns());
185}
186
187struct tcpopt_context {
188	void *data;
189	void *data_end;
190	__be32 *tsecr;
191	__u8 wscale;
192	bool option_timestamp;
193	bool option_sack;
194	__u32 off;
195};
196
197static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
198{
199	__u64 off = ctx->off;
200	__u8 *data;
201
202	/* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
203	if (off > MAX_PACKET_OFF - sz)
204		return NULL;
205
206	data = ctx->data + off;
207	barrier_var(data);
208	if (data + sz >= ctx->data_end)
209		return NULL;
210
211	ctx->off += sz;
212	return data;
213}
214
215static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
216{
217	__u8 *opcode, *opsize, *wscale, *tsecr;
218	__u32 off = ctx->off;
219
220	opcode = next(ctx, 1);
221	if (!opcode)
222		return 1;
223
224	if (*opcode == TCPOPT_EOL)
225		return 1;
226	if (*opcode == TCPOPT_NOP)
227		return 0;
228
229	opsize = next(ctx, 1);
230	if (!opsize || *opsize < 2)
231		return 1;
232
233	switch (*opcode) {
234	case TCPOPT_WINDOW:
235		wscale = next(ctx, 1);
236		if (!wscale)
237			return 1;
238		if (*opsize == TCPOLEN_WINDOW)
239			ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
240		break;
241	case TCPOPT_TIMESTAMP:
242		tsecr = next(ctx, 4);
243		if (!tsecr)
244			return 1;
245		if (*opsize == TCPOLEN_TIMESTAMP) {
246			ctx->option_timestamp = true;
247			/* Client's tsval becomes our tsecr. */
248			*ctx->tsecr = get_unaligned((__be32 *)tsecr);
249		}
250		break;
251	case TCPOPT_SACK_PERM:
252		if (*opsize == TCPOLEN_SACK_PERM)
253			ctx->option_sack = true;
254		break;
255	}
256
257	ctx->off = off + *opsize;
258
259	return 0;
260}
261
262static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
263{
264	int i;
265
266	for (i = 0; i < 7; i++)
267		if (tscookie_tcpopt_parse(context))
268			return 1;
269	return 0;
270}
271
272static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
273					  __u16 tcp_len, __be32 *tsval,
274					  __be32 *tsecr, void *data, void *data_end)
275{
276	struct tcpopt_context loop_ctx = {
277		.data = data,
278		.data_end = data_end,
279		.tsecr = tsecr,
280		.wscale = TS_OPT_WSCALE_MASK,
281		.option_timestamp = false,
282		.option_sack = false,
283		/* Note: currently verifier would track .off as unbound scalar.
284		 *       In case if verifier would at some point get smarter and
285		 *       compute bounded value for this var, beware that it might
286		 *       hinder bpf_loop() convergence validation.
287		 */
288		.off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
289	};
290	u32 cookie;
291
292	bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);
293
294	if (!loop_ctx.option_timestamp)
295		return false;
296
297	cookie = tcp_clock_ms() & ~TSMASK;
298	cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
299	if (loop_ctx.option_sack)
300		cookie |= TS_OPT_SACK;
301	if (tcp_header->ece && tcp_header->cwr)
302		cookie |= TS_OPT_ECN;
303	*tsval = bpf_htonl(cookie);
304
305	return true;
306}
307
308static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
309						 __u8 *ttl, bool ipv6)
310{
311	__u32 key = 0;
312	__u64 *value;
313
314	value = bpf_map_lookup_elem(&values, &key);
315	if (value && *value != 0) {
316		if (ipv6)
317			*mss = (*value >> 32) & 0xffff;
318		else
319			*mss = *value & 0xffff;
320		*wscale = (*value >> 16) & 0xf;
321		*ttl = (*value >> 24) & 0xff;
322		return;
323	}
324
325	*mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
326	*wscale = DEFAULT_WSCALE;
327	*ttl = DEFAULT_TTL;
328}
329
330static __always_inline void values_inc_synacks(void)
331{
332	__u32 key = 1;
333	__u64 *value;
334
335	value = bpf_map_lookup_elem(&values, &key);
336	if (value)
337		__sync_fetch_and_add(value, 1);
338}
339
340static __always_inline bool check_port_allowed(__u16 port)
341{
342	__u32 i;
343
344	for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
345		__u32 key = i;
346		__u16 *value;
347
348		value = bpf_map_lookup_elem(&allowed_ports, &key);
349
350		if (!value)
351			break;
352		/* 0 is a terminator value. Check it first to avoid matching on
353		 * a forbidden port == 0 and returning true.
354		 */
355		if (*value == 0)
356			break;
357
358		if (*value == port)
359			return true;
360	}
361
362	return false;
363}
364
365struct header_pointers {
366	struct ethhdr *eth;
367	struct iphdr *ipv4;
368	struct ipv6hdr *ipv6;
369	struct tcphdr *tcp;
370	__u16 tcp_len;
371};
372
373static __always_inline int tcp_dissect(void *data, void *data_end,
374				       struct header_pointers *hdr)
375{
376	hdr->eth = data;
377	if (hdr->eth + 1 > data_end)
378		return XDP_DROP;
379
380	switch (bpf_ntohs(hdr->eth->h_proto)) {
381	case ETH_P_IP:
382		hdr->ipv6 = NULL;
383
384		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
385		if (hdr->ipv4 + 1 > data_end)
386			return XDP_DROP;
387		if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
388			return XDP_DROP;
389		if (hdr->ipv4->version != 4)
390			return XDP_DROP;
391
392		if (hdr->ipv4->protocol != IPPROTO_TCP)
393			return XDP_PASS;
394
395		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
396		break;
397	case ETH_P_IPV6:
398		hdr->ipv4 = NULL;
399
400		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
401		if (hdr->ipv6 + 1 > data_end)
402			return XDP_DROP;
403		if (hdr->ipv6->version != 6)
404			return XDP_DROP;
405
406		/* XXX: Extension headers are not supported and could circumvent
407		 * XDP SYN flood protection.
408		 */
409		if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
410			return XDP_PASS;
411
412		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
413		break;
414	default:
415		/* XXX: VLANs will circumvent XDP SYN flood protection. */
416		return XDP_PASS;
417	}
418
419	if (hdr->tcp + 1 > data_end)
420		return XDP_DROP;
421	hdr->tcp_len = hdr->tcp->doff * 4;
422	if (hdr->tcp_len < sizeof(*hdr->tcp))
423		return XDP_DROP;
424
425	return XDP_TX;
426}
427
428static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
429{
430	struct bpf_ct_opts___local ct_lookup_opts = {
431		.netns_id = BPF_F_CURRENT_NETNS,
432		.l4proto = IPPROTO_TCP,
433	};
434	struct bpf_sock_tuple tup = {};
435	struct nf_conn *ct;
436	__u32 tup_size;
437
438	if (hdr->ipv4) {
439		/* TCP doesn't normally use fragments, and XDP can't reassemble
440		 * them.
441		 */
442		if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
443			return XDP_DROP;
444
445		tup.ipv4.saddr = hdr->ipv4->saddr;
446		tup.ipv4.daddr = hdr->ipv4->daddr;
447		tup.ipv4.sport = hdr->tcp->source;
448		tup.ipv4.dport = hdr->tcp->dest;
449		tup_size = sizeof(tup.ipv4);
450	} else if (hdr->ipv6) {
451		__builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
452		__builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
453		tup.ipv6.sport = hdr->tcp->source;
454		tup.ipv6.dport = hdr->tcp->dest;
455		tup_size = sizeof(tup.ipv6);
456	} else {
457		/* The verifier can't track that either ipv4 or ipv6 is not
458		 * NULL.
459		 */
460		return XDP_ABORTED;
461	}
462	if (xdp)
463		ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
464	else
465		ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
466	if (ct) {
467		unsigned long status = ct->status;
468
469		bpf_ct_release(ct);
470		if (status & IPS_CONFIRMED)
471			return XDP_PASS;
472	} else if (ct_lookup_opts.error != -ENOENT) {
473		return XDP_ABORTED;
474	}
475
476	/* error == -ENOENT || !(status & IPS_CONFIRMED) */
477	return XDP_TX;
478}
479
480static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
481					  __u8 wscale)
482{
483	__be32 *start = buf;
484
485	*buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
486
487	if (!tsopt)
488		return buf - start;
489
490	if (tsopt[0] & bpf_htonl(1 << 4))
491		*buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
492				   (TCPOLEN_SACK_PERM << 16) |
493				   (TCPOPT_TIMESTAMP << 8) |
494				   TCPOLEN_TIMESTAMP);
495	else
496		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
497				   (TCPOPT_NOP << 16) |
498				   (TCPOPT_TIMESTAMP << 8) |
499				   TCPOLEN_TIMESTAMP);
500	*buf++ = tsopt[0];
501	*buf++ = tsopt[1];
502
503	if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
504		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
505				   (TCPOPT_WINDOW << 16) |
506				   (TCPOLEN_WINDOW << 8) |
507				   wscale);
508
509	return buf - start;
510}
511
512static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
513					   __u32 cookie, __be32 *tsopt,
514					   __u16 mss, __u8 wscale)
515{
516	void *tcp_options;
517
518	tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
519	if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
520		tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
521	tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
522	swap(tcp_header->source, tcp_header->dest);
523	tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
524	tcp_header->seq = bpf_htonl(cookie);
525	tcp_header->window = 0;
526	tcp_header->urg_ptr = 0;
527	tcp_header->check = 0; /* Calculate checksum later. */
528
529	tcp_options = (void *)(tcp_header + 1);
530	tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
531}
532
533static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
534					     __u32 cookie, __be32 *tsopt)
535{
536	__u8 wscale;
537	__u16 mss;
538	__u8 ttl;
539
540	values_get_tcpipopts(&mss, &wscale, &ttl, false);
541
542	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
543
544	swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
545	hdr->ipv4->check = 0; /* Calculate checksum later. */
546	hdr->ipv4->tos = 0;
547	hdr->ipv4->id = 0;
548	hdr->ipv4->ttl = ttl;
549
550	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
551
552	hdr->tcp_len = hdr->tcp->doff * 4;
553	hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
554}
555
556static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
557					     __u32 cookie, __be32 *tsopt)
558{
559	__u8 wscale;
560	__u16 mss;
561	__u8 ttl;
562
563	values_get_tcpipopts(&mss, &wscale, &ttl, true);
564
565	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
566
567	swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
568	*(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
569	hdr->ipv6->hop_limit = ttl;
570
571	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
572
573	hdr->tcp_len = hdr->tcp->doff * 4;
574	hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
575}
576
577static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
578						void *ctx,
579						void *data, void *data_end,
580						bool xdp)
581{
582	__u32 old_pkt_size, new_pkt_size;
583	/* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
584	 * BPF verifier if tsopt is not volatile. Volatile forces it to store
585	 * the pointer value and use it directly, otherwise tcp_mkoptions is
586	 * (mis)compiled like this:
587	 *   if (!tsopt)
588	 *       return buf - start;
589	 *   reg = stored_return_value_of_tscookie_init;
590	 *   if (reg)
591	 *       tsopt = tsopt_buf;
592	 *   else
593	 *       tsopt = NULL;
594	 *   ...
595	 *   *buf++ = tsopt[1];
596	 * It creates a dead branch where tsopt is assigned NULL, but the
597	 * verifier can't prove it's dead and blocks the program.
598	 */
599	__be32 * volatile tsopt = NULL;
600	__be32 tsopt_buf[2] = {};
601	__u16 ip_len;
602	__u32 cookie;
603	__s64 value;
604
605	/* Checksum is not yet verified, but both checksum failure and TCP
606	 * header checks return XDP_DROP, so the order doesn't matter.
607	 */
608	if (hdr->tcp->fin || hdr->tcp->rst)
609		return XDP_DROP;
610
611	/* Issue SYN cookies on allowed ports, drop SYN packets on blocked
612	 * ports.
613	 */
614	if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
615		return XDP_DROP;
616
617	if (hdr->ipv4) {
618		/* Check the IPv4 and TCP checksums before creating a SYNACK. */
619		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
620		if (value < 0)
621			return XDP_ABORTED;
622		if (csum_fold(value) != 0)
623			return XDP_DROP; /* Bad IPv4 checksum. */
624
625		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
626		if (value < 0)
627			return XDP_ABORTED;
628		if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
629				      hdr->tcp_len, IPPROTO_TCP, value) != 0)
630			return XDP_DROP; /* Bad TCP checksum. */
631
632		ip_len = sizeof(*hdr->ipv4);
633
634		value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
635						       hdr->tcp_len);
636	} else if (hdr->ipv6) {
637		/* Check the TCP checksum before creating a SYNACK. */
638		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
639		if (value < 0)
640			return XDP_ABORTED;
641		if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
642				    hdr->tcp_len, IPPROTO_TCP, value) != 0)
643			return XDP_DROP; /* Bad TCP checksum. */
644
645		ip_len = sizeof(*hdr->ipv6);
646
647		value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
648						       hdr->tcp_len);
649	} else {
650		return XDP_ABORTED;
651	}
652
653	if (value < 0)
654		return XDP_ABORTED;
655	cookie = (__u32)value;
656
657	if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
658			  &tsopt_buf[0], &tsopt_buf[1], data, data_end))
659		tsopt = tsopt_buf;
660
661	/* Check that there is enough space for a SYNACK. It also covers
662	 * the check that the destination of the __builtin_memmove below
663	 * doesn't overflow.
664	 */
665	if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
666		return XDP_ABORTED;
667
668	if (hdr->ipv4) {
669		if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
670			struct tcphdr *new_tcp_header;
671
672			new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
673			__builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
674			hdr->tcp = new_tcp_header;
675
676			hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
677		}
678
679		tcpv4_gen_synack(hdr, cookie, tsopt);
680	} else if (hdr->ipv6) {
681		tcpv6_gen_synack(hdr, cookie, tsopt);
682	} else {
683		return XDP_ABORTED;
684	}
685
686	/* Recalculate checksums. */
687	hdr->tcp->check = 0;
688	value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
689	if (value < 0)
690		return XDP_ABORTED;
691	if (hdr->ipv4) {
692		hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
693						    hdr->ipv4->daddr,
694						    hdr->tcp_len,
695						    IPPROTO_TCP,
696						    value);
697
698		hdr->ipv4->check = 0;
699		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
700		if (value < 0)
701			return XDP_ABORTED;
702		hdr->ipv4->check = csum_fold(value);
703	} else if (hdr->ipv6) {
704		hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
705						  &hdr->ipv6->daddr,
706						  hdr->tcp_len,
707						  IPPROTO_TCP,
708						  value);
709	} else {
710		return XDP_ABORTED;
711	}
712
713	/* Set the new packet size. */
714	old_pkt_size = data_end - data;
715	new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
716	if (xdp) {
717		if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
718			return XDP_ABORTED;
719	} else {
720		if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
721			return XDP_ABORTED;
722	}
723
724	values_inc_synacks();
725
726	return XDP_TX;
727}
728
729static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
730{
731	int err;
732
733	if (hdr->tcp->rst)
734		return XDP_DROP;
735
736	if (hdr->ipv4)
737		err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
738	else if (hdr->ipv6)
739		err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
740	else
741		return XDP_ABORTED;
742	if (err)
743		return XDP_DROP;
744
745	return XDP_PASS;
746}
747
748static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
749					   struct header_pointers *hdr, bool xdp)
750{
751	int ret;
752
753	ret = tcp_dissect(data, data_end, hdr);
754	if (ret != XDP_TX)
755		return ret;
756
757	ret = tcp_lookup(ctx, hdr, xdp);
758	if (ret != XDP_TX)
759		return ret;
760
761	/* Packet is TCP and doesn't belong to an established connection. */
762
763	if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
764		return XDP_DROP;
765
766	/* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
767	 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
768	 */
769	if (xdp) {
770		if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
771			return XDP_ABORTED;
772	} else {
773		/* Without volatile the verifier throws this error:
774		 * R9 32-bit pointer arithmetic prohibited
775		 */
776		volatile u64 old_len = data_end - data;
777
778		if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
779			return XDP_ABORTED;
780	}
781
782	return XDP_TX;
783}
784
785static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
786					   struct header_pointers *hdr, bool xdp)
787{
788	if (hdr->ipv4) {
789		hdr->eth = data;
790		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
791		/* IPV4_MAXLEN is needed when calculating checksum.
792		 * At least sizeof(struct iphdr) is needed here to access ihl.
793		 */
794		if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
795			return XDP_ABORTED;
796		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
797	} else if (hdr->ipv6) {
798		hdr->eth = data;
799		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
800		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
801	} else {
802		return XDP_ABORTED;
803	}
804
805	if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
806		return XDP_ABORTED;
807
808	/* We run out of registers, tcp_len gets spilled to the stack, and the
809	 * verifier forgets its min and max values checked above in tcp_dissect.
810	 */
811	hdr->tcp_len = hdr->tcp->doff * 4;
812	if (hdr->tcp_len < sizeof(*hdr->tcp))
813		return XDP_ABORTED;
814
815	return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
816			       syncookie_handle_ack(hdr);
817}
818
819SEC("xdp")
820int syncookie_xdp(struct xdp_md *ctx)
821{
822	void *data_end = (void *)(long)ctx->data_end;
823	void *data = (void *)(long)ctx->data;
824	struct header_pointers hdr;
825	int ret;
826
827	ret = syncookie_part1(ctx, data, data_end, &hdr, true);
828	if (ret != XDP_TX)
829		return ret;
830
831	data_end = (void *)(long)ctx->data_end;
832	data = (void *)(long)ctx->data;
833
834	return syncookie_part2(ctx, data, data_end, &hdr, true);
835}
836
837SEC("tc")
838int syncookie_tc(struct __sk_buff *skb)
839{
840	void *data_end = (void *)(long)skb->data_end;
841	void *data = (void *)(long)skb->data;
842	struct header_pointers hdr;
843	int ret;
844
845	ret = syncookie_part1(skb, data, data_end, &hdr, false);
846	if (ret != XDP_TX)
847		return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
848
849	data_end = (void *)(long)skb->data_end;
850	data = (void *)(long)skb->data;
851
852	ret = syncookie_part2(skb, data, data_end, &hdr, false);
853	switch (ret) {
854	case XDP_PASS:
855		return TC_ACT_OK;
856	case XDP_TX:
857		return bpf_redirect(skb->ifindex, 0);
858	default:
859		return TC_ACT_SHOT;
860	}
861}
862
863char _license[] SEC("license") = "GPL";