Loading...
1// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
2/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
3
4#define BPF_NO_KFUNC_PROTOTYPES
5#include "vmlinux.h"
6
7#include <bpf/bpf_helpers.h>
8#include <bpf/bpf_endian.h>
9#include <asm/errno.h>
10
11#include "bpf_compiler.h"
12
/* Verdicts returned by the "tc" program. */
#define TC_ACT_OK		0
#define TC_ACT_SHOT		2

#define NSEC_PER_SEC		1000000000L

/* Ethernet address length and the EtherTypes handled by tcp_dissect(). */
#define ETH_ALEN		6
#define ETH_P_IP		0x0800
#define ETH_P_IPV6		0x86DD

/* 32-bit TCP header word accessed through union tcp_word_hdr. */
#define tcp_flag_word(tp)	(((union tcp_word_hdr *)(tp))->words[3])

/* IPv4 frag_off bits, host byte order (callers wrap them in bpf_htons()). */
#define IP_MF			0x2000
#define IP_OFFSET		0x1fff

#define NEXTHDR_TCP		6

/* TCP option kinds. */
#define TCPOPT_NOP		1
#define TCPOPT_EOL		0
#define TCPOPT_MSS		2
#define TCPOPT_WINDOW		3
#define TCPOPT_SACK_PERM	4
#define TCPOPT_TIMESTAMP	8

/* TCP option lengths. */
#define TCPOLEN_MSS		4
#define TCPOLEN_WINDOW		3
#define TCPOLEN_SACK_PERM	2
#define TCPOLEN_TIMESTAMP	10

/* Timestamp cookie: the low TSBITS bits of tsval carry option flags. */
#define TCP_TS_HZ		1000
#define TS_OPT_WSCALE_MASK	0xf
#define TS_OPT_SACK		(1 << 4)
#define TS_OPT_ECN		(1 << 5)
#define TSBITS			6
#define TSMASK			(((__u32)1 << TSBITS) - 1)
#define TCP_MAX_WSCALE		14U

/* Maximum IPv4 and TCP header lengths in bytes. */
#define IPV4_MAXLEN		60
#define TCP_MAXLEN		60

/* Defaults used when the "values" map holds no configuration. */
#define DEFAULT_MSS4		1460
#define DEFAULT_MSS6		1440
#define DEFAULT_WSCALE		7
#define DEFAULT_TTL		64
#define MAX_ALLOWED_PORTS	8

#define MAX_PACKET_OFF		0xffff
59
/* Exchange the values of two lvalues of the same type. */
#define swap(a, b)				\
	do {					\
		__typeof__(a) __tmp = (a);	\
		(a) = (b);			\
		(b) = __tmp;			\
	} while (0)
62
/* Load a value of the given type from a possibly unaligned address by going
 * through a packed single-member struct.
 */
#define __get_unaligned_t(type, ptr) ({						\
	const struct {								\
		type x;								\
	} __attribute__((__packed__)) *__pptr = (__typeof__(__pptr))(ptr);	\
	__pptr->x;								\
})

/* Same as above, with the type taken from what ptr points to. */
#define get_unaligned(ptr) __get_unaligned_t(__typeof__(*(ptr)), (ptr))
69
70struct {
71 __uint(type, BPF_MAP_TYPE_ARRAY);
72 __type(key, __u32);
73 __type(value, __u64);
74 __uint(max_entries, 2);
75} values SEC(".maps");
76
77struct {
78 __uint(type, BPF_MAP_TYPE_ARRAY);
79 __type(key, __u32);
80 __type(value, __u16);
81 __uint(max_entries, MAX_ALLOWED_PORTS);
82} allowed_ports SEC(".maps");
83
84/* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
85 * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
86 */
87
88struct bpf_ct_opts___local {
89 s32 netns_id;
90 s32 error;
91 u8 l4proto;
92 u8 dir;
93 u8 reserved[2];
94} __attribute__((preserve_access_index));
95
96#define BPF_F_CURRENT_NETNS (-1)
97
98extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
99 struct bpf_sock_tuple *bpf_tuple,
100 __u32 len_tuple,
101 struct bpf_ct_opts___local *opts,
102 __u32 len_opts) __ksym;
103
104extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
105 struct bpf_sock_tuple *bpf_tuple,
106 u32 len_tuple,
107 struct bpf_ct_opts___local *opts,
108 u32 len_opts) __ksym;
109
110extern void bpf_ct_release(struct nf_conn *ct) __ksym;
111
112static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
113{
114 __u8 tmp[ETH_ALEN];
115
116 __builtin_memcpy(tmp, a, ETH_ALEN);
117 __builtin_memcpy(a, b, ETH_ALEN);
118 __builtin_memcpy(b, tmp, ETH_ALEN);
119}
120
/* Fold a 32-bit one's-complement accumulator into 16 bits and invert it.
 * Two folding rounds are needed because the first may produce a carry.
 */
static __always_inline __u16 csum_fold(__u32 csum)
{
	__u32 once = (csum >> 16) + (csum & 0xffff);
	__u32 twice = (once >> 16) + (once & 0xffff);

	return (__u16)~twice;
}
127
128static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
129 __u32 len, __u8 proto,
130 __u32 csum)
131{
132 __u64 s = csum;
133
134 s += (__u32)saddr;
135 s += (__u32)daddr;
136#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
137 s += proto + len;
138#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
139 s += (proto + len) << 8;
140#else
141#error Unknown endian
142#endif
143 s = (s & 0xffffffff) + (s >> 32);
144 s = (s & 0xffffffff) + (s >> 32);
145
146 return csum_fold((__u32)s);
147}
148
149static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
150 const struct in6_addr *daddr,
151 __u32 len, __u8 proto, __u32 csum)
152{
153 __u64 sum = csum;
154 int i;
155
156 __pragma_loop_unroll
157 for (i = 0; i < 4; i++)
158 sum += (__u32)saddr->in6_u.u6_addr32[i];
159
160 __pragma_loop_unroll
161 for (i = 0; i < 4; i++)
162 sum += (__u32)daddr->in6_u.u6_addr32[i];
163
164 /* Don't combine additions to avoid 32-bit overflow. */
165 sum += bpf_htonl(len);
166 sum += bpf_htonl(proto);
167
168 sum = (sum & 0xffffffff) + (sum >> 32);
169 sum = (sum & 0xffffffff) + (sum >> 32);
170
171 return csum_fold((__u32)sum);
172}
173
174static __always_inline __u64 tcp_clock_ns(void)
175{
176 return bpf_ktime_get_ns();
177}
178
179static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
180{
181 return ns / (NSEC_PER_SEC / TCP_TS_HZ);
182}
183
184static __always_inline __u32 tcp_clock_ms(void)
185{
186 return tcp_ns_to_ts(tcp_clock_ns());
187}
188
189struct tcpopt_context {
190 void *data;
191 void *data_end;
192 __be32 *tsecr;
193 __u8 wscale;
194 bool option_timestamp;
195 bool option_sack;
196 __u32 off;
197};
198
199static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
200{
201 __u64 off = ctx->off;
202 __u8 *data;
203
204 /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
205 if (off > MAX_PACKET_OFF - sz)
206 return NULL;
207
208 data = ctx->data + off;
209 barrier_var(data);
210 if (data + sz >= ctx->data_end)
211 return NULL;
212
213 ctx->off += sz;
214 return data;
215}
216
217static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
218{
219 __u8 *opcode, *opsize, *wscale, *tsecr;
220 __u32 off = ctx->off;
221
222 opcode = next(ctx, 1);
223 if (!opcode)
224 return 1;
225
226 if (*opcode == TCPOPT_EOL)
227 return 1;
228 if (*opcode == TCPOPT_NOP)
229 return 0;
230
231 opsize = next(ctx, 1);
232 if (!opsize || *opsize < 2)
233 return 1;
234
235 switch (*opcode) {
236 case TCPOPT_WINDOW:
237 wscale = next(ctx, 1);
238 if (!wscale)
239 return 1;
240 if (*opsize == TCPOLEN_WINDOW)
241 ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
242 break;
243 case TCPOPT_TIMESTAMP:
244 tsecr = next(ctx, 4);
245 if (!tsecr)
246 return 1;
247 if (*opsize == TCPOLEN_TIMESTAMP) {
248 ctx->option_timestamp = true;
249 /* Client's tsval becomes our tsecr. */
250 *ctx->tsecr = get_unaligned((__be32 *)tsecr);
251 }
252 break;
253 case TCPOPT_SACK_PERM:
254 if (*opsize == TCPOLEN_SACK_PERM)
255 ctx->option_sack = true;
256 break;
257 }
258
259 ctx->off = off + *opsize;
260
261 return 0;
262}
263
264static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
265{
266 int i;
267
268 for (i = 0; i < 7; i++)
269 if (tscookie_tcpopt_parse(context))
270 return 1;
271 return 0;
272}
273
274static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
275 __u16 tcp_len, __be32 *tsval,
276 __be32 *tsecr, void *data, void *data_end)
277{
278 struct tcpopt_context loop_ctx = {
279 .data = data,
280 .data_end = data_end,
281 .tsecr = tsecr,
282 .wscale = TS_OPT_WSCALE_MASK,
283 .option_timestamp = false,
284 .option_sack = false,
285 /* Note: currently verifier would track .off as unbound scalar.
286 * In case if verifier would at some point get smarter and
287 * compute bounded value for this var, beware that it might
288 * hinder bpf_loop() convergence validation.
289 */
290 .off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
291 };
292 u32 cookie;
293
294 bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);
295
296 if (!loop_ctx.option_timestamp)
297 return false;
298
299 cookie = tcp_clock_ms() & ~TSMASK;
300 cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
301 if (loop_ctx.option_sack)
302 cookie |= TS_OPT_SACK;
303 if (tcp_header->ece && tcp_header->cwr)
304 cookie |= TS_OPT_ECN;
305 *tsval = bpf_htonl(cookie);
306
307 return true;
308}
309
310static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
311 __u8 *ttl, bool ipv6)
312{
313 __u32 key = 0;
314 __u64 *value;
315
316 value = bpf_map_lookup_elem(&values, &key);
317 if (value && *value != 0) {
318 if (ipv6)
319 *mss = (*value >> 32) & 0xffff;
320 else
321 *mss = *value & 0xffff;
322 *wscale = (*value >> 16) & 0xf;
323 *ttl = (*value >> 24) & 0xff;
324 return;
325 }
326
327 *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
328 *wscale = DEFAULT_WSCALE;
329 *ttl = DEFAULT_TTL;
330}
331
332static __always_inline void values_inc_synacks(void)
333{
334 __u32 key = 1;
335 __u64 *value;
336
337 value = bpf_map_lookup_elem(&values, &key);
338 if (value)
339 __sync_fetch_and_add(value, 1);
340}
341
342static __always_inline bool check_port_allowed(__u16 port)
343{
344 __u32 i;
345
346 for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
347 __u32 key = i;
348 __u16 *value;
349
350 value = bpf_map_lookup_elem(&allowed_ports, &key);
351
352 if (!value)
353 break;
354 /* 0 is a terminator value. Check it first to avoid matching on
355 * a forbidden port == 0 and returning true.
356 */
357 if (*value == 0)
358 break;
359
360 if (*value == port)
361 return true;
362 }
363
364 return false;
365}
366
367struct header_pointers {
368 struct ethhdr *eth;
369 struct iphdr *ipv4;
370 struct ipv6hdr *ipv6;
371 struct tcphdr *tcp;
372 __u16 tcp_len;
373};
374
375static __always_inline int tcp_dissect(void *data, void *data_end,
376 struct header_pointers *hdr)
377{
378 hdr->eth = data;
379 if (hdr->eth + 1 > data_end)
380 return XDP_DROP;
381
382 switch (bpf_ntohs(hdr->eth->h_proto)) {
383 case ETH_P_IP:
384 hdr->ipv6 = NULL;
385
386 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
387 if (hdr->ipv4 + 1 > data_end)
388 return XDP_DROP;
389 if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
390 return XDP_DROP;
391 if (hdr->ipv4->version != 4)
392 return XDP_DROP;
393
394 if (hdr->ipv4->protocol != IPPROTO_TCP)
395 return XDP_PASS;
396
397 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
398 break;
399 case ETH_P_IPV6:
400 hdr->ipv4 = NULL;
401
402 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
403 if (hdr->ipv6 + 1 > data_end)
404 return XDP_DROP;
405 if (hdr->ipv6->version != 6)
406 return XDP_DROP;
407
408 /* XXX: Extension headers are not supported and could circumvent
409 * XDP SYN flood protection.
410 */
411 if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
412 return XDP_PASS;
413
414 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
415 break;
416 default:
417 /* XXX: VLANs will circumvent XDP SYN flood protection. */
418 return XDP_PASS;
419 }
420
421 if (hdr->tcp + 1 > data_end)
422 return XDP_DROP;
423 hdr->tcp_len = hdr->tcp->doff * 4;
424 if (hdr->tcp_len < sizeof(*hdr->tcp))
425 return XDP_DROP;
426
427 return XDP_TX;
428}
429
430static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
431{
432 struct bpf_ct_opts___local ct_lookup_opts = {
433 .netns_id = BPF_F_CURRENT_NETNS,
434 .l4proto = IPPROTO_TCP,
435 };
436 struct bpf_sock_tuple tup = {};
437 struct nf_conn *ct;
438 __u32 tup_size;
439
440 if (hdr->ipv4) {
441 /* TCP doesn't normally use fragments, and XDP can't reassemble
442 * them.
443 */
444 if ((hdr->ipv4->frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0)
445 return XDP_DROP;
446
447 tup.ipv4.saddr = hdr->ipv4->saddr;
448 tup.ipv4.daddr = hdr->ipv4->daddr;
449 tup.ipv4.sport = hdr->tcp->source;
450 tup.ipv4.dport = hdr->tcp->dest;
451 tup_size = sizeof(tup.ipv4);
452 } else if (hdr->ipv6) {
453 __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
454 __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
455 tup.ipv6.sport = hdr->tcp->source;
456 tup.ipv6.dport = hdr->tcp->dest;
457 tup_size = sizeof(tup.ipv6);
458 } else {
459 /* The verifier can't track that either ipv4 or ipv6 is not
460 * NULL.
461 */
462 return XDP_ABORTED;
463 }
464 if (xdp)
465 ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
466 else
467 ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
468 if (ct) {
469 unsigned long status = ct->status;
470
471 bpf_ct_release(ct);
472 if (status & IPS_CONFIRMED)
473 return XDP_PASS;
474 } else if (ct_lookup_opts.error != -ENOENT) {
475 return XDP_ABORTED;
476 }
477
478 /* error == -ENOENT || !(status & IPS_CONFIRMED) */
479 return XDP_TX;
480}
481
482static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
483 __u8 wscale)
484{
485 __be32 *start = buf;
486
487 *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
488
489 if (!tsopt)
490 return buf - start;
491
492 if (tsopt[0] & bpf_htonl(1 << 4))
493 *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
494 (TCPOLEN_SACK_PERM << 16) |
495 (TCPOPT_TIMESTAMP << 8) |
496 TCPOLEN_TIMESTAMP);
497 else
498 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
499 (TCPOPT_NOP << 16) |
500 (TCPOPT_TIMESTAMP << 8) |
501 TCPOLEN_TIMESTAMP);
502 *buf++ = tsopt[0];
503 *buf++ = tsopt[1];
504
505 if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
506 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
507 (TCPOPT_WINDOW << 16) |
508 (TCPOLEN_WINDOW << 8) |
509 wscale);
510
511 return buf - start;
512}
513
514static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
515 __u32 cookie, __be32 *tsopt,
516 __u16 mss, __u8 wscale)
517{
518 void *tcp_options;
519
520 tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
521 if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
522 tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
523 tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
524 swap(tcp_header->source, tcp_header->dest);
525 tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
526 tcp_header->seq = bpf_htonl(cookie);
527 tcp_header->window = 0;
528 tcp_header->urg_ptr = 0;
529 tcp_header->check = 0; /* Calculate checksum later. */
530
531 tcp_options = (void *)(tcp_header + 1);
532 tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
533}
534
535static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
536 __u32 cookie, __be32 *tsopt)
537{
538 __u8 wscale;
539 __u16 mss;
540 __u8 ttl;
541
542 values_get_tcpipopts(&mss, &wscale, &ttl, false);
543
544 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
545
546 swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
547 hdr->ipv4->check = 0; /* Calculate checksum later. */
548 hdr->ipv4->tos = 0;
549 hdr->ipv4->id = 0;
550 hdr->ipv4->ttl = ttl;
551
552 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
553
554 hdr->tcp_len = hdr->tcp->doff * 4;
555 hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
556}
557
558static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
559 __u32 cookie, __be32 *tsopt)
560{
561 __u8 wscale;
562 __u16 mss;
563 __u8 ttl;
564
565 values_get_tcpipopts(&mss, &wscale, &ttl, true);
566
567 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
568
569 swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
570 *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
571 hdr->ipv6->hop_limit = ttl;
572
573 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
574
575 hdr->tcp_len = hdr->tcp->doff * 4;
576 hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
577}
578
579static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
580 void *ctx,
581 void *data, void *data_end,
582 bool xdp)
583{
584 __u32 old_pkt_size, new_pkt_size;
585 /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
586 * BPF verifier if tsopt is not volatile. Volatile forces it to store
587 * the pointer value and use it directly, otherwise tcp_mkoptions is
588 * (mis)compiled like this:
589 * if (!tsopt)
590 * return buf - start;
591 * reg = stored_return_value_of_tscookie_init;
592 * if (reg)
593 * tsopt = tsopt_buf;
594 * else
595 * tsopt = NULL;
596 * ...
597 * *buf++ = tsopt[1];
598 * It creates a dead branch where tsopt is assigned NULL, but the
599 * verifier can't prove it's dead and blocks the program.
600 */
601 __be32 * volatile tsopt = NULL;
602 __be32 tsopt_buf[2] = {};
603 __u16 ip_len;
604 __u32 cookie;
605 __s64 value;
606
607 /* Checksum is not yet verified, but both checksum failure and TCP
608 * header checks return XDP_DROP, so the order doesn't matter.
609 */
610 if (hdr->tcp->fin || hdr->tcp->rst)
611 return XDP_DROP;
612
613 /* Issue SYN cookies on allowed ports, drop SYN packets on blocked
614 * ports.
615 */
616 if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
617 return XDP_DROP;
618
619 if (hdr->ipv4) {
620 /* Check the IPv4 and TCP checksums before creating a SYNACK. */
621 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
622 if (value < 0)
623 return XDP_ABORTED;
624 if (csum_fold(value) != 0)
625 return XDP_DROP; /* Bad IPv4 checksum. */
626
627 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
628 if (value < 0)
629 return XDP_ABORTED;
630 if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
631 hdr->tcp_len, IPPROTO_TCP, value) != 0)
632 return XDP_DROP; /* Bad TCP checksum. */
633
634 ip_len = sizeof(*hdr->ipv4);
635
636 value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
637 hdr->tcp_len);
638 } else if (hdr->ipv6) {
639 /* Check the TCP checksum before creating a SYNACK. */
640 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
641 if (value < 0)
642 return XDP_ABORTED;
643 if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
644 hdr->tcp_len, IPPROTO_TCP, value) != 0)
645 return XDP_DROP; /* Bad TCP checksum. */
646
647 ip_len = sizeof(*hdr->ipv6);
648
649 value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
650 hdr->tcp_len);
651 } else {
652 return XDP_ABORTED;
653 }
654
655 if (value < 0)
656 return XDP_ABORTED;
657 cookie = (__u32)value;
658
659 if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
660 &tsopt_buf[0], &tsopt_buf[1], data, data_end))
661 tsopt = tsopt_buf;
662
663 /* Check that there is enough space for a SYNACK. It also covers
664 * the check that the destination of the __builtin_memmove below
665 * doesn't overflow.
666 */
667 if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
668 return XDP_ABORTED;
669
670 if (hdr->ipv4) {
671 if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
672 struct tcphdr *new_tcp_header;
673
674 new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
675 __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
676 hdr->tcp = new_tcp_header;
677
678 hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
679 }
680
681 tcpv4_gen_synack(hdr, cookie, tsopt);
682 } else if (hdr->ipv6) {
683 tcpv6_gen_synack(hdr, cookie, tsopt);
684 } else {
685 return XDP_ABORTED;
686 }
687
688 /* Recalculate checksums. */
689 hdr->tcp->check = 0;
690 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
691 if (value < 0)
692 return XDP_ABORTED;
693 if (hdr->ipv4) {
694 hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
695 hdr->ipv4->daddr,
696 hdr->tcp_len,
697 IPPROTO_TCP,
698 value);
699
700 hdr->ipv4->check = 0;
701 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
702 if (value < 0)
703 return XDP_ABORTED;
704 hdr->ipv4->check = csum_fold(value);
705 } else if (hdr->ipv6) {
706 hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
707 &hdr->ipv6->daddr,
708 hdr->tcp_len,
709 IPPROTO_TCP,
710 value);
711 } else {
712 return XDP_ABORTED;
713 }
714
715 /* Set the new packet size. */
716 old_pkt_size = data_end - data;
717 new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
718 if (xdp) {
719 if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
720 return XDP_ABORTED;
721 } else {
722 if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
723 return XDP_ABORTED;
724 }
725
726 values_inc_synacks();
727
728 return XDP_TX;
729}
730
731static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
732{
733 int err;
734
735 if (hdr->tcp->rst)
736 return XDP_DROP;
737
738 if (hdr->ipv4)
739 err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
740 else if (hdr->ipv6)
741 err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
742 else
743 return XDP_ABORTED;
744 if (err)
745 return XDP_DROP;
746
747 return XDP_PASS;
748}
749
750static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
751 struct header_pointers *hdr, bool xdp)
752{
753 int ret;
754
755 ret = tcp_dissect(data, data_end, hdr);
756 if (ret != XDP_TX)
757 return ret;
758
759 ret = tcp_lookup(ctx, hdr, xdp);
760 if (ret != XDP_TX)
761 return ret;
762
763 /* Packet is TCP and doesn't belong to an established connection. */
764
765 if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
766 return XDP_DROP;
767
768 /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
769 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
770 */
771 if (xdp) {
772 if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
773 return XDP_ABORTED;
774 } else {
775 /* Without volatile the verifier throws this error:
776 * R9 32-bit pointer arithmetic prohibited
777 */
778 volatile u64 old_len = data_end - data;
779
780 if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
781 return XDP_ABORTED;
782 }
783
784 return XDP_TX;
785}
786
787static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
788 struct header_pointers *hdr, bool xdp)
789{
790 if (hdr->ipv4) {
791 hdr->eth = data;
792 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
793 /* IPV4_MAXLEN is needed when calculating checksum.
794 * At least sizeof(struct iphdr) is needed here to access ihl.
795 */
796 if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
797 return XDP_ABORTED;
798 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
799 } else if (hdr->ipv6) {
800 hdr->eth = data;
801 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
802 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
803 } else {
804 return XDP_ABORTED;
805 }
806
807 if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
808 return XDP_ABORTED;
809
810 /* We run out of registers, tcp_len gets spilled to the stack, and the
811 * verifier forgets its min and max values checked above in tcp_dissect.
812 */
813 hdr->tcp_len = hdr->tcp->doff * 4;
814 if (hdr->tcp_len < sizeof(*hdr->tcp))
815 return XDP_ABORTED;
816
817 return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
818 syncookie_handle_ack(hdr);
819}
820
821SEC("xdp")
822int syncookie_xdp(struct xdp_md *ctx)
823{
824 void *data_end = (void *)(long)ctx->data_end;
825 void *data = (void *)(long)ctx->data;
826 struct header_pointers hdr;
827 int ret;
828
829 ret = syncookie_part1(ctx, data, data_end, &hdr, true);
830 if (ret != XDP_TX)
831 return ret;
832
833 data_end = (void *)(long)ctx->data_end;
834 data = (void *)(long)ctx->data;
835
836 return syncookie_part2(ctx, data, data_end, &hdr, true);
837}
838
839SEC("tc")
840int syncookie_tc(struct __sk_buff *skb)
841{
842 void *data_end = (void *)(long)skb->data_end;
843 void *data = (void *)(long)skb->data;
844 struct header_pointers hdr;
845 int ret;
846
847 ret = syncookie_part1(skb, data, data_end, &hdr, false);
848 if (ret != XDP_TX)
849 return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
850
851 data_end = (void *)(long)skb->data_end;
852 data = (void *)(long)skb->data;
853
854 ret = syncookie_part2(skb, data, data_end, &hdr, false);
855 switch (ret) {
856 case XDP_PASS:
857 return TC_ACT_OK;
858 case XDP_TX:
859 return bpf_redirect(skb->ifindex, 0);
860 default:
861 return TC_ACT_SHOT;
862 }
863}
864
865char _license[] SEC("license") = "GPL";
1// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
2/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
3
4#include "vmlinux.h"
5
6#include <bpf/bpf_helpers.h>
7#include <bpf/bpf_endian.h>
8#include <asm/errno.h>
9
/* Verdicts for the "tc" program. */
#define TC_ACT_OK		0
#define TC_ACT_SHOT		2

#define NSEC_PER_SEC		1000000000L

/* Ethernet address length and the EtherTypes handled by tcp_dissect(). */
#define ETH_ALEN		6
#define ETH_P_IP		0x0800
#define ETH_P_IPV6		0x86DD

/* 32-bit TCP header word accessed through union tcp_word_hdr. */
#define tcp_flag_word(tp)	(((union tcp_word_hdr *)(tp))->words[3])

/* IPv4 frag_off bits, host byte order (callers wrap them in bpf_htons()). */
#define IP_DF			0x4000
#define IP_MF			0x2000
#define IP_OFFSET		0x1fff

#define NEXTHDR_TCP		6

/* TCP option kinds. */
#define TCPOPT_NOP		1
#define TCPOPT_EOL		0
#define TCPOPT_MSS		2
#define TCPOPT_WINDOW		3
#define TCPOPT_SACK_PERM	4
#define TCPOPT_TIMESTAMP	8

/* TCP option lengths. */
#define TCPOLEN_MSS		4
#define TCPOLEN_WINDOW		3
#define TCPOLEN_SACK_PERM	2
#define TCPOLEN_TIMESTAMP	10

/* Timestamp cookie: the low TSBITS bits of tsval carry option flags. */
#define TCP_TS_HZ		1000
#define TS_OPT_WSCALE_MASK	0xf
#define TS_OPT_SACK		(1 << 4)
#define TS_OPT_ECN		(1 << 5)
#define TSBITS			6
#define TSMASK			(((__u32)1 << TSBITS) - 1)
#define TCP_MAX_WSCALE		14U

/* Maximum IPv4 and TCP header lengths in bytes. */
#define IPV4_MAXLEN		60
#define TCP_MAXLEN		60

/* Defaults used when the "values" map holds no configuration. */
#define DEFAULT_MSS4		1460
#define DEFAULT_MSS6		1440
#define DEFAULT_WSCALE		7
#define DEFAULT_TTL		64
#define MAX_ALLOWED_PORTS	8
55
/* Exchange the values of two lvalues of the same type. */
#define swap(a, b)				\
	do {					\
		__typeof__(a) __tmp = (a);	\
		(a) = (b);			\
		(b) = __tmp;			\
	} while (0)
58
/* Load a value of the given type from a possibly unaligned address by going
 * through a packed single-member struct.
 */
#define __get_unaligned_t(type, ptr) ({						\
	const struct {								\
		type x;								\
	} __attribute__((__packed__)) *__pptr = (__typeof__(__pptr))(ptr);	\
	__pptr->x;								\
})

/* Same as above, with the type taken from what ptr points to. */
#define get_unaligned(ptr) __get_unaligned_t(__typeof__(*(ptr)), (ptr))
65
66struct {
67 __uint(type, BPF_MAP_TYPE_ARRAY);
68 __type(key, __u32);
69 __type(value, __u64);
70 __uint(max_entries, 2);
71} values SEC(".maps");
72
73struct {
74 __uint(type, BPF_MAP_TYPE_ARRAY);
75 __type(key, __u32);
76 __type(value, __u16);
77 __uint(max_entries, MAX_ALLOWED_PORTS);
78} allowed_ports SEC(".maps");
79
80/* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
81 * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
82 */
83
84struct bpf_ct_opts___local {
85 s32 netns_id;
86 s32 error;
87 u8 l4proto;
88 u8 dir;
89 u8 reserved[2];
90} __attribute__((preserve_access_index));
91
92#define BPF_F_CURRENT_NETNS (-1)
93
94extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
95 struct bpf_sock_tuple *bpf_tuple,
96 __u32 len_tuple,
97 struct bpf_ct_opts___local *opts,
98 __u32 len_opts) __ksym;
99
100extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
101 struct bpf_sock_tuple *bpf_tuple,
102 u32 len_tuple,
103 struct bpf_ct_opts___local *opts,
104 u32 len_opts) __ksym;
105
106extern void bpf_ct_release(struct nf_conn *ct) __ksym;
107
108static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
109{
110 __u8 tmp[ETH_ALEN];
111
112 __builtin_memcpy(tmp, a, ETH_ALEN);
113 __builtin_memcpy(a, b, ETH_ALEN);
114 __builtin_memcpy(b, tmp, ETH_ALEN);
115}
116
/* Fold a 32-bit one's-complement accumulator into 16 bits and invert it.
 * Two folding rounds are needed because the first may produce a carry.
 */
static __always_inline __u16 csum_fold(__u32 csum)
{
	__u32 once = (csum >> 16) + (csum & 0xffff);
	__u32 twice = (once >> 16) + (once & 0xffff);

	return (__u16)~twice;
}
123
124static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
125 __u32 len, __u8 proto,
126 __u32 csum)
127{
128 __u64 s = csum;
129
130 s += (__u32)saddr;
131 s += (__u32)daddr;
132#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
133 s += proto + len;
134#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
135 s += (proto + len) << 8;
136#else
137#error Unknown endian
138#endif
139 s = (s & 0xffffffff) + (s >> 32);
140 s = (s & 0xffffffff) + (s >> 32);
141
142 return csum_fold((__u32)s);
143}
144
145static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
146 const struct in6_addr *daddr,
147 __u32 len, __u8 proto, __u32 csum)
148{
149 __u64 sum = csum;
150 int i;
151
152#pragma unroll
153 for (i = 0; i < 4; i++)
154 sum += (__u32)saddr->in6_u.u6_addr32[i];
155
156#pragma unroll
157 for (i = 0; i < 4; i++)
158 sum += (__u32)daddr->in6_u.u6_addr32[i];
159
160 /* Don't combine additions to avoid 32-bit overflow. */
161 sum += bpf_htonl(len);
162 sum += bpf_htonl(proto);
163
164 sum = (sum & 0xffffffff) + (sum >> 32);
165 sum = (sum & 0xffffffff) + (sum >> 32);
166
167 return csum_fold((__u32)sum);
168}
169
170static __always_inline __u64 tcp_clock_ns(void)
171{
172 return bpf_ktime_get_ns();
173}
174
175static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
176{
177 return ns / (NSEC_PER_SEC / TCP_TS_HZ);
178}
179
180static __always_inline __u32 tcp_time_stamp_raw(void)
181{
182 return tcp_ns_to_ts(tcp_clock_ns());
183}
184
185struct tcpopt_context {
186 __u8 *ptr;
187 __u8 *end;
188 void *data_end;
189 __be32 *tsecr;
190 __u8 wscale;
191 bool option_timestamp;
192 bool option_sack;
193};
194
195static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
196{
197 __u8 opcode, opsize;
198
199 if (ctx->ptr >= ctx->end)
200 return 1;
201 if (ctx->ptr >= ctx->data_end)
202 return 1;
203
204 opcode = ctx->ptr[0];
205
206 if (opcode == TCPOPT_EOL)
207 return 1;
208 if (opcode == TCPOPT_NOP) {
209 ++ctx->ptr;
210 return 0;
211 }
212
213 if (ctx->ptr + 1 >= ctx->end)
214 return 1;
215 if (ctx->ptr + 1 >= ctx->data_end)
216 return 1;
217 opsize = ctx->ptr[1];
218 if (opsize < 2)
219 return 1;
220
221 if (ctx->ptr + opsize > ctx->end)
222 return 1;
223
224 switch (opcode) {
225 case TCPOPT_WINDOW:
226 if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end)
227 ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE;
228 break;
229 case TCPOPT_TIMESTAMP:
230 if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) {
231 ctx->option_timestamp = true;
232 /* Client's tsval becomes our tsecr. */
233 *ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2));
234 }
235 break;
236 case TCPOPT_SACK_PERM:
237 if (opsize == TCPOLEN_SACK_PERM)
238 ctx->option_sack = true;
239 break;
240 }
241
242 ctx->ptr += opsize;
243
244 return 0;
245}
246
247static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
248{
249 int i;
250
251 for (i = 0; i < 7; i++)
252 if (tscookie_tcpopt_parse(context))
253 return 1;
254 return 0;
255}
256
257static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
258 __u16 tcp_len, __be32 *tsval,
259 __be32 *tsecr, void *data_end)
260{
261 struct tcpopt_context loop_ctx = {
262 .ptr = (__u8 *)(tcp_header + 1),
263 .end = (__u8 *)tcp_header + tcp_len,
264 .data_end = data_end,
265 .tsecr = tsecr,
266 .wscale = TS_OPT_WSCALE_MASK,
267 .option_timestamp = false,
268 .option_sack = false,
269 };
270 u32 cookie;
271
272 bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);
273
274 if (!loop_ctx.option_timestamp)
275 return false;
276
277 cookie = tcp_time_stamp_raw() & ~TSMASK;
278 cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
279 if (loop_ctx.option_sack)
280 cookie |= TS_OPT_SACK;
281 if (tcp_header->ece && tcp_header->cwr)
282 cookie |= TS_OPT_ECN;
283 *tsval = bpf_htonl(cookie);
284
285 return true;
286}
287
288static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
289 __u8 *ttl, bool ipv6)
290{
291 __u32 key = 0;
292 __u64 *value;
293
294 value = bpf_map_lookup_elem(&values, &key);
295 if (value && *value != 0) {
296 if (ipv6)
297 *mss = (*value >> 32) & 0xffff;
298 else
299 *mss = *value & 0xffff;
300 *wscale = (*value >> 16) & 0xf;
301 *ttl = (*value >> 24) & 0xff;
302 return;
303 }
304
305 *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
306 *wscale = DEFAULT_WSCALE;
307 *ttl = DEFAULT_TTL;
308}
309
310static __always_inline void values_inc_synacks(void)
311{
312 __u32 key = 1;
313 __u32 *value;
314
315 value = bpf_map_lookup_elem(&values, &key);
316 if (value)
317 __sync_fetch_and_add(value, 1);
318}
319
320static __always_inline bool check_port_allowed(__u16 port)
321{
322 __u32 i;
323
324 for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
325 __u32 key = i;
326 __u16 *value;
327
328 value = bpf_map_lookup_elem(&allowed_ports, &key);
329
330 if (!value)
331 break;
332 /* 0 is a terminator value. Check it first to avoid matching on
333 * a forbidden port == 0 and returning true.
334 */
335 if (*value == 0)
336 break;
337
338 if (*value == port)
339 return true;
340 }
341
342 return false;
343}
344
345struct header_pointers {
346 struct ethhdr *eth;
347 struct iphdr *ipv4;
348 struct ipv6hdr *ipv6;
349 struct tcphdr *tcp;
350 __u16 tcp_len;
351};
352
353static __always_inline int tcp_dissect(void *data, void *data_end,
354 struct header_pointers *hdr)
355{
356 hdr->eth = data;
357 if (hdr->eth + 1 > data_end)
358 return XDP_DROP;
359
360 switch (bpf_ntohs(hdr->eth->h_proto)) {
361 case ETH_P_IP:
362 hdr->ipv6 = NULL;
363
364 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
365 if (hdr->ipv4 + 1 > data_end)
366 return XDP_DROP;
367 if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
368 return XDP_DROP;
369 if (hdr->ipv4->version != 4)
370 return XDP_DROP;
371
372 if (hdr->ipv4->protocol != IPPROTO_TCP)
373 return XDP_PASS;
374
375 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
376 break;
377 case ETH_P_IPV6:
378 hdr->ipv4 = NULL;
379
380 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
381 if (hdr->ipv6 + 1 > data_end)
382 return XDP_DROP;
383 if (hdr->ipv6->version != 6)
384 return XDP_DROP;
385
386 /* XXX: Extension headers are not supported and could circumvent
387 * XDP SYN flood protection.
388 */
389 if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
390 return XDP_PASS;
391
392 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
393 break;
394 default:
395 /* XXX: VLANs will circumvent XDP SYN flood protection. */
396 return XDP_PASS;
397 }
398
399 if (hdr->tcp + 1 > data_end)
400 return XDP_DROP;
401 hdr->tcp_len = hdr->tcp->doff * 4;
402 if (hdr->tcp_len < sizeof(*hdr->tcp))
403 return XDP_DROP;
404
405 return XDP_TX;
406}
407
408static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
409{
410 struct bpf_ct_opts___local ct_lookup_opts = {
411 .netns_id = BPF_F_CURRENT_NETNS,
412 .l4proto = IPPROTO_TCP,
413 };
414 struct bpf_sock_tuple tup = {};
415 struct nf_conn *ct;
416 __u32 tup_size;
417
418 if (hdr->ipv4) {
419 /* TCP doesn't normally use fragments, and XDP can't reassemble
420 * them.
421 */
422 if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
423 return XDP_DROP;
424
425 tup.ipv4.saddr = hdr->ipv4->saddr;
426 tup.ipv4.daddr = hdr->ipv4->daddr;
427 tup.ipv4.sport = hdr->tcp->source;
428 tup.ipv4.dport = hdr->tcp->dest;
429 tup_size = sizeof(tup.ipv4);
430 } else if (hdr->ipv6) {
431 __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
432 __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
433 tup.ipv6.sport = hdr->tcp->source;
434 tup.ipv6.dport = hdr->tcp->dest;
435 tup_size = sizeof(tup.ipv6);
436 } else {
437 /* The verifier can't track that either ipv4 or ipv6 is not
438 * NULL.
439 */
440 return XDP_ABORTED;
441 }
442 if (xdp)
443 ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
444 else
445 ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
446 if (ct) {
447 unsigned long status = ct->status;
448
449 bpf_ct_release(ct);
450 if (status & IPS_CONFIRMED_BIT)
451 return XDP_PASS;
452 } else if (ct_lookup_opts.error != -ENOENT) {
453 return XDP_ABORTED;
454 }
455
456 /* error == -ENOENT || !(status & IPS_CONFIRMED_BIT) */
457 return XDP_TX;
458}
459
460static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
461 __u8 wscale)
462{
463 __be32 *start = buf;
464
465 *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
466
467 if (!tsopt)
468 return buf - start;
469
470 if (tsopt[0] & bpf_htonl(1 << 4))
471 *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
472 (TCPOLEN_SACK_PERM << 16) |
473 (TCPOPT_TIMESTAMP << 8) |
474 TCPOLEN_TIMESTAMP);
475 else
476 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
477 (TCPOPT_NOP << 16) |
478 (TCPOPT_TIMESTAMP << 8) |
479 TCPOLEN_TIMESTAMP);
480 *buf++ = tsopt[0];
481 *buf++ = tsopt[1];
482
483 if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
484 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
485 (TCPOPT_WINDOW << 16) |
486 (TCPOLEN_WINDOW << 8) |
487 wscale);
488
489 return buf - start;
490}
491
492static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
493 __u32 cookie, __be32 *tsopt,
494 __u16 mss, __u8 wscale)
495{
496 void *tcp_options;
497
498 tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
499 if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
500 tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
501 tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
502 swap(tcp_header->source, tcp_header->dest);
503 tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
504 tcp_header->seq = bpf_htonl(cookie);
505 tcp_header->window = 0;
506 tcp_header->urg_ptr = 0;
507 tcp_header->check = 0; /* Calculate checksum later. */
508
509 tcp_options = (void *)(tcp_header + 1);
510 tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
511}
512
513static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
514 __u32 cookie, __be32 *tsopt)
515{
516 __u8 wscale;
517 __u16 mss;
518 __u8 ttl;
519
520 values_get_tcpipopts(&mss, &wscale, &ttl, false);
521
522 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
523
524 swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
525 hdr->ipv4->check = 0; /* Calculate checksum later. */
526 hdr->ipv4->tos = 0;
527 hdr->ipv4->id = 0;
528 hdr->ipv4->ttl = ttl;
529
530 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
531
532 hdr->tcp_len = hdr->tcp->doff * 4;
533 hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
534}
535
536static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
537 __u32 cookie, __be32 *tsopt)
538{
539 __u8 wscale;
540 __u16 mss;
541 __u8 ttl;
542
543 values_get_tcpipopts(&mss, &wscale, &ttl, true);
544
545 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
546
547 swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
548 *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
549 hdr->ipv6->hop_limit = ttl;
550
551 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
552
553 hdr->tcp_len = hdr->tcp->doff * 4;
554 hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
555}
556
557static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
558 void *ctx,
559 void *data, void *data_end,
560 bool xdp)
561{
562 __u32 old_pkt_size, new_pkt_size;
563 /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
564 * BPF verifier if tsopt is not volatile. Volatile forces it to store
565 * the pointer value and use it directly, otherwise tcp_mkoptions is
566 * (mis)compiled like this:
567 * if (!tsopt)
568 * return buf - start;
569 * reg = stored_return_value_of_tscookie_init;
570 * if (reg)
571 * tsopt = tsopt_buf;
572 * else
573 * tsopt = NULL;
574 * ...
575 * *buf++ = tsopt[1];
576 * It creates a dead branch where tsopt is assigned NULL, but the
577 * verifier can't prove it's dead and blocks the program.
578 */
579 __be32 * volatile tsopt = NULL;
580 __be32 tsopt_buf[2] = {};
581 __u16 ip_len;
582 __u32 cookie;
583 __s64 value;
584
585 /* Checksum is not yet verified, but both checksum failure and TCP
586 * header checks return XDP_DROP, so the order doesn't matter.
587 */
588 if (hdr->tcp->fin || hdr->tcp->rst)
589 return XDP_DROP;
590
591 /* Issue SYN cookies on allowed ports, drop SYN packets on blocked
592 * ports.
593 */
594 if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
595 return XDP_DROP;
596
597 if (hdr->ipv4) {
598 /* Check the IPv4 and TCP checksums before creating a SYNACK. */
599 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
600 if (value < 0)
601 return XDP_ABORTED;
602 if (csum_fold(value) != 0)
603 return XDP_DROP; /* Bad IPv4 checksum. */
604
605 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
606 if (value < 0)
607 return XDP_ABORTED;
608 if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
609 hdr->tcp_len, IPPROTO_TCP, value) != 0)
610 return XDP_DROP; /* Bad TCP checksum. */
611
612 ip_len = sizeof(*hdr->ipv4);
613
614 value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
615 hdr->tcp_len);
616 } else if (hdr->ipv6) {
617 /* Check the TCP checksum before creating a SYNACK. */
618 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
619 if (value < 0)
620 return XDP_ABORTED;
621 if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
622 hdr->tcp_len, IPPROTO_TCP, value) != 0)
623 return XDP_DROP; /* Bad TCP checksum. */
624
625 ip_len = sizeof(*hdr->ipv6);
626
627 value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
628 hdr->tcp_len);
629 } else {
630 return XDP_ABORTED;
631 }
632
633 if (value < 0)
634 return XDP_ABORTED;
635 cookie = (__u32)value;
636
637 if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
638 &tsopt_buf[0], &tsopt_buf[1], data_end))
639 tsopt = tsopt_buf;
640
641 /* Check that there is enough space for a SYNACK. It also covers
642 * the check that the destination of the __builtin_memmove below
643 * doesn't overflow.
644 */
645 if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
646 return XDP_ABORTED;
647
648 if (hdr->ipv4) {
649 if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
650 struct tcphdr *new_tcp_header;
651
652 new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
653 __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
654 hdr->tcp = new_tcp_header;
655
656 hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
657 }
658
659 tcpv4_gen_synack(hdr, cookie, tsopt);
660 } else if (hdr->ipv6) {
661 tcpv6_gen_synack(hdr, cookie, tsopt);
662 } else {
663 return XDP_ABORTED;
664 }
665
666 /* Recalculate checksums. */
667 hdr->tcp->check = 0;
668 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
669 if (value < 0)
670 return XDP_ABORTED;
671 if (hdr->ipv4) {
672 hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
673 hdr->ipv4->daddr,
674 hdr->tcp_len,
675 IPPROTO_TCP,
676 value);
677
678 hdr->ipv4->check = 0;
679 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
680 if (value < 0)
681 return XDP_ABORTED;
682 hdr->ipv4->check = csum_fold(value);
683 } else if (hdr->ipv6) {
684 hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
685 &hdr->ipv6->daddr,
686 hdr->tcp_len,
687 IPPROTO_TCP,
688 value);
689 } else {
690 return XDP_ABORTED;
691 }
692
693 /* Set the new packet size. */
694 old_pkt_size = data_end - data;
695 new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
696 if (xdp) {
697 if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
698 return XDP_ABORTED;
699 } else {
700 if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
701 return XDP_ABORTED;
702 }
703
704 values_inc_synacks();
705
706 return XDP_TX;
707}
708
709static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
710{
711 int err;
712
713 if (hdr->tcp->rst)
714 return XDP_DROP;
715
716 if (hdr->ipv4)
717 err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
718 else if (hdr->ipv6)
719 err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
720 else
721 return XDP_ABORTED;
722 if (err)
723 return XDP_DROP;
724
725 return XDP_PASS;
726}
727
728static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
729 struct header_pointers *hdr, bool xdp)
730{
731 int ret;
732
733 ret = tcp_dissect(data, data_end, hdr);
734 if (ret != XDP_TX)
735 return ret;
736
737 ret = tcp_lookup(ctx, hdr, xdp);
738 if (ret != XDP_TX)
739 return ret;
740
741 /* Packet is TCP and doesn't belong to an established connection. */
742
743 if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
744 return XDP_DROP;
745
746 /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
747 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
748 */
749 if (xdp) {
750 if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
751 return XDP_ABORTED;
752 } else {
753 /* Without volatile the verifier throws this error:
754 * R9 32-bit pointer arithmetic prohibited
755 */
756 volatile u64 old_len = data_end - data;
757
758 if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
759 return XDP_ABORTED;
760 }
761
762 return XDP_TX;
763}
764
765static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
766 struct header_pointers *hdr, bool xdp)
767{
768 if (hdr->ipv4) {
769 hdr->eth = data;
770 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
771 /* IPV4_MAXLEN is needed when calculating checksum.
772 * At least sizeof(struct iphdr) is needed here to access ihl.
773 */
774 if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
775 return XDP_ABORTED;
776 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
777 } else if (hdr->ipv6) {
778 hdr->eth = data;
779 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
780 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
781 } else {
782 return XDP_ABORTED;
783 }
784
785 if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
786 return XDP_ABORTED;
787
788 /* We run out of registers, tcp_len gets spilled to the stack, and the
789 * verifier forgets its min and max values checked above in tcp_dissect.
790 */
791 hdr->tcp_len = hdr->tcp->doff * 4;
792 if (hdr->tcp_len < sizeof(*hdr->tcp))
793 return XDP_ABORTED;
794
795 return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
796 syncookie_handle_ack(hdr);
797}
798
799SEC("xdp")
800int syncookie_xdp(struct xdp_md *ctx)
801{
802 void *data_end = (void *)(long)ctx->data_end;
803 void *data = (void *)(long)ctx->data;
804 struct header_pointers hdr;
805 int ret;
806
807 ret = syncookie_part1(ctx, data, data_end, &hdr, true);
808 if (ret != XDP_TX)
809 return ret;
810
811 data_end = (void *)(long)ctx->data_end;
812 data = (void *)(long)ctx->data;
813
814 return syncookie_part2(ctx, data, data_end, &hdr, true);
815}
816
817SEC("tc")
818int syncookie_tc(struct __sk_buff *skb)
819{
820 void *data_end = (void *)(long)skb->data_end;
821 void *data = (void *)(long)skb->data;
822 struct header_pointers hdr;
823 int ret;
824
825 ret = syncookie_part1(skb, data, data_end, &hdr, false);
826 if (ret != XDP_TX)
827 return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
828
829 data_end = (void *)(long)skb->data_end;
830 data = (void *)(long)skb->data;
831
832 ret = syncookie_part2(skb, data, data_end, &hdr, false);
833 switch (ret) {
834 case XDP_PASS:
835 return TC_ACT_OK;
836 case XDP_TX:
837 return bpf_redirect(skb->ifindex, 0);
838 default:
839 return TC_ACT_SHOT;
840 }
841}
842
843char _license[] SEC("license") = "GPL";