1// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
2
3/*
4 * AF_XDP user-space access library.
5 *
6 * Copyright(c) 2018 - 2019 Intel Corporation.
7 *
8 * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
9 */
10
11#include <errno.h>
12#include <stdlib.h>
13#include <string.h>
14#include <unistd.h>
15#include <arpa/inet.h>
16#include <asm/barrier.h>
17#include <linux/compiler.h>
18#include <linux/ethtool.h>
19#include <linux/filter.h>
20#include <linux/if_ether.h>
21#include <linux/if_link.h>
22#include <linux/if_packet.h>
23#include <linux/if_xdp.h>
24#include <linux/kernel.h>
25#include <linux/list.h>
26#include <linux/netlink.h>
27#include <linux/rtnetlink.h>
28#include <linux/sockios.h>
29#include <net/if.h>
30#include <sys/ioctl.h>
31#include <sys/mman.h>
32#include <sys/socket.h>
33#include <sys/types.h>
34
35#include <bpf/bpf.h>
36#include <bpf/libbpf.h>
37#include "xsk.h"
38#include "bpf_util.h"
39
40#ifndef SOL_XDP
41 #define SOL_XDP 283
42#endif
43
44#ifndef AF_XDP
45 #define AF_XDP 44
46#endif
47
48#ifndef PF_XDP
49 #define PF_XDP AF_XDP
50#endif
51
52#define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
53
54#define XSKMAP_SIZE 1
55
56struct xsk_umem {
57 struct xsk_ring_prod *fill_save;
58 struct xsk_ring_cons *comp_save;
59 char *umem_area;
60 struct xsk_umem_config config;
61 int fd;
62 int refcount;
63 struct list_head ctx_list;
64 bool rx_ring_setup_done;
65 bool tx_ring_setup_done;
66};
67
68struct xsk_ctx {
69 struct xsk_ring_prod *fill;
70 struct xsk_ring_cons *comp;
71 __u32 queue_id;
72 struct xsk_umem *umem;
73 int refcount;
74 int ifindex;
75 struct list_head list;
76};
77
78struct xsk_socket {
79 struct xsk_ring_cons *rx;
80 struct xsk_ring_prod *tx;
81 struct xsk_ctx *ctx;
82 struct xsk_socket_config config;
83 int fd;
84};
85
86struct nl_mtu_req {
87 struct nlmsghdr nh;
88 struct ifinfomsg msg;
89 char buf[512];
90};
91
92int xsk_umem__fd(const struct xsk_umem *umem)
93{
94 return umem ? umem->fd : -EINVAL;
95}
96
97int xsk_socket__fd(const struct xsk_socket *xsk)
98{
99 return xsk ? xsk->fd : -EINVAL;
100}
101
102static bool xsk_page_aligned(void *buffer)
103{
104 unsigned long addr = (unsigned long)buffer;
105
106 return !(addr & (getpagesize() - 1));
107}
108
109static void xsk_set_umem_config(struct xsk_umem_config *cfg,
110 const struct xsk_umem_config *usr_cfg)
111{
112 if (!usr_cfg) {
113 cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
114 cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
115 cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
116 cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
117 cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
118 cfg->tx_metadata_len = 0;
119 return;
120 }
121
122 cfg->fill_size = usr_cfg->fill_size;
123 cfg->comp_size = usr_cfg->comp_size;
124 cfg->frame_size = usr_cfg->frame_size;
125 cfg->frame_headroom = usr_cfg->frame_headroom;
126 cfg->flags = usr_cfg->flags;
127 cfg->tx_metadata_len = usr_cfg->tx_metadata_len;
128}
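/* Example of a caller-supplied configuration that overrides the defaults
 * above. An illustrative sketch; the 8192-entry fill ring and 2048-byte
 * frames are arbitrary values, and umem/buf/size/fill/comp are placeholders
 * owned by the caller:
 *
 *	struct xsk_umem_config cfg = {
 *		.fill_size = 8192,
 *		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 *		.frame_size = 2048,
 *		.frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
 *		.flags = XSK_UMEM__DEFAULT_FLAGS,
 *		.tx_metadata_len = 0,
 *	};
 *
 *	err = xsk_umem__create(&umem, buf, size, &fill, &comp, &cfg);
 */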
129
130static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
131 const struct xsk_socket_config *usr_cfg)
132{
133 if (!usr_cfg) {
134 cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
135 cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
136 cfg->bind_flags = 0;
137 return 0;
138 }
139
140 cfg->rx_size = usr_cfg->rx_size;
141 cfg->tx_size = usr_cfg->tx_size;
142 cfg->bind_flags = usr_cfg->bind_flags;
143
144 return 0;
145}
146
147static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
148{
149 socklen_t optlen;
150 int err;
151
152 optlen = sizeof(*off);
153 err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
154 if (err)
155 return err;
156
157 if (optlen == sizeof(*off))
158 return 0;
159
160 return -EINVAL;
161}
162
163static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
164 struct xsk_ring_prod *fill,
165 struct xsk_ring_cons *comp)
166{
167 struct xdp_mmap_offsets off;
168 void *map;
169 int err;
170
171 err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
172 &umem->config.fill_size,
173 sizeof(umem->config.fill_size));
174 if (err)
175 return -errno;
176
177 err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
178 &umem->config.comp_size,
179 sizeof(umem->config.comp_size));
180 if (err)
181 return -errno;
182
183 err = xsk_get_mmap_offsets(fd, &off);
184 if (err)
185 return -errno;
186
187 map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
188 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
189 XDP_UMEM_PGOFF_FILL_RING);
190 if (map == MAP_FAILED)
191 return -errno;
192
193 fill->mask = umem->config.fill_size - 1;
194 fill->size = umem->config.fill_size;
195 fill->producer = map + off.fr.producer;
196 fill->consumer = map + off.fr.consumer;
197 fill->flags = map + off.fr.flags;
198 fill->ring = map + off.fr.desc;
199 fill->cached_cons = umem->config.fill_size;
200
201 map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
202 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
203 XDP_UMEM_PGOFF_COMPLETION_RING);
204 if (map == MAP_FAILED) {
205 err = -errno;
206 goto out_mmap;
207 }
208
209 comp->mask = umem->config.comp_size - 1;
210 comp->size = umem->config.comp_size;
211 comp->producer = map + off.cr.producer;
212 comp->consumer = map + off.cr.consumer;
213 comp->flags = map + off.cr.flags;
214 comp->ring = map + off.cr.desc;
215
216 return 0;
217
218out_mmap:
219 munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
220 return err;
221}
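/* Once the fill ring has been mapped, the caller hands frames to the kernel
 * by writing umem addresses into it. A minimal sketch, assuming the helpers
 * from xsk.h, a caller-owned "fill" ring and "cfg" umem config, and frames
 * laid out back to back from offset 0 of the umem:
 *
 *	__u32 idx, i;
 *
 *	if (xsk_ring_prod__reserve(fill, cfg.fill_size, &idx) == cfg.fill_size) {
 *		for (i = 0; i < cfg.fill_size; i++)
 *			*xsk_ring_prod__fill_addr(fill, idx++) =
 *				(__u64)i * cfg.frame_size;
 *		xsk_ring_prod__submit(fill, cfg.fill_size);
 *	}
 */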
222
223int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
224 __u64 size, struct xsk_ring_prod *fill,
225 struct xsk_ring_cons *comp,
226 const struct xsk_umem_config *usr_config)
227{
228 struct xdp_umem_reg mr;
229 struct xsk_umem *umem;
230 int err;
231
232 if (!umem_area || !umem_ptr || !fill || !comp)
233 return -EFAULT;
234 if (!size && !xsk_page_aligned(umem_area))
235 return -EINVAL;
236
237 umem = calloc(1, sizeof(*umem));
238 if (!umem)
239 return -ENOMEM;
240
241 umem->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
242 if (umem->fd < 0) {
243 err = -errno;
244 goto out_umem_alloc;
245 }
246
247 umem->umem_area = umem_area;
248 INIT_LIST_HEAD(&umem->ctx_list);
249 xsk_set_umem_config(&umem->config, usr_config);
250
251 memset(&mr, 0, sizeof(mr));
252 mr.addr = (uintptr_t)umem_area;
253 mr.len = size;
254 mr.chunk_size = umem->config.frame_size;
255 mr.headroom = umem->config.frame_headroom;
256 mr.flags = umem->config.flags;
257 mr.tx_metadata_len = umem->config.tx_metadata_len;
258
259 err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
260 if (err) {
261 err = -errno;
262 goto out_socket;
263 }
264
265 err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
266 if (err)
267 goto out_socket;
268
269 umem->fill_save = fill;
270 umem->comp_save = comp;
271 *umem_ptr = umem;
272 return 0;
273
274out_socket:
275 close(umem->fd);
276out_umem_alloc:
277 free(umem);
278 return err;
279}
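/* Typical usage of xsk_umem__create(). An illustrative sketch only;
 * NUM_FRAMES is a placeholder chosen by the caller, and the anonymous
 * mmap() is one simple way to obtain a page-aligned buffer:
 *
 *	struct xsk_ring_prod fill;
 *	struct xsk_ring_cons comp;
 *	struct xsk_umem *umem;
 *	__u64 size = NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
 *	void *buf;
 *	int err;
 *
 *	buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (buf == MAP_FAILED)
 *		return -errno;
 *	err = xsk_umem__create(&umem, buf, size, &fill, &comp, NULL);
 */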
280
281bool xsk_is_in_mode(u32 ifindex, int mode)
282{
283 LIBBPF_OPTS(bpf_xdp_query_opts, opts);
284 int ret;
285
286 ret = bpf_xdp_query(ifindex, mode, &opts);
287 if (ret) {
288 printf("XDP mode query returned error %s\n", strerror(errno));
289 return false;
290 }
291
292 if (mode == XDP_FLAGS_DRV_MODE)
293 return opts.attach_mode == XDP_ATTACHED_DRV;
294 else if (mode == XDP_FLAGS_SKB_MODE)
295 return opts.attach_mode == XDP_ATTACHED_SKB;
296
297 return false;
298}
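/* Example check before requesting zero-copy mode. An illustrative sketch;
 * "ifindex" is whatever interface the caller targets:
 *
 *	if (!xsk_is_in_mode(ifindex, XDP_FLAGS_DRV_MODE))
 *		printf("no XDP program attached in native mode\n");
 */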
299
300/* Lifted from netlink.c in tools/lib/bpf */
301static int netlink_recvmsg(int sock, struct msghdr *mhdr, int flags)
302{
303 int len;
304
305 do {
306 len = recvmsg(sock, mhdr, flags);
307 } while (len < 0 && (errno == EINTR || errno == EAGAIN));
308
309 if (len < 0)
310 return -errno;
311 return len;
312}
313
314/* Lifted from netlink.c in tools/lib/bpf */
315static int alloc_iov(struct iovec *iov, int len)
316{
317 void *nbuf;
318
319 nbuf = realloc(iov->iov_base, len);
320 if (!nbuf)
321 return -ENOMEM;
322
323 iov->iov_base = nbuf;
324 iov->iov_len = len;
325 return 0;
326}
327
328/* Original version lifted from netlink.c in tools/lib/bpf */
329static int netlink_recv(int sock)
330{
331 struct iovec iov = {};
332 struct msghdr mhdr = {
333 .msg_iov = &iov,
334 .msg_iovlen = 1,
335 };
336 bool multipart = true;
337 struct nlmsgerr *err;
338 struct nlmsghdr *nh;
339 int len, ret;
340
341 ret = alloc_iov(&iov, 4096);
342 if (ret)
343 goto done;
344
345 while (multipart) {
346 multipart = false;
347 len = netlink_recvmsg(sock, &mhdr, MSG_PEEK | MSG_TRUNC);
348 if (len < 0) {
349 ret = len;
350 goto done;
351 }
352
353 if (len > iov.iov_len) {
354 ret = alloc_iov(&iov, len);
355 if (ret)
356 goto done;
357 }
358
359 len = netlink_recvmsg(sock, &mhdr, 0);
360 if (len < 0) {
361 ret = len;
362 goto done;
363 }
364
365 if (len == 0)
366 break;
367
368 for (nh = (struct nlmsghdr *)iov.iov_base; NLMSG_OK(nh, len);
369 nh = NLMSG_NEXT(nh, len)) {
370 if (nh->nlmsg_flags & NLM_F_MULTI)
371 multipart = true;
372 switch (nh->nlmsg_type) {
373 case NLMSG_ERROR:
374 err = (struct nlmsgerr *)NLMSG_DATA(nh);
375 if (!err->error)
376 continue;
377 ret = err->error;
378 goto done;
379 case NLMSG_DONE:
380 ret = 0;
381 goto done;
382 default:
383 break;
384 }
385 }
386 }
387 ret = 0;
388done:
389 free(iov.iov_base);
390 return ret;
391}
392
393int xsk_set_mtu(int ifindex, int mtu)
394{
395 struct nl_mtu_req req;
396 struct rtattr *rta;
397 int fd, ret;
398
399 fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE);
400 if (fd < 0)
401 return fd;
402
403 memset(&req, 0, sizeof(req));
404 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
405 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
406 req.nh.nlmsg_type = RTM_NEWLINK;
407 req.msg.ifi_family = AF_UNSPEC;
408 req.msg.ifi_index = ifindex;
409 rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nh.nlmsg_len));
410 rta->rta_type = IFLA_MTU;
411 rta->rta_len = RTA_LENGTH(sizeof(unsigned int));
412 req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_LENGTH(sizeof(mtu));
413 memcpy(RTA_DATA(rta), &mtu, sizeof(mtu));
414
415 ret = send(fd, &req, req.nh.nlmsg_len, 0);
416 if (ret < 0) {
417 close(fd);
418 return errno;
419 }
420
421 ret = netlink_recv(fd);
422 close(fd);
423 return ret;
424}
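/* Example use of xsk_set_mtu(). An illustrative sketch; the interface name
 * and the 9000-byte MTU are placeholders:
 *
 *	int ifindex = if_nametoindex("eth0");
 *
 *	if (!ifindex || xsk_set_mtu(ifindex, 9000))
 *		printf("failed to set MTU\n");
 */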
425
426int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags)
427{
428 int prog_fd;
429
430 prog_fd = bpf_program__fd(prog);
431 return bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL);
432}
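/* xsk_attach_xdp_program() expects an already loaded program. A minimal
 * sketch of obtaining one with libbpf (libbpf 1.0 error semantics assumed;
 * the object path and program name are hypothetical):
 *
 *	struct bpf_object *obj = bpf_object__open_file("xsk_prog.bpf.o", NULL);
 *	struct bpf_program *prog;
 *
 *	if (!obj || bpf_object__load(obj))
 *		return -EINVAL;
 *	prog = bpf_object__find_program_by_name(obj, "xsk_redirect");
 *	if (!prog)
 *		return -ENOENT;
 *	err = xsk_attach_xdp_program(prog, ifindex, XDP_FLAGS_DRV_MODE);
 */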
433
434void xsk_detach_xdp_program(int ifindex, u32 xdp_flags)
435{
436 bpf_xdp_detach(ifindex, xdp_flags, NULL);
437}
438
439void xsk_clear_xskmap(struct bpf_map *map)
440{
441 u32 index = 0;
442 int map_fd;
443
444 map_fd = bpf_map__fd(map);
445 bpf_map_delete_elem(map_fd, &index);
446}
447
448int xsk_update_xskmap(struct bpf_map *map, struct xsk_socket *xsk, u32 index)
449{
450 int map_fd, sock_fd;
451
452 map_fd = bpf_map__fd(map);
453 sock_fd = xsk_socket__fd(xsk);
454
455 return bpf_map_update_elem(map_fd, &index, &sock_fd, 0);
456}
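/* Wiring a socket into the XSKMAP so the XDP program can redirect traffic to
 * it. An illustrative sketch; "xsks_map" is assumed to be the map name in the
 * caller's BPF object and index 0 matches the bound queue:
 *
 *	struct bpf_map *map = bpf_object__find_map_by_name(obj, "xsks_map");
 *
 *	if (!map || xsk_update_xskmap(map, xsk, 0))
 *		pr_warn("failed to insert socket at index 0\n");
 */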
457
458static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
459 __u32 queue_id)
460{
461 struct xsk_ctx *ctx;
462
463 if (list_empty(&umem->ctx_list))
464 return NULL;
465
466 list_for_each_entry(ctx, &umem->ctx_list, list) {
467 if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
468 ctx->refcount++;
469 return ctx;
470 }
471 }
472
473 return NULL;
474}
475
476static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
477{
478 struct xsk_umem *umem = ctx->umem;
479 struct xdp_mmap_offsets off;
480 int err;
481
482 if (--ctx->refcount)
483 return;
484
485 if (!unmap)
486 goto out_free;
487
488 err = xsk_get_mmap_offsets(umem->fd, &off);
489 if (err)
490 goto out_free;
491
492 munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
493 sizeof(__u64));
494 munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
495 sizeof(__u64));
496
497out_free:
498 list_del(&ctx->list);
499 free(ctx);
500}
501
502static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
503 struct xsk_umem *umem, int ifindex,
504 __u32 queue_id,
505 struct xsk_ring_prod *fill,
506 struct xsk_ring_cons *comp)
507{
508 struct xsk_ctx *ctx;
509 int err;
510
511 ctx = calloc(1, sizeof(*ctx));
512 if (!ctx)
513 return NULL;
514
515 if (!umem->fill_save) {
516 err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
517 if (err) {
518 free(ctx);
519 return NULL;
520 }
521 } else if (umem->fill_save != fill || umem->comp_save != comp) {
522 /* Copy over rings to new structs. */
523 memcpy(fill, umem->fill_save, sizeof(*fill));
524 memcpy(comp, umem->comp_save, sizeof(*comp));
525 }
526
527 ctx->ifindex = ifindex;
528 ctx->refcount = 1;
529 ctx->umem = umem;
530 ctx->queue_id = queue_id;
531
532 ctx->fill = fill;
533 ctx->comp = comp;
534 list_add(&ctx->list, &umem->ctx_list);
535 return ctx;
536}
537
538int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
539 int ifindex,
540 __u32 queue_id, struct xsk_umem *umem,
541 struct xsk_ring_cons *rx,
542 struct xsk_ring_prod *tx,
543 struct xsk_ring_prod *fill,
544 struct xsk_ring_cons *comp,
545 const struct xsk_socket_config *usr_config)
546{
547 bool unmap, rx_setup_done = false, tx_setup_done = false;
548 void *rx_map = NULL, *tx_map = NULL;
549 struct sockaddr_xdp sxdp = {};
550 struct xdp_mmap_offsets off;
551 struct xsk_socket *xsk;
552 struct xsk_ctx *ctx;
553 int err;
554
555 if (!umem || !xsk_ptr || !(rx || tx))
556 return -EFAULT;
557
558 unmap = umem->fill_save != fill;
559
560 xsk = calloc(1, sizeof(*xsk));
561 if (!xsk)
562 return -ENOMEM;
563
564 err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
565 if (err)
566 goto out_xsk_alloc;
567
568 if (umem->refcount++ > 0) {
569 xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
570 if (xsk->fd < 0) {
571 err = -errno;
572 goto out_xsk_alloc;
573 }
574 } else {
575 xsk->fd = umem->fd;
576 rx_setup_done = umem->rx_ring_setup_done;
577 tx_setup_done = umem->tx_ring_setup_done;
578 }
579
580 ctx = xsk_get_ctx(umem, ifindex, queue_id);
581 if (!ctx) {
582 if (!fill || !comp) {
583 err = -EFAULT;
584 goto out_socket;
585 }
586
587 ctx = xsk_create_ctx(xsk, umem, ifindex, queue_id, fill, comp);
588 if (!ctx) {
589 err = -ENOMEM;
590 goto out_socket;
591 }
592 }
593 xsk->ctx = ctx;
594
595 if (rx && !rx_setup_done) {
596 err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
597 &xsk->config.rx_size,
598 sizeof(xsk->config.rx_size));
599 if (err) {
600 err = -errno;
601 goto out_put_ctx;
602 }
603 if (xsk->fd == umem->fd)
604 umem->rx_ring_setup_done = true;
605 }
606 if (tx && !tx_setup_done) {
607 err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
608 &xsk->config.tx_size,
609 sizeof(xsk->config.tx_size));
610 if (err) {
611 err = -errno;
612 goto out_put_ctx;
613 }
614 if (xsk->fd == umem->fd)
615 umem->tx_ring_setup_done = true;
616 }
617
618 err = xsk_get_mmap_offsets(xsk->fd, &off);
619 if (err) {
620 err = -errno;
621 goto out_put_ctx;
622 }
623
624 if (rx) {
625 rx_map = mmap(NULL, off.rx.desc +
626 xsk->config.rx_size * sizeof(struct xdp_desc),
627 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
628 xsk->fd, XDP_PGOFF_RX_RING);
629 if (rx_map == MAP_FAILED) {
630 err = -errno;
631 goto out_put_ctx;
632 }
633
634 rx->mask = xsk->config.rx_size - 1;
635 rx->size = xsk->config.rx_size;
636 rx->producer = rx_map + off.rx.producer;
637 rx->consumer = rx_map + off.rx.consumer;
638 rx->flags = rx_map + off.rx.flags;
639 rx->ring = rx_map + off.rx.desc;
640 rx->cached_prod = *rx->producer;
641 rx->cached_cons = *rx->consumer;
642 }
643 xsk->rx = rx;
644
645 if (tx) {
646 tx_map = mmap(NULL, off.tx.desc +
647 xsk->config.tx_size * sizeof(struct xdp_desc),
648 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
649 xsk->fd, XDP_PGOFF_TX_RING);
650 if (tx_map == MAP_FAILED) {
651 err = -errno;
652 goto out_mmap_rx;
653 }
654
655 tx->mask = xsk->config.tx_size - 1;
656 tx->size = xsk->config.tx_size;
657 tx->producer = tx_map + off.tx.producer;
658 tx->consumer = tx_map + off.tx.consumer;
659 tx->flags = tx_map + off.tx.flags;
660 tx->ring = tx_map + off.tx.desc;
661 tx->cached_prod = *tx->producer;
662 /* cached_cons is r->size bigger than the real consumer pointer
663 * See xsk_prod_nb_free
664 */
665 tx->cached_cons = *tx->consumer + xsk->config.tx_size;
666 }
667 xsk->tx = tx;
668
669 sxdp.sxdp_family = PF_XDP;
670 sxdp.sxdp_ifindex = ctx->ifindex;
671 sxdp.sxdp_queue_id = ctx->queue_id;
672 if (umem->refcount > 1) {
673 sxdp.sxdp_flags |= XDP_SHARED_UMEM;
674 sxdp.sxdp_shared_umem_fd = umem->fd;
675 } else {
676 sxdp.sxdp_flags = xsk->config.bind_flags;
677 }
678
679 err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
680 if (err) {
681 err = -errno;
682 goto out_mmap_tx;
683 }
684
685 *xsk_ptr = xsk;
686 umem->fill_save = NULL;
687 umem->comp_save = NULL;
688 return 0;
689
690out_mmap_tx:
691 if (tx)
692 munmap(tx_map, off.tx.desc +
693 xsk->config.tx_size * sizeof(struct xdp_desc));
694out_mmap_rx:
695 if (rx)
696 munmap(rx_map, off.rx.desc +
697 xsk->config.rx_size * sizeof(struct xdp_desc));
698out_put_ctx:
699 xsk_put_ctx(ctx, unmap);
700out_socket:
701 if (--umem->refcount)
702 close(xsk->fd);
703out_xsk_alloc:
704 free(xsk);
705 return err;
706}
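/* Sharing one umem between two sockets bound to different queues of the same
 * device. A minimal sketch; rx0/tx0/rx1/tx1 are caller-owned ring structs and
 * the second context needs its own fill/completion rings:
 *
 *	struct xsk_ring_prod fill2;
 *	struct xsk_ring_cons comp2;
 *	struct xsk_socket *xsk0, *xsk1;
 *
 *	err = xsk_socket__create_shared(&xsk0, ifindex, 0, umem, &rx0, &tx0,
 *					&fill, &comp, NULL);
 *	if (!err)
 *		err = xsk_socket__create_shared(&xsk1, ifindex, 1, umem, &rx1,
 *						&tx1, &fill2, &comp2, NULL);
 */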
707
708int xsk_socket__create(struct xsk_socket **xsk_ptr, int ifindex,
709 __u32 queue_id, struct xsk_umem *umem,
710 struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
711 const struct xsk_socket_config *usr_config)
712{
713 if (!umem)
714 return -EFAULT;
715
716 return xsk_socket__create_shared(xsk_ptr, ifindex, queue_id, umem,
717 rx, tx, umem->fill_save,
718 umem->comp_save, usr_config);
719}
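/* A minimal receive path on top of xsk_socket__create(). An illustrative
 * sketch; "rx" points at the caller's RX ring, "buf" is the umem area, and
 * refilling of the fill ring plus error handling are omitted:
 *
 *	__u32 idx, i, rcvd;
 *
 *	rcvd = xsk_ring_cons__peek(rx, 64, &idx);
 *	for (i = 0; i < rcvd; i++) {
 *		const struct xdp_desc *desc = xsk_ring_cons__rx_desc(rx, idx + i);
 *		void *pkt = xsk_umem__get_data(buf, desc->addr);
 *
 *		(process desc->len bytes at pkt)
 *	}
 *	if (rcvd)
 *		xsk_ring_cons__release(rx, rcvd);
 */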
720
721int xsk_umem__delete(struct xsk_umem *umem)
722{
723 struct xdp_mmap_offsets off;
724 int err;
725
726 if (!umem)
727 return 0;
728
729 if (umem->refcount)
730 return -EBUSY;
731
732 err = xsk_get_mmap_offsets(umem->fd, &off);
733 if (!err && umem->fill_save && umem->comp_save) {
734 munmap(umem->fill_save->ring - off.fr.desc,
735 off.fr.desc + umem->config.fill_size * sizeof(__u64));
736 munmap(umem->comp_save->ring - off.cr.desc,
737 off.cr.desc + umem->config.comp_size * sizeof(__u64));
738 }
739
740 close(umem->fd);
741 free(umem);
742
743 return 0;
744}
745
746void xsk_socket__delete(struct xsk_socket *xsk)
747{
748 size_t desc_sz = sizeof(struct xdp_desc);
749 struct xdp_mmap_offsets off;
750 struct xsk_umem *umem;
751 struct xsk_ctx *ctx;
752 int err;
753
754 if (!xsk)
755 return;
756
757 ctx = xsk->ctx;
758 umem = ctx->umem;
759
760 xsk_put_ctx(ctx, true);
761
762 err = xsk_get_mmap_offsets(xsk->fd, &off);
763 if (!err) {
764 if (xsk->rx) {
765 munmap(xsk->rx->ring - off.rx.desc,
766 off.rx.desc + xsk->config.rx_size * desc_sz);
767 }
768 if (xsk->tx) {
769 munmap(xsk->tx->ring - off.tx.desc,
770 off.tx.desc + xsk->config.tx_size * desc_sz);
771 }
772 }
773
774 umem->refcount--;
775 /* Do not close an fd that also has an associated umem connected
776 * to it.
777 */
778 if (xsk->fd != umem->fd)
779 close(xsk->fd);
780 free(xsk);
781}
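/* Teardown happens in the reverse order of setup: every socket must be
 * deleted before the umem it uses, since xsk_umem__delete() returns -EBUSY
 * while the refcount is non-zero. A minimal sketch; buf/size are the area
 * registered at umem creation:
 *
 *	xsk_socket__delete(xsk);
 *	xsk_umem__delete(umem);
 *	munmap(buf, size);
 */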
1// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
2
3/*
4 * AF_XDP user-space access library.
5 *
6 * Copyright(c) 2018 - 2019 Intel Corporation.
7 *
8 * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
9 */
10
11#include <errno.h>
12#include <stdlib.h>
13#include <string.h>
14#include <unistd.h>
15#include <arpa/inet.h>
16#include <asm/barrier.h>
17#include <linux/compiler.h>
18#include <linux/ethtool.h>
19#include <linux/filter.h>
20#include <linux/if_ether.h>
21#include <linux/if_packet.h>
22#include <linux/if_xdp.h>
23#include <linux/kernel.h>
24#include <linux/list.h>
25#include <linux/sockios.h>
26#include <net/if.h>
27#include <sys/ioctl.h>
28#include <sys/mman.h>
29#include <sys/socket.h>
30#include <sys/types.h>
31#include <linux/if_link.h>
32
33#include <bpf/bpf.h>
34#include <bpf/libbpf.h>
35#include "xsk.h"
36#include "bpf_util.h"
37
38#ifndef SOL_XDP
39 #define SOL_XDP 283
40#endif
41
42#ifndef AF_XDP
43 #define AF_XDP 44
44#endif
45
46#ifndef PF_XDP
47 #define PF_XDP AF_XDP
48#endif
49
50#define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
51
52enum xsk_prog {
53 XSK_PROG_FALLBACK,
54 XSK_PROG_REDIRECT_FLAGS,
55};
56
57struct xsk_umem {
58 struct xsk_ring_prod *fill_save;
59 struct xsk_ring_cons *comp_save;
60 char *umem_area;
61 struct xsk_umem_config config;
62 int fd;
63 int refcount;
64 struct list_head ctx_list;
65 bool rx_ring_setup_done;
66 bool tx_ring_setup_done;
67};
68
69struct xsk_ctx {
70 struct xsk_ring_prod *fill;
71 struct xsk_ring_cons *comp;
72 __u32 queue_id;
73 struct xsk_umem *umem;
74 int refcount;
75 int ifindex;
76 struct list_head list;
77 int prog_fd;
78 int link_fd;
79 int xsks_map_fd;
80 char ifname[IFNAMSIZ];
81 bool has_bpf_link;
82};
83
84struct xsk_socket {
85 struct xsk_ring_cons *rx;
86 struct xsk_ring_prod *tx;
87 __u64 outstanding_tx;
88 struct xsk_ctx *ctx;
89 struct xsk_socket_config config;
90 int fd;
91};
92
93struct xsk_nl_info {
94 bool xdp_prog_attached;
95 int ifindex;
96 int fd;
97};
98
99/* Up until and including Linux 5.3 */
100struct xdp_ring_offset_v1 {
101 __u64 producer;
102 __u64 consumer;
103 __u64 desc;
104};
105
106/* Up until and including Linux 5.3 */
107struct xdp_mmap_offsets_v1 {
108 struct xdp_ring_offset_v1 rx;
109 struct xdp_ring_offset_v1 tx;
110 struct xdp_ring_offset_v1 fr;
111 struct xdp_ring_offset_v1 cr;
112};
113
114int xsk_umem__fd(const struct xsk_umem *umem)
115{
116 return umem ? umem->fd : -EINVAL;
117}
118
119int xsk_socket__fd(const struct xsk_socket *xsk)
120{
121 return xsk ? xsk->fd : -EINVAL;
122}
123
124static bool xsk_page_aligned(void *buffer)
125{
126 unsigned long addr = (unsigned long)buffer;
127
128 return !(addr & (getpagesize() - 1));
129}
130
131static void xsk_set_umem_config(struct xsk_umem_config *cfg,
132 const struct xsk_umem_config *usr_cfg)
133{
134 if (!usr_cfg) {
135 cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
136 cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
137 cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
138 cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
139 cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
140 return;
141 }
142
143 cfg->fill_size = usr_cfg->fill_size;
144 cfg->comp_size = usr_cfg->comp_size;
145 cfg->frame_size = usr_cfg->frame_size;
146 cfg->frame_headroom = usr_cfg->frame_headroom;
147 cfg->flags = usr_cfg->flags;
148}
149
150static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
151 const struct xsk_socket_config *usr_cfg)
152{
153 if (!usr_cfg) {
154 cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
155 cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
156 cfg->libbpf_flags = 0;
157 cfg->xdp_flags = 0;
158 cfg->bind_flags = 0;
159 return 0;
160 }
161
162 if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
163 return -EINVAL;
164
165 cfg->rx_size = usr_cfg->rx_size;
166 cfg->tx_size = usr_cfg->tx_size;
167 cfg->libbpf_flags = usr_cfg->libbpf_flags;
168 cfg->xdp_flags = usr_cfg->xdp_flags;
169 cfg->bind_flags = usr_cfg->bind_flags;
170
171 return 0;
172}
173
174static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
175{
176 struct xdp_mmap_offsets_v1 off_v1;
177
178 /* getsockopt on a kernel <= 5.3 has no flags fields.
179 * Copy over the offsets to the correct places in the >=5.4 format
180 * and put the flags where they would have been on that kernel.
181 */
182 memcpy(&off_v1, off, sizeof(off_v1));
183
184 off->rx.producer = off_v1.rx.producer;
185 off->rx.consumer = off_v1.rx.consumer;
186 off->rx.desc = off_v1.rx.desc;
187 off->rx.flags = off_v1.rx.consumer + sizeof(__u32);
188
189 off->tx.producer = off_v1.tx.producer;
190 off->tx.consumer = off_v1.tx.consumer;
191 off->tx.desc = off_v1.tx.desc;
192 off->tx.flags = off_v1.tx.consumer + sizeof(__u32);
193
194 off->fr.producer = off_v1.fr.producer;
195 off->fr.consumer = off_v1.fr.consumer;
196 off->fr.desc = off_v1.fr.desc;
197 off->fr.flags = off_v1.fr.consumer + sizeof(__u32);
198
199 off->cr.producer = off_v1.cr.producer;
200 off->cr.consumer = off_v1.cr.consumer;
201 off->cr.desc = off_v1.cr.desc;
202 off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
203}
204
205static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
206{
207 socklen_t optlen;
208 int err;
209
210 optlen = sizeof(*off);
211 err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
212 if (err)
213 return err;
214
215 if (optlen == sizeof(*off))
216 return 0;
217
218 if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
219 xsk_mmap_offsets_v1(off);
220 return 0;
221 }
222
223 return -EINVAL;
224}
225
226static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
227 struct xsk_ring_prod *fill,
228 struct xsk_ring_cons *comp)
229{
230 struct xdp_mmap_offsets off;
231 void *map;
232 int err;
233
234 err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
235 &umem->config.fill_size,
236 sizeof(umem->config.fill_size));
237 if (err)
238 return -errno;
239
240 err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
241 &umem->config.comp_size,
242 sizeof(umem->config.comp_size));
243 if (err)
244 return -errno;
245
246 err = xsk_get_mmap_offsets(fd, &off);
247 if (err)
248 return -errno;
249
250 map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
251 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
252 XDP_UMEM_PGOFF_FILL_RING);
253 if (map == MAP_FAILED)
254 return -errno;
255
256 fill->mask = umem->config.fill_size - 1;
257 fill->size = umem->config.fill_size;
258 fill->producer = map + off.fr.producer;
259 fill->consumer = map + off.fr.consumer;
260 fill->flags = map + off.fr.flags;
261 fill->ring = map + off.fr.desc;
262 fill->cached_cons = umem->config.fill_size;
263
264 map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
265 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
266 XDP_UMEM_PGOFF_COMPLETION_RING);
267 if (map == MAP_FAILED) {
268 err = -errno;
269 goto out_mmap;
270 }
271
272 comp->mask = umem->config.comp_size - 1;
273 comp->size = umem->config.comp_size;
274 comp->producer = map + off.cr.producer;
275 comp->consumer = map + off.cr.consumer;
276 comp->flags = map + off.cr.flags;
277 comp->ring = map + off.cr.desc;
278
279 return 0;
280
281out_mmap:
282 munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
283 return err;
284}
285
286int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
287 __u64 size, struct xsk_ring_prod *fill,
288 struct xsk_ring_cons *comp,
289 const struct xsk_umem_config *usr_config)
290{
291 struct xdp_umem_reg mr;
292 struct xsk_umem *umem;
293 int err;
294
295 if (!umem_area || !umem_ptr || !fill || !comp)
296 return -EFAULT;
297 if (!size && !xsk_page_aligned(umem_area))
298 return -EINVAL;
299
300 umem = calloc(1, sizeof(*umem));
301 if (!umem)
302 return -ENOMEM;
303
304 umem->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
305 if (umem->fd < 0) {
306 err = -errno;
307 goto out_umem_alloc;
308 }
309
310 umem->umem_area = umem_area;
311 INIT_LIST_HEAD(&umem->ctx_list);
312 xsk_set_umem_config(&umem->config, usr_config);
313
314 memset(&mr, 0, sizeof(mr));
315 mr.addr = (uintptr_t)umem_area;
316 mr.len = size;
317 mr.chunk_size = umem->config.frame_size;
318 mr.headroom = umem->config.frame_headroom;
319 mr.flags = umem->config.flags;
320
321 err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
322 if (err) {
323 err = -errno;
324 goto out_socket;
325 }
326
327 err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
328 if (err)
329 goto out_socket;
330
331 umem->fill_save = fill;
332 umem->comp_save = comp;
333 *umem_ptr = umem;
334 return 0;
335
336out_socket:
337 close(umem->fd);
338out_umem_alloc:
339 free(umem);
340 return err;
341}
342
343struct xsk_umem_config_v1 {
344 __u32 fill_size;
345 __u32 comp_size;
346 __u32 frame_size;
347 __u32 frame_headroom;
348};
349
350static enum xsk_prog get_xsk_prog(void)
351{
352 enum xsk_prog detected = XSK_PROG_FALLBACK;
353 char data_in = 0, data_out;
354 struct bpf_insn insns[] = {
355 BPF_LD_MAP_FD(BPF_REG_1, 0),
356 BPF_MOV64_IMM(BPF_REG_2, 0),
357 BPF_MOV64_IMM(BPF_REG_3, XDP_PASS),
358 BPF_EMIT_CALL(BPF_FUNC_redirect_map),
359 BPF_EXIT_INSN(),
360 };
361 LIBBPF_OPTS(bpf_test_run_opts, opts,
362 .data_in = &data_in,
363 .data_size_in = 1,
364 .data_out = &data_out,
365 );
366
367 int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns);
368
369 map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL);
370 if (map_fd < 0)
371 return detected;
372
373 insns[0].imm = map_fd;
374
375 prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
376 if (prog_fd < 0) {
377 close(map_fd);
378 return detected;
379 }
380
381 ret = bpf_prog_test_run_opts(prog_fd, &opts);
382 if (!ret && opts.retval == XDP_PASS)
383 detected = XSK_PROG_REDIRECT_FLAGS;
384 close(prog_fd);
385 close(map_fd);
386 return detected;
387}
388
389static int xsk_load_xdp_prog(struct xsk_socket *xsk)
390{
391 static const int log_buf_size = 16 * 1024;
392 struct xsk_ctx *ctx = xsk->ctx;
393 char log_buf[log_buf_size];
394 int prog_fd;
395
396 /* This is the fallback C-program:
397 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
398 * {
399 * int ret, index = ctx->rx_queue_index;
400 *
 401 * // A set entry here means that the corresponding queue_id
402 * // has an active AF_XDP socket bound to it.
403 * ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
404 * if (ret > 0)
405 * return ret;
406 *
407 * // Fallback for pre-5.3 kernels, not supporting default
408 * // action in the flags parameter.
409 * if (bpf_map_lookup_elem(&xsks_map, &index))
410 * return bpf_redirect_map(&xsks_map, index, 0);
411 * return XDP_PASS;
412 * }
413 */
414 struct bpf_insn prog[] = {
415 /* r2 = *(u32 *)(r1 + 16) */
416 BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
417 /* *(u32 *)(r10 - 4) = r2 */
418 BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
419 /* r1 = xskmap[] */
420 BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
421 /* r3 = XDP_PASS */
422 BPF_MOV64_IMM(BPF_REG_3, 2),
423 /* call bpf_redirect_map */
424 BPF_EMIT_CALL(BPF_FUNC_redirect_map),
425 /* if w0 != 0 goto pc+13 */
426 BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
427 /* r2 = r10 */
428 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
429 /* r2 += -4 */
430 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
431 /* r1 = xskmap[] */
432 BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
433 /* call bpf_map_lookup_elem */
434 BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
435 /* r1 = r0 */
436 BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
437 /* r0 = XDP_PASS */
438 BPF_MOV64_IMM(BPF_REG_0, 2),
439 /* if r1 == 0 goto pc+5 */
440 BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
441 /* r2 = *(u32 *)(r10 - 4) */
442 BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
443 /* r1 = xskmap[] */
444 BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
445 /* r3 = 0 */
446 BPF_MOV64_IMM(BPF_REG_3, 0),
447 /* call bpf_redirect_map */
448 BPF_EMIT_CALL(BPF_FUNC_redirect_map),
449 /* The jumps are to this instruction */
450 BPF_EXIT_INSN(),
451 };
452
453 /* This is the post-5.3 kernel C-program:
454 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
455 * {
456 * return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
457 * }
458 */
459 struct bpf_insn prog_redirect_flags[] = {
460 /* r2 = *(u32 *)(r1 + 16) */
461 BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
462 /* r1 = xskmap[] */
463 BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
464 /* r3 = XDP_PASS */
465 BPF_MOV64_IMM(BPF_REG_3, 2),
466 /* call bpf_redirect_map */
467 BPF_EMIT_CALL(BPF_FUNC_redirect_map),
468 BPF_EXIT_INSN(),
469 };
470 size_t insns_cnt[] = {ARRAY_SIZE(prog),
471 ARRAY_SIZE(prog_redirect_flags),
472 };
473 struct bpf_insn *progs[] = {prog, prog_redirect_flags};
474 enum xsk_prog option = get_xsk_prog();
475 LIBBPF_OPTS(bpf_prog_load_opts, opts,
476 .log_buf = log_buf,
477 .log_size = log_buf_size,
478 );
479
480 prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause",
481 progs[option], insns_cnt[option], &opts);
482 if (prog_fd < 0) {
483 pr_warn("BPF log buffer:\n%s", log_buf);
484 return prog_fd;
485 }
486
487 ctx->prog_fd = prog_fd;
488 return 0;
489}
490
491static int xsk_create_bpf_link(struct xsk_socket *xsk)
492{
493 DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
494 struct xsk_ctx *ctx = xsk->ctx;
495 __u32 prog_id = 0;
496 int link_fd;
497 int err;
498
499 err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
500 if (err) {
501 pr_warn("getting XDP prog id failed\n");
502 return err;
503 }
504
 505 /* If there's a netlink-based XDP prog loaded on the interface, bail out
 506 * and ask the user to remove it first.
 507 */
508 if (prog_id) {
509 pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
510 return -EINVAL;
511 }
512
513 opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);
514
515 link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts);
516 if (link_fd < 0) {
517 pr_warn("bpf_link_create failed: %s\n", strerror(errno));
518 return link_fd;
519 }
520
521 ctx->link_fd = link_fd;
522 return 0;
523}
524
525static int xsk_get_max_queues(struct xsk_socket *xsk)
526{
527 struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
528 struct xsk_ctx *ctx = xsk->ctx;
529 struct ifreq ifr = {};
530 int fd, err, ret;
531
532 fd = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
533 if (fd < 0)
534 return -errno;
535
536 ifr.ifr_data = (void *)&channels;
537 bpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ);
538 err = ioctl(fd, SIOCETHTOOL, &ifr);
539 if (err && errno != EOPNOTSUPP) {
540 ret = -errno;
541 goto out;
542 }
543
544 if (err) {
545 /* If the device says it has no channels, then all traffic
546 * is sent to a single stream, so max queues = 1.
547 */
548 ret = 1;
549 } else {
550 /* Take the max of rx, tx, combined. Drivers return
551 * the number of channels in different ways.
552 */
553 ret = max(channels.max_rx, channels.max_tx);
554 ret = max(ret, (int)channels.max_combined);
555 }
556
557out:
558 close(fd);
559 return ret;
560}
561
562static int xsk_create_bpf_maps(struct xsk_socket *xsk)
563{
564 struct xsk_ctx *ctx = xsk->ctx;
565 int max_queues;
566 int fd;
567
568 max_queues = xsk_get_max_queues(xsk);
569 if (max_queues < 0)
570 return max_queues;
571
572 fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map",
573 sizeof(int), sizeof(int), max_queues, NULL);
574 if (fd < 0)
575 return fd;
576
577 ctx->xsks_map_fd = fd;
578
579 return 0;
580}
581
582static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
583{
584 struct xsk_ctx *ctx = xsk->ctx;
585
586 bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
587 close(ctx->xsks_map_fd);
588}
589
590static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
591{
592 __u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
593 __u32 map_len = sizeof(struct bpf_map_info);
594 struct bpf_prog_info prog_info = {};
595 struct xsk_ctx *ctx = xsk->ctx;
596 struct bpf_map_info map_info;
597 int fd, err;
598
599 err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
600 if (err)
601 return err;
602
603 num_maps = prog_info.nr_map_ids;
604
605 map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
606 if (!map_ids)
607 return -ENOMEM;
608
609 memset(&prog_info, 0, prog_len);
610 prog_info.nr_map_ids = num_maps;
611 prog_info.map_ids = (__u64)(unsigned long)map_ids;
612
613 err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
614 if (err)
615 goto out_map_ids;
616
617 ctx->xsks_map_fd = -1;
618
619 for (i = 0; i < prog_info.nr_map_ids; i++) {
620 fd = bpf_map_get_fd_by_id(map_ids[i]);
621 if (fd < 0)
622 continue;
623
624 memset(&map_info, 0, map_len);
625 err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
626 if (err) {
627 close(fd);
628 continue;
629 }
630
631 if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
632 ctx->xsks_map_fd = fd;
633 break;
634 }
635
636 close(fd);
637 }
638
639 if (ctx->xsks_map_fd == -1)
640 err = -ENOENT;
641
642out_map_ids:
643 free(map_ids);
644 return err;
645}
646
647static int xsk_set_bpf_maps(struct xsk_socket *xsk)
648{
649 struct xsk_ctx *ctx = xsk->ctx;
650
651 return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
652 &xsk->fd, 0);
653}
654
655static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd)
656{
657 struct bpf_link_info link_info;
658 __u32 link_len;
659 __u32 id = 0;
660 int err;
661 int fd;
662
663 while (true) {
664 err = bpf_link_get_next_id(id, &id);
665 if (err) {
666 if (errno == ENOENT) {
667 err = 0;
668 break;
669 }
670 pr_warn("can't get next link: %s\n", strerror(errno));
671 break;
672 }
673
674 fd = bpf_link_get_fd_by_id(id);
675 if (fd < 0) {
676 if (errno == ENOENT)
677 continue;
678 pr_warn("can't get link by id (%u): %s\n", id, strerror(errno));
679 err = -errno;
680 break;
681 }
682
683 link_len = sizeof(struct bpf_link_info);
684 memset(&link_info, 0, link_len);
685 err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len);
686 if (err) {
687 pr_warn("can't get link info: %s\n", strerror(errno));
688 close(fd);
689 break;
690 }
691 if (link_info.type == BPF_LINK_TYPE_XDP) {
692 if (link_info.xdp.ifindex == ifindex) {
693 *link_fd = fd;
694 if (prog_id)
695 *prog_id = link_info.prog_id;
696 break;
697 }
698 }
699 close(fd);
700 }
701
702 return err;
703}
704
705static bool xsk_probe_bpf_link(void)
706{
707 LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = XDP_FLAGS_SKB_MODE);
708 struct bpf_insn insns[2] = {
709 BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
710 BPF_EXIT_INSN()
711 };
712 int prog_fd, link_fd = -1, insn_cnt = ARRAY_SIZE(insns);
713 int ifindex_lo = 1;
714 bool ret = false;
715 int err;
716
717 err = xsk_link_lookup(ifindex_lo, NULL, &link_fd);
718 if (err)
719 return ret;
720
721 if (link_fd >= 0)
722 return true;
723
724 prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
725 if (prog_fd < 0)
726 return ret;
727
728 link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts);
729 close(prog_fd);
730
731 if (link_fd >= 0) {
732 ret = true;
733 close(link_fd);
734 }
735
736 return ret;
737}
738
739static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
740{
741 char ifname[IFNAMSIZ];
742 struct xsk_ctx *ctx;
743 char *interface;
744
745 ctx = calloc(1, sizeof(*ctx));
746 if (!ctx)
747 return -ENOMEM;
748
749 interface = if_indextoname(ifindex, &ifname[0]);
750 if (!interface) {
751 free(ctx);
752 return -errno;
753 }
754
755 ctx->ifindex = ifindex;
756 bpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
757
758 xsk->ctx = ctx;
759 xsk->ctx->has_bpf_link = xsk_probe_bpf_link();
760
761 return 0;
762}
763
764static int xsk_init_xdp_res(struct xsk_socket *xsk,
765 int *xsks_map_fd)
766{
767 struct xsk_ctx *ctx = xsk->ctx;
768 int err;
769
770 err = xsk_create_bpf_maps(xsk);
771 if (err)
772 return err;
773
774 err = xsk_load_xdp_prog(xsk);
775 if (err)
776 goto err_load_xdp_prog;
777
778 if (ctx->has_bpf_link)
779 err = xsk_create_bpf_link(xsk);
780 else
781 err = bpf_xdp_attach(xsk->ctx->ifindex, ctx->prog_fd,
782 xsk->config.xdp_flags, NULL);
783
784 if (err)
785 goto err_attach_xdp_prog;
786
787 if (!xsk->rx)
788 return err;
789
790 err = xsk_set_bpf_maps(xsk);
791 if (err)
792 goto err_set_bpf_maps;
793
794 return err;
795
796err_set_bpf_maps:
797 if (ctx->has_bpf_link)
798 close(ctx->link_fd);
799 else
800 bpf_xdp_detach(ctx->ifindex, 0, NULL);
801err_attach_xdp_prog:
802 close(ctx->prog_fd);
803err_load_xdp_prog:
804 xsk_delete_bpf_maps(xsk);
805 return err;
806}
807
808static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id)
809{
810 struct xsk_ctx *ctx = xsk->ctx;
811 int err;
812
813 ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
814 if (ctx->prog_fd < 0) {
815 err = -errno;
816 goto err_prog_fd;
817 }
818 err = xsk_lookup_bpf_maps(xsk);
819 if (err)
820 goto err_lookup_maps;
821
822 if (!xsk->rx)
823 return err;
824
825 err = xsk_set_bpf_maps(xsk);
826 if (err)
827 goto err_set_maps;
828
829 return err;
830
831err_set_maps:
832 close(ctx->xsks_map_fd);
833err_lookup_maps:
834 close(ctx->prog_fd);
835err_prog_fd:
836 if (ctx->has_bpf_link)
837 close(ctx->link_fd);
838 return err;
839}
840
841static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd)
842{
843 struct xsk_socket *xsk = _xdp;
844 struct xsk_ctx *ctx = xsk->ctx;
845 __u32 prog_id = 0;
846 int err;
847
848 if (ctx->has_bpf_link)
849 err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd);
850 else
851 err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
852
853 if (err)
854 return err;
855
856 err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) :
857 xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id);
858
859 if (!err && xsks_map_fd)
860 *xsks_map_fd = ctx->xsks_map_fd;
861
862 return err;
863}
864
865int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd)
866{
867 return __xsk_setup_xdp_prog(xsk, xsks_map_fd);
868}
869
870static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
871 __u32 queue_id)
872{
873 struct xsk_ctx *ctx;
874
875 if (list_empty(&umem->ctx_list))
876 return NULL;
877
878 list_for_each_entry(ctx, &umem->ctx_list, list) {
879 if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
880 ctx->refcount++;
881 return ctx;
882 }
883 }
884
885 return NULL;
886}
887
888static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
889{
890 struct xsk_umem *umem = ctx->umem;
891 struct xdp_mmap_offsets off;
892 int err;
893
894 if (--ctx->refcount)
895 return;
896
897 if (!unmap)
898 goto out_free;
899
900 err = xsk_get_mmap_offsets(umem->fd, &off);
901 if (err)
902 goto out_free;
903
904 munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
905 sizeof(__u64));
906 munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
907 sizeof(__u64));
908
909out_free:
910 list_del(&ctx->list);
911 free(ctx);
912}
913
914static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
915 struct xsk_umem *umem, int ifindex,
916 const char *ifname, __u32 queue_id,
917 struct xsk_ring_prod *fill,
918 struct xsk_ring_cons *comp)
919{
920 struct xsk_ctx *ctx;
921 int err;
922
923 ctx = calloc(1, sizeof(*ctx));
924 if (!ctx)
925 return NULL;
926
927 if (!umem->fill_save) {
928 err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
929 if (err) {
930 free(ctx);
931 return NULL;
932 }
933 } else if (umem->fill_save != fill || umem->comp_save != comp) {
934 /* Copy over rings to new structs. */
935 memcpy(fill, umem->fill_save, sizeof(*fill));
936 memcpy(comp, umem->comp_save, sizeof(*comp));
937 }
938
939 ctx->ifindex = ifindex;
940 ctx->refcount = 1;
941 ctx->umem = umem;
942 ctx->queue_id = queue_id;
943 bpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
944
945 ctx->fill = fill;
946 ctx->comp = comp;
947 list_add(&ctx->list, &umem->ctx_list);
948 ctx->has_bpf_link = xsk_probe_bpf_link();
949 return ctx;
950}
951
952static void xsk_destroy_xsk_struct(struct xsk_socket *xsk)
953{
954 free(xsk->ctx);
955 free(xsk);
956}
957
958int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd)
959{
960 xsk->ctx->xsks_map_fd = fd;
961 return xsk_set_bpf_maps(xsk);
962}
963
964int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd)
965{
966 struct xsk_socket *xsk;
967 int res;
968
969 xsk = calloc(1, sizeof(*xsk));
970 if (!xsk)
971 return -ENOMEM;
972
973 res = xsk_create_xsk_struct(ifindex, xsk);
974 if (res) {
975 free(xsk);
976 return -EINVAL;
977 }
978
979 res = __xsk_setup_xdp_prog(xsk, xsks_map_fd);
980
981 xsk_destroy_xsk_struct(xsk);
982
983 return res;
984}
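/* Callers that manage the XDP program themselves can pair this with
 * XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD. A minimal sketch; ifname/ifindex,
 * queue 0 and the umem/ring variables are placeholders:
 *
 *	struct xsk_socket_config cfg = {
 *		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 *		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
 *		.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD,
 *	};
 *	int xsks_map_fd;
 *
 *	err = xsk_setup_xdp_prog(ifindex, &xsks_map_fd);
 *	if (!err)
 *		err = xsk_socket__create(&xsk, ifname, 0, umem, &rx, &tx, &cfg);
 *	if (!err)
 *		err = xsk_socket__update_xskmap(xsk, xsks_map_fd);
 */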
985
986int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
987 const char *ifname,
988 __u32 queue_id, struct xsk_umem *umem,
989 struct xsk_ring_cons *rx,
990 struct xsk_ring_prod *tx,
991 struct xsk_ring_prod *fill,
992 struct xsk_ring_cons *comp,
993 const struct xsk_socket_config *usr_config)
994{
995 bool unmap, rx_setup_done = false, tx_setup_done = false;
996 void *rx_map = NULL, *tx_map = NULL;
997 struct sockaddr_xdp sxdp = {};
998 struct xdp_mmap_offsets off;
999 struct xsk_socket *xsk;
1000 struct xsk_ctx *ctx;
1001 int err, ifindex;
1002
1003 if (!umem || !xsk_ptr || !(rx || tx))
1004 return -EFAULT;
1005
1006 unmap = umem->fill_save != fill;
1007
1008 xsk = calloc(1, sizeof(*xsk));
1009 if (!xsk)
1010 return -ENOMEM;
1011
1012 err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
1013 if (err)
1014 goto out_xsk_alloc;
1015
1016 xsk->outstanding_tx = 0;
1017 ifindex = if_nametoindex(ifname);
1018 if (!ifindex) {
1019 err = -errno;
1020 goto out_xsk_alloc;
1021 }
1022
1023 if (umem->refcount++ > 0) {
1024 xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
1025 if (xsk->fd < 0) {
1026 err = -errno;
1027 goto out_xsk_alloc;
1028 }
1029 } else {
1030 xsk->fd = umem->fd;
1031 rx_setup_done = umem->rx_ring_setup_done;
1032 tx_setup_done = umem->tx_ring_setup_done;
1033 }
1034
1035 ctx = xsk_get_ctx(umem, ifindex, queue_id);
1036 if (!ctx) {
1037 if (!fill || !comp) {
1038 err = -EFAULT;
1039 goto out_socket;
1040 }
1041
1042 ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
1043 fill, comp);
1044 if (!ctx) {
1045 err = -ENOMEM;
1046 goto out_socket;
1047 }
1048 }
1049 xsk->ctx = ctx;
1050
1051 if (rx && !rx_setup_done) {
1052 err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
1053 &xsk->config.rx_size,
1054 sizeof(xsk->config.rx_size));
1055 if (err) {
1056 err = -errno;
1057 goto out_put_ctx;
1058 }
1059 if (xsk->fd == umem->fd)
1060 umem->rx_ring_setup_done = true;
1061 }
1062 if (tx && !tx_setup_done) {
1063 err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
1064 &xsk->config.tx_size,
1065 sizeof(xsk->config.tx_size));
1066 if (err) {
1067 err = -errno;
1068 goto out_put_ctx;
1069 }
1070 if (xsk->fd == umem->fd)
1071 umem->tx_ring_setup_done = true;
1072 }
1073
1074 err = xsk_get_mmap_offsets(xsk->fd, &off);
1075 if (err) {
1076 err = -errno;
1077 goto out_put_ctx;
1078 }
1079
1080 if (rx) {
1081 rx_map = mmap(NULL, off.rx.desc +
1082 xsk->config.rx_size * sizeof(struct xdp_desc),
1083 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
1084 xsk->fd, XDP_PGOFF_RX_RING);
1085 if (rx_map == MAP_FAILED) {
1086 err = -errno;
1087 goto out_put_ctx;
1088 }
1089
1090 rx->mask = xsk->config.rx_size - 1;
1091 rx->size = xsk->config.rx_size;
1092 rx->producer = rx_map + off.rx.producer;
1093 rx->consumer = rx_map + off.rx.consumer;
1094 rx->flags = rx_map + off.rx.flags;
1095 rx->ring = rx_map + off.rx.desc;
1096 rx->cached_prod = *rx->producer;
1097 rx->cached_cons = *rx->consumer;
1098 }
1099 xsk->rx = rx;
1100
1101 if (tx) {
1102 tx_map = mmap(NULL, off.tx.desc +
1103 xsk->config.tx_size * sizeof(struct xdp_desc),
1104 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
1105 xsk->fd, XDP_PGOFF_TX_RING);
1106 if (tx_map == MAP_FAILED) {
1107 err = -errno;
1108 goto out_mmap_rx;
1109 }
1110
1111 tx->mask = xsk->config.tx_size - 1;
1112 tx->size = xsk->config.tx_size;
1113 tx->producer = tx_map + off.tx.producer;
1114 tx->consumer = tx_map + off.tx.consumer;
1115 tx->flags = tx_map + off.tx.flags;
1116 tx->ring = tx_map + off.tx.desc;
1117 tx->cached_prod = *tx->producer;
1118 /* cached_cons is r->size bigger than the real consumer pointer
1119 * See xsk_prod_nb_free
1120 */
1121 tx->cached_cons = *tx->consumer + xsk->config.tx_size;
1122 }
1123 xsk->tx = tx;
1124
1125 sxdp.sxdp_family = PF_XDP;
1126 sxdp.sxdp_ifindex = ctx->ifindex;
1127 sxdp.sxdp_queue_id = ctx->queue_id;
1128 if (umem->refcount > 1) {
1129 sxdp.sxdp_flags |= XDP_SHARED_UMEM;
1130 sxdp.sxdp_shared_umem_fd = umem->fd;
1131 } else {
1132 sxdp.sxdp_flags = xsk->config.bind_flags;
1133 }
1134
1135 err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
1136 if (err) {
1137 err = -errno;
1138 goto out_mmap_tx;
1139 }
1140
1141 if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
1142 err = __xsk_setup_xdp_prog(xsk, NULL);
1143 if (err)
1144 goto out_mmap_tx;
1145 }
1146
1147 *xsk_ptr = xsk;
1148 umem->fill_save = NULL;
1149 umem->comp_save = NULL;
1150 return 0;
1151
1152out_mmap_tx:
1153 if (tx)
1154 munmap(tx_map, off.tx.desc +
1155 xsk->config.tx_size * sizeof(struct xdp_desc));
1156out_mmap_rx:
1157 if (rx)
1158 munmap(rx_map, off.rx.desc +
1159 xsk->config.rx_size * sizeof(struct xdp_desc));
1160out_put_ctx:
1161 xsk_put_ctx(ctx, unmap);
1162out_socket:
1163 if (--umem->refcount)
1164 close(xsk->fd);
1165out_xsk_alloc:
1166 free(xsk);
1167 return err;
1168}
1169
1170int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
1171 __u32 queue_id, struct xsk_umem *umem,
1172 struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
1173 const struct xsk_socket_config *usr_config)
1174{
1175 if (!umem)
1176 return -EFAULT;
1177
1178 return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
1179 rx, tx, umem->fill_save,
1180 umem->comp_save, usr_config);
1181}
1182
1183int xsk_umem__delete(struct xsk_umem *umem)
1184{
1185 struct xdp_mmap_offsets off;
1186 int err;
1187
1188 if (!umem)
1189 return 0;
1190
1191 if (umem->refcount)
1192 return -EBUSY;
1193
1194 err = xsk_get_mmap_offsets(umem->fd, &off);
1195 if (!err && umem->fill_save && umem->comp_save) {
1196 munmap(umem->fill_save->ring - off.fr.desc,
1197 off.fr.desc + umem->config.fill_size * sizeof(__u64));
1198 munmap(umem->comp_save->ring - off.cr.desc,
1199 off.cr.desc + umem->config.comp_size * sizeof(__u64));
1200 }
1201
1202 close(umem->fd);
1203 free(umem);
1204
1205 return 0;
1206}
1207
1208void xsk_socket__delete(struct xsk_socket *xsk)
1209{
1210 size_t desc_sz = sizeof(struct xdp_desc);
1211 struct xdp_mmap_offsets off;
1212 struct xsk_umem *umem;
1213 struct xsk_ctx *ctx;
1214 int err;
1215
1216 if (!xsk)
1217 return;
1218
1219 ctx = xsk->ctx;
1220 umem = ctx->umem;
1221
1222 if (ctx->refcount == 1) {
1223 xsk_delete_bpf_maps(xsk);
1224 close(ctx->prog_fd);
1225 if (ctx->has_bpf_link)
1226 close(ctx->link_fd);
1227 }
1228
1229 xsk_put_ctx(ctx, true);
1230
1231 err = xsk_get_mmap_offsets(xsk->fd, &off);
1232 if (!err) {
1233 if (xsk->rx) {
1234 munmap(xsk->rx->ring - off.rx.desc,
1235 off.rx.desc + xsk->config.rx_size * desc_sz);
1236 }
1237 if (xsk->tx) {
1238 munmap(xsk->tx->ring - off.tx.desc,
1239 off.tx.desc + xsk->config.tx_size * desc_sz);
1240 }
1241 }
1242
1243 umem->refcount--;
1244 /* Do not close an fd that also has an associated umem connected
1245 * to it.
1246 */
1247 if (xsk->fd != umem->fd)
1248 close(xsk->fd);
1249 free(xsk);
1250}