   1// SPDX-License-Identifier: GPL-2.0
   2/* XDP sockets
   3 *
   4 * AF_XDP sockets provide a channel between XDP programs and userspace
   5 * applications.
   6 * Copyright(c) 2018 Intel Corporation.
   7 *
   8 * Author(s): Björn Töpel <bjorn.topel@intel.com>
   9 *	      Magnus Karlsson <magnus.karlsson@intel.com>
  10 */
  11
  12#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
  13
  14#include <linux/if_xdp.h>
  15#include <linux/init.h>
  16#include <linux/sched/mm.h>
  17#include <linux/sched/signal.h>
  18#include <linux/sched/task.h>
  19#include <linux/socket.h>
  20#include <linux/file.h>
  21#include <linux/uaccess.h>
  22#include <linux/net.h>
  23#include <linux/netdevice.h>
  24#include <linux/rculist.h>
  25#include <linux/vmalloc.h>
  26#include <net/xdp_sock_drv.h>
  27#include <net/busy_poll.h>
  28#include <net/netdev_rx_queue.h>
  29#include <net/xdp.h>
  30
  31#include "xsk_queue.h"
  32#include "xdp_umem.h"
  33#include "xsk.h"
  34
  35#define TX_BATCH_SIZE 32
  36#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)
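/* TX_BATCH_SIZE bounds how many descriptors one __xsk_generic_xmit() call
 * turns into skbs before returning -EAGAIN, and MAX_PER_SOCKET_BUDGET bounds
 * how many descriptors a single socket may consume in xsk_tx_peek_desc()
 * before other sockets sharing the same pool get a turn.
 */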
  37
  38static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
  39
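/* The need_wakeup protocol: when the pool was created with
 * XDP_USE_NEED_WAKEUP, the helpers below raise or clear XDP_RING_NEED_WAKEUP
 * in the fill/Tx ring flags so that user space only issues a syscall when the
 * driver actually needs a kick. A minimal user-space check might look like
 * the sketch below (illustrative only; "xsk_fd" and the mapped "tx_ring" are
 * assumptions, not part of this file):
 *
 *	if (tx_ring->flags & XDP_RING_NEED_WAKEUP)
 *		sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
 */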
  40void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
  41{
  42	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
  43		return;
  44
  45	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
  46	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
  47}
  48EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
  49
  50void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
  51{
  52	struct xdp_sock *xs;
  53
  54	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
  55		return;
  56
  57	rcu_read_lock();
  58	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
  59		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
  60	}
  61	rcu_read_unlock();
  62
  63	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
  64}
  65EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
  66
  67void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
  68{
  69	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
  70		return;
  71
  72	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
  73	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
  74}
  75EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
  76
  77void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
  78{
  79	struct xdp_sock *xs;
  80
  81	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
  82		return;
  83
  84	rcu_read_lock();
  85	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
  86		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
  87	}
  88	rcu_read_unlock();
  89
  90	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
  91}
  92EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
  93
  94bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
  95{
  96	return pool->uses_need_wakeup;
  97}
  98EXPORT_SYMBOL(xsk_uses_need_wakeup);
  99
 100struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
 101					    u16 queue_id)
 102{
 103	if (queue_id < dev->real_num_rx_queues)
 104		return dev->_rx[queue_id].pool;
 105	if (queue_id < dev->real_num_tx_queues)
 106		return dev->_tx[queue_id].pool;
 107
 108	return NULL;
 109}
 110EXPORT_SYMBOL(xsk_get_pool_from_qid);
 111
 112void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
 113{
 114	if (queue_id < dev->num_rx_queues)
 115		dev->_rx[queue_id].pool = NULL;
 116	if (queue_id < dev->num_tx_queues)
 117		dev->_tx[queue_id].pool = NULL;
 118}
 119
 120/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
 121 * not know if the device has more tx queues than rx, or the opposite.
 122 * This might also change during run time.
 123 */
 124int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
 125			u16 queue_id)
 126{
 127	if (queue_id >= max_t(unsigned int,
 128			      dev->real_num_rx_queues,
 129			      dev->real_num_tx_queues))
 130		return -EINVAL;
 131
 132	if (queue_id < dev->real_num_rx_queues)
 133		dev->_rx[queue_id].pool = pool;
 134	if (queue_id < dev->real_num_tx_queues)
 135		dev->_tx[queue_id].pool = pool;
 136
 137	return 0;
 138}
 139
 140static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
 141			u32 flags)
 142{
 143	u64 addr;
 144	int err;
 145
 146	addr = xp_get_handle(xskb);
 147	err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
 148	if (err) {
 149		xs->rx_queue_full++;
 150		return err;
 151	}
 152
 153	xp_release(xskb);
 154	return 0;
 155}
 156
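/* Zero-copy Rx of a potentially multi-buffer frame: the head buffer is posted
 * with XDP_PKT_CONTD when fragments follow, the remaining fragments are taken
 * from pool->xskb_list, and the final descriptor is posted with the flag
 * cleared so user space can detect the end of the packet.
 */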
 157static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 158{
 159	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
 160	u32 frags = xdp_buff_has_frags(xdp);
 161	struct xdp_buff_xsk *pos, *tmp;
 162	struct list_head *xskb_list;
 163	u32 contd = 0;
 164	int err;
 165
 166	if (frags)
 167		contd = XDP_PKT_CONTD;
 168
 169	err = __xsk_rcv_zc(xs, xskb, len, contd);
 170	if (err)
 171		goto err;
 172	if (likely(!frags))
 173		return 0;
 174
 175	xskb_list = &xskb->pool->xskb_list;
 176	list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
 177		if (list_is_singular(xskb_list))
 178			contd = 0;
 179		len = pos->xdp.data_end - pos->xdp.data;
 180		err = __xsk_rcv_zc(xs, pos, len, contd);
 181		if (err)
 182			goto err;
 183		list_del(&pos->xskb_list_node);
 184	}
 185
 186	return 0;
 187err:
 188	xsk_buff_free(xdp);
 189	return err;
 190}
 191
 192static void *xsk_copy_xdp_start(struct xdp_buff *from)
 193{
 194	if (unlikely(xdp_data_meta_unsupported(from)))
 195		return from->data;
 196	else
 197		return from->data_meta;
 198}
 199
 200static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
 201			u32 *from_len, skb_frag_t **frag, u32 rem)
 202{
 203	u32 copied = 0;
 204
 205	while (1) {
 206		u32 copy_len = min_t(u32, *from_len, to_len);
 207
 208		memcpy(to, *from, copy_len);
 209		copied += copy_len;
 210		if (rem == copied)
 211			return copied;
 212
 213		if (*from_len == copy_len) {
 214			*from = skb_frag_address(*frag);
 215			*from_len = skb_frag_size((*frag)++);
 216		} else {
 217			*from += copy_len;
 218			*from_len -= copy_len;
 219		}
 220		if (to_len == copy_len)
 221			return copied;
 222
 223		to_len -= copy_len;
 224		to += copy_len;
 225	}
 226}
 227
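/* Copy-mode Rx: the received xdp_buff does not come from this pool, so its
 * payload (and metadata) is copied into freshly allocated pool buffers. A
 * frame that fits in one Rx buffer takes the short path; larger or fragmented
 * frames are split across ceil(len / frame_size) buffers chained with
 * XDP_PKT_CONTD, after checking up front that both the pool and the Rx ring
 * have room for all of them.
 */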
 228static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 229{
 230	u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
 231	void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
 232	u32 from_len, meta_len, rem, num_desc;
 233	struct xdp_buff_xsk *xskb;
 234	struct xdp_buff *xsk_xdp;
 235	skb_frag_t *frag;
 236
 237	from_len = xdp->data_end - copy_from;
 238	meta_len = xdp->data - copy_from;
 239	rem = len + meta_len;
 240
 241	if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
 242		int err;
 243
 244		xsk_xdp = xsk_buff_alloc(xs->pool);
 245		if (!xsk_xdp) {
 246			xs->rx_dropped++;
 247			return -ENOMEM;
 248		}
 249		memcpy(xsk_xdp->data - meta_len, copy_from, rem);
 250		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
 251		err = __xsk_rcv_zc(xs, xskb, len, 0);
 252		if (err) {
 253			xsk_buff_free(xsk_xdp);
 254			return err;
 255		}
 256
 257		return 0;
 258	}
 259
 260	num_desc = (len - 1) / frame_size + 1;
 261
 262	if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
 263		xs->rx_dropped++;
 264		return -ENOMEM;
 265	}
 266	if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
 267		xs->rx_queue_full++;
 268		return -ENOBUFS;
 269	}
 270
 271	if (xdp_buff_has_frags(xdp)) {
 272		struct skb_shared_info *sinfo;
 273
 274		sinfo = xdp_get_shared_info_from_buff(xdp);
  275		frag = &sinfo->frags[0];
 276	}
 277
 278	do {
 279		u32 to_len = frame_size + meta_len;
 280		u32 copied;
 281
 282		xsk_xdp = xsk_buff_alloc(xs->pool);
 283		copy_to = xsk_xdp->data - meta_len;
 284
 285		copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
 286		rem -= copied;
 287
 288		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
 289		__xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
 290		meta_len = 0;
 291	} while (rem);
 292
 293	return 0;
 294}
 295
 296static bool xsk_tx_writeable(struct xdp_sock *xs)
 297{
 298	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
 299		return false;
 300
 301	return true;
 302}
 303
 304static bool xsk_is_bound(struct xdp_sock *xs)
 305{
 306	if (READ_ONCE(xs->state) == XSK_BOUND) {
 307		/* Matches smp_wmb() in bind(). */
 308		smp_rmb();
 309		return true;
 310	}
 311	return false;
 312}
 313
 314static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 315{
 316	if (!xsk_is_bound(xs))
 317		return -ENXIO;
 318
 319	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 320		return -EINVAL;
 321
 322	if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
 323		xs->rx_dropped++;
 324		return -ENOSPC;
 325	}
 326
 327	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
 328	return 0;
 329}
 330
 331static void xsk_flush(struct xdp_sock *xs)
 332{
 333	xskq_prod_submit(xs->rx);
 334	__xskq_cons_release(xs->pool->fq);
 335	sock_def_readable(&xs->sk);
 336}
 337
 338int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 339{
 340	u32 len = xdp_get_buff_len(xdp);
 341	int err;
 342
 343	spin_lock_bh(&xs->rx_lock);
 344	err = xsk_rcv_check(xs, xdp, len);
 345	if (!err) {
 346		err = __xsk_rcv(xs, xdp, len);
 347		xsk_flush(xs);
 348	}
 349	spin_unlock_bh(&xs->rx_lock);
 350	return err;
 351}
 352
 353static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 354{
 355	u32 len = xdp_get_buff_len(xdp);
 356	int err;
 357
 358	err = xsk_rcv_check(xs, xdp, len);
 359	if (err)
 360		return err;
 361
 362	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
 363		len = xdp->data_end - xdp->data;
 364		return xsk_rcv_zc(xs, xdp, len);
 365	}
 366
 367	err = __xsk_rcv(xs, xdp, len);
 368	if (!err)
 369		xdp_return_buff(xdp);
 370	return err;
 371}
 372
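/* XDP_REDIRECT into an XSKMAP: the target socket receives the frame and is
 * queued on a per-CPU flush list; __xsk_map_flush() is run by the core at the
 * end of the NAPI poll to submit the Rx rings and wake the sockets in one go.
 */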
 373int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
 374{
 375	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
 376	int err;
 377
 378	err = xsk_rcv(xs, xdp);
 379	if (err)
 380		return err;
 381
 382	if (!xs->flush_node.prev)
 383		list_add(&xs->flush_node, flush_list);
 384
 385	return 0;
 386}
 387
 388void __xsk_map_flush(void)
 389{
 390	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
 391	struct xdp_sock *xs, *tmp;
 392
 393	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
 394		xsk_flush(xs);
 395		__list_del_clearprev(&xs->flush_node);
 396	}
 397}
 398
 399#ifdef CONFIG_DEBUG_NET
 400bool xsk_map_check_flush(void)
 401{
 402	if (list_empty(this_cpu_ptr(&xskmap_flush_list)))
 403		return false;
 404	__xsk_map_flush();
 405	return true;
 406}
 407#endif
 408
 409void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
 410{
 411	xskq_prod_submit_n(pool->cq, nb_entries);
 412}
 413EXPORT_SYMBOL(xsk_tx_completed);
 414
 415void xsk_tx_release(struct xsk_buff_pool *pool)
 416{
 417	struct xdp_sock *xs;
 418
 419	rcu_read_lock();
 420	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
 421		__xskq_cons_release(xs->tx);
 422		if (xsk_tx_writeable(xs))
 423			xs->sk.sk_write_space(&xs->sk);
 424	}
 425	rcu_read_unlock();
 426}
 427EXPORT_SYMBOL(xsk_tx_release);
 428
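/* A rough sketch of how a zero-copy driver's Tx path might consume
 * descriptors (illustrative only; "ring", "free_hw_slots" and my_hw_post_tx()
 * are placeholders, not real kernel APIs):
 *
 *	struct xdp_desc desc;
 *
 *	while (free_hw_slots && xsk_tx_peek_desc(pool, &desc)) {
 *		dma_addr_t dma = xsk_buff_raw_get_dma(pool, desc.addr);
 *
 *		my_hw_post_tx(ring, dma, desc.len);
 *		free_hw_slots--;
 *	}
 *	xsk_tx_release(pool);
 *
 * Once the hardware has sent n frames, the driver calls
 * xsk_tx_completed(pool, n) to move the entries to the completion ring.
 */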
 429bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
 430{
 431	bool budget_exhausted = false;
 432	struct xdp_sock *xs;
 433
 434	rcu_read_lock();
 435again:
 436	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
 437		if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) {
 438			budget_exhausted = true;
 439			continue;
 440		}
 441
 442		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
 443			if (xskq_has_descs(xs->tx))
 444				xskq_cons_release(xs->tx);
 445			continue;
 446		}
 447
 448		xs->tx_budget_spent++;
 449
 450		/* This is the backpressure mechanism for the Tx path.
 451		 * Reserve space in the completion queue and only proceed
 452		 * if there is space in it. This avoids having to implement
 453		 * any buffering in the Tx path.
 454		 */
 455		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
 456			goto out;
 457
 458		xskq_cons_release(xs->tx);
 459		rcu_read_unlock();
 460		return true;
 461	}
 462
 463	if (budget_exhausted) {
 464		list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
 465			xs->tx_budget_spent = 0;
 466
 467		budget_exhausted = false;
 468		goto again;
 469	}
 470
 471out:
 472	rcu_read_unlock();
 473	return false;
 474}
 475EXPORT_SYMBOL(xsk_tx_peek_desc);
 476
 477static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
 478{
 479	struct xdp_desc *descs = pool->tx_descs;
 480	u32 nb_pkts = 0;
 481
 482	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
 483		nb_pkts++;
 484
 485	xsk_tx_release(pool);
 486	return nb_pkts;
 487}
 488
 489u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
 490{
 491	struct xdp_sock *xs;
 492
 493	rcu_read_lock();
 494	if (!list_is_singular(&pool->xsk_tx_list)) {
 495		/* Fallback to the non-batched version */
 496		rcu_read_unlock();
 497		return xsk_tx_peek_release_fallback(pool, nb_pkts);
 498	}
 499
 500	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
 501	if (!xs) {
 502		nb_pkts = 0;
 503		goto out;
 504	}
 505
 506	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
 507
 508	/* This is the backpressure mechanism for the Tx path. Try to
 509	 * reserve space in the completion queue for all packets, but
 510	 * if there are fewer slots available, just process that many
 511	 * packets. This avoids having to implement any buffering in
 512	 * the Tx path.
 513	 */
 514	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
 515	if (!nb_pkts)
 516		goto out;
 517
 518	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
 519	if (!nb_pkts) {
 520		xs->tx->queue_empty_descs++;
 521		goto out;
 522	}
 523
 524	__xskq_cons_release(xs->tx);
 525	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
 526	xs->sk.sk_write_space(&xs->sk);
 527
 528out:
 529	rcu_read_unlock();
 530	return nb_pkts;
 531}
 532EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
 533
 534static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 535{
 536	struct net_device *dev = xs->dev;
 537
 538	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 539}
 540
 541static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr)
 542{
 543	unsigned long flags;
 544	int ret;
 545
 546	spin_lock_irqsave(&xs->pool->cq_lock, flags);
 547	ret = xskq_prod_reserve_addr(xs->pool->cq, addr);
 548	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 549
 550	return ret;
 551}
 552
 553static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n)
 554{
 555	unsigned long flags;
 556
 557	spin_lock_irqsave(&xs->pool->cq_lock, flags);
 558	xskq_prod_submit_n(xs->pool->cq, n);
 559	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 560}
 561
 562static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n)
 563{
 564	unsigned long flags;
 565
 566	spin_lock_irqsave(&xs->pool->cq_lock, flags);
 567	xskq_prod_cancel_n(xs->pool->cq, n);
 568	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 569}
 570
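/* For copy-mode Tx, skb_shinfo(skb)->destructor_arg is (ab)used to carry the
 * number of Tx descriptors the skb was built from (one per descriptor
 * consumed). The destructor submits that many completion-ring entries, and
 * the error paths cancel the same number.
 */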
 571static u32 xsk_get_num_desc(struct sk_buff *skb)
 572{
 573	return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
 574}
 575
 576static void xsk_destruct_skb(struct sk_buff *skb)
 577{
 578	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
 579
 580	if (compl->tx_timestamp) {
 581		/* sw completion timestamp, not a real one */
 582		*compl->tx_timestamp = ktime_get_tai_fast_ns();
 583	}
 584
 585	xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb));
 586	sock_wfree(skb);
 587}
 588
 589static void xsk_set_destructor_arg(struct sk_buff *skb)
 590{
 591	long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
 592
 593	skb_shinfo(skb)->destructor_arg = (void *)num;
 594}
 595
 596static void xsk_consume_skb(struct sk_buff *skb)
 597{
 598	struct xdp_sock *xs = xdp_sk(skb->sk);
 599
 600	skb->destructor = sock_wfree;
 601	xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb));
 602	/* Free skb without triggering the perf drop trace */
 603	consume_skb(skb);
 604	xs->skb = NULL;
 605}
 606
 607static void xsk_drop_skb(struct sk_buff *skb)
 608{
 609	xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
 610	xsk_consume_skb(skb);
 611}
 612
 613static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 614					      struct xdp_desc *desc)
 615{
 616	struct xsk_buff_pool *pool = xs->pool;
 617	u32 hr, len, ts, offset, copy, copied;
 618	struct sk_buff *skb = xs->skb;
 619	struct page *page;
 620	void *buffer;
 621	int err, i;
 622	u64 addr;
 623
 624	if (!skb) {
 625		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
 626
 627		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
 628		if (unlikely(!skb))
 629			return ERR_PTR(err);
 630
 631		skb_reserve(skb, hr);
 632	}
 633
 634	addr = desc->addr;
 635	len = desc->len;
 636	ts = pool->unaligned ? len : pool->chunk_size;
 637
 638	buffer = xsk_buff_raw_get_data(pool, addr);
 639	offset = offset_in_page(buffer);
 640	addr = buffer - pool->addrs;
 641
 642	for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
 643		if (unlikely(i >= MAX_SKB_FRAGS))
 644			return ERR_PTR(-EOVERFLOW);
 645
 646		page = pool->umem->pgs[addr >> PAGE_SHIFT];
 647		get_page(page);
 648
 649		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
 650		skb_fill_page_desc(skb, i, page, offset, copy);
 651
 652		copied += copy;
 653		addr += copy;
 654		offset = 0;
 655	}
 656
 657	skb->len += len;
 658	skb->data_len += len;
 659	skb->truesize += ts;
 660
 661	refcount_add(ts, &xs->sk.sk_wmem_alloc);
 662
 663	return skb;
 664}
 665
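/* Copy-mode Tx skb construction. The first descriptor either becomes a linear
 * skb (payload copied in) or, for IFF_TX_SKB_NO_LINEAR devices, a zerocopy
 * skb whose frags point straight into the umem. Continuation descriptors of a
 * multi-buffer packet are copied into new pages appended as frags. If the
 * first descriptor carries XDP_TX_METADATA, the metadata that precedes the
 * payload in the umem is validated and may request a Tx timestamp or checksum
 * offload.
 */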
 666static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 667				     struct xdp_desc *desc)
 668{
 669	struct xsk_tx_metadata *meta = NULL;
 670	struct net_device *dev = xs->dev;
 671	struct sk_buff *skb = xs->skb;
 672	bool first_frag = false;
 673	int err;
 674
 675	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
 676		skb = xsk_build_skb_zerocopy(xs, desc);
 677		if (IS_ERR(skb)) {
 678			err = PTR_ERR(skb);
 679			goto free_err;
 680		}
 681	} else {
 682		u32 hr, tr, len;
 683		void *buffer;
 684
 685		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
 686		len = desc->len;
 687
 688		if (!skb) {
 689			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
 690			tr = dev->needed_tailroom;
 691			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
 692			if (unlikely(!skb))
 693				goto free_err;
 694
 695			skb_reserve(skb, hr);
 696			skb_put(skb, len);
 697
 698			err = skb_store_bits(skb, 0, buffer, len);
 699			if (unlikely(err)) {
 700				kfree_skb(skb);
 701				goto free_err;
 702			}
 703
 704			first_frag = true;
 705		} else {
 706			int nr_frags = skb_shinfo(skb)->nr_frags;
 707			struct page *page;
 708			u8 *vaddr;
 709
 710			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
 711				err = -EOVERFLOW;
 712				goto free_err;
 713			}
 714
 715			page = alloc_page(xs->sk.sk_allocation);
 716			if (unlikely(!page)) {
 717				err = -EAGAIN;
 718				goto free_err;
 719			}
 720
 721			vaddr = kmap_local_page(page);
 722			memcpy(vaddr, buffer, len);
 723			kunmap_local(vaddr);
 724
 725			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
 726			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
 727		}
 728
 729		if (first_frag && desc->options & XDP_TX_METADATA) {
 730			if (unlikely(xs->pool->tx_metadata_len == 0)) {
 731				err = -EINVAL;
 732				goto free_err;
 733			}
 734
 735			meta = buffer - xs->pool->tx_metadata_len;
 736			if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
 737				err = -EINVAL;
 738				goto free_err;
 739			}
 740
 741			if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
 742				if (unlikely(meta->request.csum_start +
 743					     meta->request.csum_offset +
 744					     sizeof(__sum16) > len)) {
 745					err = -EINVAL;
 746					goto free_err;
 747				}
 748
 749				skb->csum_start = hr + meta->request.csum_start;
 750				skb->csum_offset = meta->request.csum_offset;
 751				skb->ip_summed = CHECKSUM_PARTIAL;
 752
 753				if (unlikely(xs->pool->tx_sw_csum)) {
 754					err = skb_checksum_help(skb);
 755					if (err)
 756						goto free_err;
 757				}
 758			}
 759		}
 760	}
 761
 762	skb->dev = dev;
 763	skb->priority = READ_ONCE(xs->sk.sk_priority);
 764	skb->mark = READ_ONCE(xs->sk.sk_mark);
 765	skb->destructor = xsk_destruct_skb;
 766	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
 767	xsk_set_destructor_arg(skb);
 768
 769	return skb;
 770
 771free_err:
 772	if (err == -EOVERFLOW) {
 773		/* Drop the packet */
 774		xsk_set_destructor_arg(xs->skb);
 775		xsk_drop_skb(xs->skb);
 776		xskq_cons_release(xs->tx);
 777	} else {
 778		/* Let application retry */
 779		xsk_cq_cancel_locked(xs, 1);
 780	}
 781
 782	return ERR_PTR(err);
 783}
 784
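/* The generic (copy-mode) transmit loop: up to TX_BATCH_SIZE descriptors are
 * peeked from the Tx ring, a completion-queue slot is reserved for each as
 * backpressure, the descriptor is turned into an skb and handed to
 * __dev_direct_xmit() on the bound queue. Multi-buffer packets accumulate in
 * xs->skb until the descriptor without XDP_PKT_CONTD arrives.
 */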
 785static int __xsk_generic_xmit(struct sock *sk)
 786{
 787	struct xdp_sock *xs = xdp_sk(sk);
 788	u32 max_batch = TX_BATCH_SIZE;
 789	bool sent_frame = false;
 790	struct xdp_desc desc;
 791	struct sk_buff *skb;
 792	int err = 0;
 793
 794	mutex_lock(&xs->mutex);
 795
 796	/* Since we dropped the RCU read lock, the socket state might have changed. */
 797	if (unlikely(!xsk_is_bound(xs))) {
 798		err = -ENXIO;
 799		goto out;
 800	}
 801
 802	if (xs->queue_id >= xs->dev->real_num_tx_queues)
 803		goto out;
 804
 805	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
 806		if (max_batch-- == 0) {
 807			err = -EAGAIN;
 808			goto out;
 809		}
 810
 811		/* This is the backpressure mechanism for the Tx path.
 812		 * Reserve space in the completion queue and only proceed
 813		 * if there is space in it. This avoids having to implement
 814		 * any buffering in the Tx path.
 815		 */
 816		if (xsk_cq_reserve_addr_locked(xs, desc.addr))
 817			goto out;
 818
 819		skb = xsk_build_skb(xs, &desc);
 820		if (IS_ERR(skb)) {
 821			err = PTR_ERR(skb);
 822			if (err != -EOVERFLOW)
 823				goto out;
 824			err = 0;
 825			continue;
 826		}
 827
 828		xskq_cons_release(xs->tx);
 829
 830		if (xp_mb_desc(&desc)) {
 831			xs->skb = skb;
 832			continue;
 833		}
 834
 835		err = __dev_direct_xmit(skb, xs->queue_id);
  836		if (err == NETDEV_TX_BUSY) {
 837			/* Tell user-space to retry the send */
 838			xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
 839			xsk_consume_skb(skb);
 840			err = -EAGAIN;
 841			goto out;
 842		}
 843
 844		/* Ignore NET_XMIT_CN as packet might have been sent */
 845		if (err == NET_XMIT_DROP) {
 846			/* SKB completed but not sent */
 847			err = -EBUSY;
 848			xs->skb = NULL;
 849			goto out;
 850		}
 851
 852		sent_frame = true;
 853		xs->skb = NULL;
 854	}
 855
 856	if (xskq_has_descs(xs->tx)) {
 857		if (xs->skb)
 858			xsk_drop_skb(xs->skb);
 859		xskq_cons_release(xs->tx);
 860	}
 861
 862out:
 863	if (sent_frame)
 864		if (xsk_tx_writeable(xs))
 865			sk->sk_write_space(sk);
 866
 867	mutex_unlock(&xs->mutex);
 868	return err;
 869}
 870
 871static int xsk_generic_xmit(struct sock *sk)
 872{
 873	int ret;
 874
 875	/* Drop the RCU lock since the SKB path might sleep. */
 876	rcu_read_unlock();
 877	ret = __xsk_generic_xmit(sk);
  878	/* Reacquire RCU lock before going into common code. */
 879	rcu_read_lock();
 880
 881	return ret;
 882}
 883
 884static bool xsk_no_wakeup(struct sock *sk)
 885{
 886#ifdef CONFIG_NET_RX_BUSY_POLL
 887	/* Prefer busy-polling, skip the wakeup. */
 888	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
 889		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
 890#else
 891	return false;
 892#endif
 893}
 894
 895static int xsk_check_common(struct xdp_sock *xs)
 896{
 897	if (unlikely(!xsk_is_bound(xs)))
 898		return -ENXIO;
 899	if (unlikely(!(xs->dev->flags & IFF_UP)))
 900		return -ENETDOWN;
 901
 902	return 0;
 903}
 904
 905static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 906{
 907	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
 908	struct sock *sk = sock->sk;
 909	struct xdp_sock *xs = xdp_sk(sk);
 910	struct xsk_buff_pool *pool;
 911	int err;
 912
 913	err = xsk_check_common(xs);
 914	if (err)
 915		return err;
 916	if (unlikely(need_wait))
 917		return -EOPNOTSUPP;
 918	if (unlikely(!xs->tx))
 919		return -ENOBUFS;
 920
 921	if (sk_can_busy_loop(sk)) {
 922		if (xs->zc)
 923			__sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool));
 924		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
 925	}
 926
 927	if (xs->zc && xsk_no_wakeup(sk))
 928		return 0;
 929
 930	pool = xs->pool;
 931	if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
 932		if (xs->zc)
 933			return xsk_wakeup(xs, XDP_WAKEUP_TX);
 934		return xsk_generic_xmit(sk);
 935	}
 936	return 0;
 937}
 938
 939static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 940{
 941	int ret;
 942
 943	rcu_read_lock();
 944	ret = __xsk_sendmsg(sock, m, total_len);
 945	rcu_read_unlock();
 946
 947	return ret;
 948}
 949
 950static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
 951{
 952	bool need_wait = !(flags & MSG_DONTWAIT);
 953	struct sock *sk = sock->sk;
 954	struct xdp_sock *xs = xdp_sk(sk);
 955	int err;
 956
 957	err = xsk_check_common(xs);
 958	if (err)
 959		return err;
 960	if (unlikely(!xs->rx))
 961		return -ENOBUFS;
 962	if (unlikely(need_wait))
 963		return -EOPNOTSUPP;
 964
 965	if (sk_can_busy_loop(sk))
 966		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
 967
 968	if (xsk_no_wakeup(sk))
 969		return 0;
 970
 971	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
 972		return xsk_wakeup(xs, XDP_WAKEUP_RX);
 973	return 0;
 974}
 975
 976static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
 977{
 978	int ret;
 979
 980	rcu_read_lock();
 981	ret = __xsk_recvmsg(sock, m, len, flags);
 982	rcu_read_unlock();
 983
 984	return ret;
 985}
 986
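/* poll() does double duty here: besides reporting EPOLLIN (Rx ring non-empty)
 * and EPOLLOUT (Tx ring at most half full), it kicks the driver in zero-copy
 * mode or runs the generic Tx path in copy mode whenever a need_wakeup flag
 * is set, so applications can drive transmission with poll() alone.
 */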
 987static __poll_t xsk_poll(struct file *file, struct socket *sock,
 988			     struct poll_table_struct *wait)
 989{
 990	__poll_t mask = 0;
 991	struct sock *sk = sock->sk;
 992	struct xdp_sock *xs = xdp_sk(sk);
 993	struct xsk_buff_pool *pool;
 994
 995	sock_poll_wait(file, sock, wait);
 996
 997	rcu_read_lock();
 998	if (xsk_check_common(xs))
 999		goto out;
1000
1001	pool = xs->pool;
1002
1003	if (pool->cached_need_wakeup) {
1004		if (xs->zc)
1005			xsk_wakeup(xs, pool->cached_need_wakeup);
1006		else if (xs->tx)
1007			/* Poll needs to drive Tx also in copy mode */
1008			xsk_generic_xmit(sk);
1009	}
1010
1011	if (xs->rx && !xskq_prod_is_empty(xs->rx))
1012		mask |= EPOLLIN | EPOLLRDNORM;
1013	if (xs->tx && xsk_tx_writeable(xs))
1014		mask |= EPOLLOUT | EPOLLWRNORM;
1015out:
1016	rcu_read_unlock();
1017	return mask;
1018}
1019
1020static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
1021			  bool umem_queue)
1022{
1023	struct xsk_queue *q;
1024
1025	if (entries == 0 || *queue || !is_power_of_2(entries))
1026		return -EINVAL;
1027
1028	q = xskq_create(entries, umem_queue);
1029	if (!q)
1030		return -ENOMEM;
1031
1032	/* Make sure queue is ready before it can be seen by others */
1033	smp_wmb();
1034	WRITE_ONCE(*queue, q);
1035	return 0;
1036}
1037
1038static void xsk_unbind_dev(struct xdp_sock *xs)
1039{
1040	struct net_device *dev = xs->dev;
1041
1042	if (xs->state != XSK_BOUND)
1043		return;
1044	WRITE_ONCE(xs->state, XSK_UNBOUND);
1045
1046	/* Wait for driver to stop using the xdp socket. */
1047	xp_del_xsk(xs->pool, xs);
1048	synchronize_net();
1049	dev_put(dev);
1050}
1051
1052static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
1053					      struct xdp_sock __rcu ***map_entry)
1054{
1055	struct xsk_map *map = NULL;
1056	struct xsk_map_node *node;
1057
1058	*map_entry = NULL;
1059
1060	spin_lock_bh(&xs->map_list_lock);
1061	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
1062					node);
1063	if (node) {
1064		bpf_map_inc(&node->map->map);
1065		map = node->map;
1066		*map_entry = node->map_entry;
1067	}
1068	spin_unlock_bh(&xs->map_list_lock);
1069	return map;
1070}
1071
1072static void xsk_delete_from_maps(struct xdp_sock *xs)
1073{
1074	/* This function removes the current XDP socket from all the
1075	 * maps it resides in. We need to take extra care here, due to
1076	 * the two locks involved. Each map has a lock synchronizing
1077	 * updates to the entries, and each socket has a lock that
1078	 * synchronizes access to the list of maps (map_list). For
1079	 * deadlock avoidance the locks need to be taken in the order
1080	 * "map lock"->"socket map list lock". We start off by
1081	 * accessing the socket map list, and take a reference to the
1082	 * map to guarantee existence between the
1083	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
1084	 * calls. Then we ask the map to remove the socket, which
1085	 * tries to remove the socket from the map. Note that there
1086	 * might be updates to the map between
1087	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
1088	 */
1089	struct xdp_sock __rcu **map_entry = NULL;
1090	struct xsk_map *map;
1091
1092	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
1093		xsk_map_try_sock_delete(map, xs, map_entry);
1094		bpf_map_put(&map->map);
1095	}
1096}
1097
1098static int xsk_release(struct socket *sock)
1099{
1100	struct sock *sk = sock->sk;
1101	struct xdp_sock *xs = xdp_sk(sk);
1102	struct net *net;
1103
1104	if (!sk)
1105		return 0;
1106
1107	net = sock_net(sk);
1108
1109	if (xs->skb)
1110		xsk_drop_skb(xs->skb);
1111
1112	mutex_lock(&net->xdp.lock);
1113	sk_del_node_init_rcu(sk);
1114	mutex_unlock(&net->xdp.lock);
1115
1116	sock_prot_inuse_add(net, sk->sk_prot, -1);
1117
1118	xsk_delete_from_maps(xs);
1119	mutex_lock(&xs->mutex);
1120	xsk_unbind_dev(xs);
1121	mutex_unlock(&xs->mutex);
1122
1123	xskq_destroy(xs->rx);
1124	xskq_destroy(xs->tx);
1125	xskq_destroy(xs->fq_tmp);
1126	xskq_destroy(xs->cq_tmp);
1127
1128	sock_orphan(sk);
1129	sock->sk = NULL;
1130
1131	sock_put(sk);
1132
1133	return 0;
1134}
1135
1136static struct socket *xsk_lookup_xsk_from_fd(int fd)
1137{
1138	struct socket *sock;
1139	int err;
1140
1141	sock = sockfd_lookup(fd, &err);
1142	if (!sock)
1143		return ERR_PTR(-ENOTSOCK);
1144
1145	if (sock->sk->sk_family != PF_XDP) {
1146		sockfd_put(sock);
1147		return ERR_PTR(-ENOPROTOOPT);
1148	}
1149
1150	return sock;
1151}
1152
1153static bool xsk_validate_queues(struct xdp_sock *xs)
1154{
1155	return xs->fq_tmp && xs->cq_tmp;
1156}
1157
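/* bind() attaches the socket (and its umem/buffer pool) to a specific netdev
 * queue. A minimal user-space call might look like the sketch below
 * (illustrative only; "xsk_fd" and "ifindex" are assumptions, not part of
 * this file; sxdp_flags may also carry XDP_COPY, XDP_ZEROCOPY or
 * XDP_SHARED_UMEM together with sxdp_shared_umem_fd):
 *
 *	struct sockaddr_xdp sxdp = {
 *		.sxdp_family   = AF_XDP,
 *		.sxdp_ifindex  = ifindex,
 *		.sxdp_queue_id = 0,
 *		.sxdp_flags    = XDP_USE_NEED_WAKEUP,
 *	};
 *
 *	bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 */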
1158static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
1159{
1160	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
1161	struct sock *sk = sock->sk;
1162	struct xdp_sock *xs = xdp_sk(sk);
1163	struct net_device *dev;
1164	int bound_dev_if;
1165	u32 flags, qid;
1166	int err = 0;
1167
1168	if (addr_len < sizeof(struct sockaddr_xdp))
1169		return -EINVAL;
1170	if (sxdp->sxdp_family != AF_XDP)
1171		return -EINVAL;
1172
1173	flags = sxdp->sxdp_flags;
1174	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
1175		      XDP_USE_NEED_WAKEUP | XDP_USE_SG))
1176		return -EINVAL;
1177
1178	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
1179	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
1180		return -EINVAL;
1181
1182	rtnl_lock();
1183	mutex_lock(&xs->mutex);
1184	if (xs->state != XSK_READY) {
1185		err = -EBUSY;
1186		goto out_release;
1187	}
1188
1189	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
1190	if (!dev) {
1191		err = -ENODEV;
1192		goto out_release;
1193	}
1194
1195	if (!xs->rx && !xs->tx) {
1196		err = -EINVAL;
1197		goto out_unlock;
1198	}
1199
1200	qid = sxdp->sxdp_queue_id;
1201
1202	if (flags & XDP_SHARED_UMEM) {
1203		struct xdp_sock *umem_xs;
1204		struct socket *sock;
1205
1206		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
1207		    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
1208			/* Cannot specify flags for shared sockets. */
1209			err = -EINVAL;
1210			goto out_unlock;
1211		}
1212
1213		if (xs->umem) {
 1214			/* We already have our own umem. */
1215			err = -EINVAL;
1216			goto out_unlock;
1217		}
1218
1219		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
1220		if (IS_ERR(sock)) {
1221			err = PTR_ERR(sock);
1222			goto out_unlock;
1223		}
1224
1225		umem_xs = xdp_sk(sock->sk);
1226		if (!xsk_is_bound(umem_xs)) {
1227			err = -EBADF;
1228			sockfd_put(sock);
1229			goto out_unlock;
1230		}
1231
1232		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
1233			/* Share the umem with another socket on another qid
1234			 * and/or device.
1235			 */
1236			xs->pool = xp_create_and_assign_umem(xs,
1237							     umem_xs->umem);
1238			if (!xs->pool) {
1239				err = -ENOMEM;
1240				sockfd_put(sock);
1241				goto out_unlock;
1242			}
1243
1244			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
1245						   qid);
1246			if (err) {
1247				xp_destroy(xs->pool);
1248				xs->pool = NULL;
1249				sockfd_put(sock);
1250				goto out_unlock;
1251			}
1252		} else {
1253			/* Share the buffer pool with the other socket. */
1254			if (xs->fq_tmp || xs->cq_tmp) {
1255				/* Do not allow setting your own fq or cq. */
1256				err = -EINVAL;
1257				sockfd_put(sock);
1258				goto out_unlock;
1259			}
1260
1261			xp_get_pool(umem_xs->pool);
1262			xs->pool = umem_xs->pool;
1263
 1264			/* If the underlying shared umem was created without a Tx
 1265			 * ring, allocate the Tx descs array that the Tx batching
 1266			 * API utilizes.
 1267			 */
1268			if (xs->tx && !xs->pool->tx_descs) {
1269				err = xp_alloc_tx_descs(xs->pool, xs);
1270				if (err) {
1271					xp_put_pool(xs->pool);
1272					xs->pool = NULL;
1273					sockfd_put(sock);
1274					goto out_unlock;
1275				}
1276			}
1277		}
1278
1279		xdp_get_umem(umem_xs->umem);
1280		WRITE_ONCE(xs->umem, umem_xs->umem);
1281		sockfd_put(sock);
1282	} else if (!xs->umem || !xsk_validate_queues(xs)) {
1283		err = -EINVAL;
1284		goto out_unlock;
1285	} else {
1286		/* This xsk has its own umem. */
1287		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
1288		if (!xs->pool) {
1289			err = -ENOMEM;
1290			goto out_unlock;
1291		}
1292
1293		err = xp_assign_dev(xs->pool, dev, qid, flags);
1294		if (err) {
1295			xp_destroy(xs->pool);
1296			xs->pool = NULL;
1297			goto out_unlock;
1298		}
1299	}
1300
1301	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
1302	xs->fq_tmp = NULL;
1303	xs->cq_tmp = NULL;
1304
1305	xs->dev = dev;
1306	xs->zc = xs->umem->zc;
1307	xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
1308	xs->queue_id = qid;
1309	xp_add_xsk(xs->pool, xs);
1310
1311out_unlock:
1312	if (err) {
1313		dev_put(dev);
1314	} else {
1315		/* Matches smp_rmb() in bind() for shared umem
1316		 * sockets, and xsk_is_bound().
1317		 */
1318		smp_wmb();
1319		WRITE_ONCE(xs->state, XSK_BOUND);
1320	}
1321out_release:
1322	mutex_unlock(&xs->mutex);
1323	rtnl_unlock();
1324	return err;
1325}
1326
1327struct xdp_umem_reg_v1 {
1328	__u64 addr; /* Start of packet data area */
1329	__u64 len; /* Length of packet data area */
1330	__u32 chunk_size;
1331	__u32 headroom;
1332};
1333
1334struct xdp_umem_reg_v2 {
1335	__u64 addr; /* Start of packet data area */
1336	__u64 len; /* Length of packet data area */
1337	__u32 chunk_size;
1338	__u32 headroom;
1339	__u32 flags;
1340};
1341
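/* Typical user-space setup order served by this function and xsk_mmap()
 * below: register the umem, size the rings, then map them. A hedged sketch
 * (illustrative only; "xsk_fd", "umem_area" and the sizes are assumptions):
 *
 *	struct xdp_umem_reg mr = {
 *		.addr       = (__u64)(uintptr_t)umem_area,
 *		.len        = 4096 * 4096,
 *		.chunk_size = 4096,
 *	};
 *	int entries = 2048;
 *
 *	setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *	setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
 *	setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &entries, sizeof(entries));
 *	setsockopt(xsk_fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));
 *	setsockopt(xsk_fd, SOL_XDP, XDP_TX_RING, &entries, sizeof(entries));
 *
 * followed by getsockopt(XDP_MMAP_OFFSETS) and an mmap() of each ring at
 * XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING, XDP_UMEM_PGOFF_FILL_RING and
 * XDP_UMEM_PGOFF_COMPLETION_RING.
 */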
1342static int xsk_setsockopt(struct socket *sock, int level, int optname,
1343			  sockptr_t optval, unsigned int optlen)
1344{
1345	struct sock *sk = sock->sk;
1346	struct xdp_sock *xs = xdp_sk(sk);
1347	int err;
1348
1349	if (level != SOL_XDP)
1350		return -ENOPROTOOPT;
1351
1352	switch (optname) {
1353	case XDP_RX_RING:
1354	case XDP_TX_RING:
1355	{
1356		struct xsk_queue **q;
1357		int entries;
1358
1359		if (optlen < sizeof(entries))
1360			return -EINVAL;
1361		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1362			return -EFAULT;
1363
1364		mutex_lock(&xs->mutex);
1365		if (xs->state != XSK_READY) {
1366			mutex_unlock(&xs->mutex);
1367			return -EBUSY;
1368		}
1369		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1370		err = xsk_init_queue(entries, q, false);
1371		if (!err && optname == XDP_TX_RING)
1372			/* Tx needs to be explicitly woken up the first time */
1373			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1374		mutex_unlock(&xs->mutex);
1375		return err;
1376	}
1377	case XDP_UMEM_REG:
1378	{
1379		size_t mr_size = sizeof(struct xdp_umem_reg);
1380		struct xdp_umem_reg mr = {};
1381		struct xdp_umem *umem;
1382
1383		if (optlen < sizeof(struct xdp_umem_reg_v1))
1384			return -EINVAL;
1385		else if (optlen < sizeof(struct xdp_umem_reg_v2))
1386			mr_size = sizeof(struct xdp_umem_reg_v1);
1387		else if (optlen < sizeof(mr))
1388			mr_size = sizeof(struct xdp_umem_reg_v2);
1389
1390		if (copy_from_sockptr(&mr, optval, mr_size))
1391			return -EFAULT;
1392
1393		mutex_lock(&xs->mutex);
1394		if (xs->state != XSK_READY || xs->umem) {
1395			mutex_unlock(&xs->mutex);
1396			return -EBUSY;
1397		}
1398
1399		umem = xdp_umem_create(&mr);
1400		if (IS_ERR(umem)) {
1401			mutex_unlock(&xs->mutex);
1402			return PTR_ERR(umem);
1403		}
1404
1405		/* Make sure umem is ready before it can be seen by others */
1406		smp_wmb();
1407		WRITE_ONCE(xs->umem, umem);
1408		mutex_unlock(&xs->mutex);
1409		return 0;
1410	}
1411	case XDP_UMEM_FILL_RING:
1412	case XDP_UMEM_COMPLETION_RING:
1413	{
1414		struct xsk_queue **q;
1415		int entries;
1416
1417		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1418			return -EFAULT;
1419
1420		mutex_lock(&xs->mutex);
1421		if (xs->state != XSK_READY) {
1422			mutex_unlock(&xs->mutex);
1423			return -EBUSY;
1424		}
1425
1426		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1427			&xs->cq_tmp;
1428		err = xsk_init_queue(entries, q, true);
1429		mutex_unlock(&xs->mutex);
1430		return err;
1431	}
1432	default:
1433		break;
1434	}
1435
1436	return -ENOPROTOOPT;
1437}
1438
1439static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1440{
1441	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1442	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1443	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1444}
1445
1446static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1447{
1448	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1449	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1450	ring->desc = offsetof(struct xdp_umem_ring, desc);
1451}
1452
1453struct xdp_statistics_v1 {
1454	__u64 rx_dropped;
1455	__u64 rx_invalid_descs;
1456	__u64 tx_invalid_descs;
1457};
1458
1459static int xsk_getsockopt(struct socket *sock, int level, int optname,
1460			  char __user *optval, int __user *optlen)
1461{
1462	struct sock *sk = sock->sk;
1463	struct xdp_sock *xs = xdp_sk(sk);
1464	int len;
1465
1466	if (level != SOL_XDP)
1467		return -ENOPROTOOPT;
1468
1469	if (get_user(len, optlen))
1470		return -EFAULT;
1471	if (len < 0)
1472		return -EINVAL;
1473
1474	switch (optname) {
1475	case XDP_STATISTICS:
1476	{
1477		struct xdp_statistics stats = {};
1478		bool extra_stats = true;
1479		size_t stats_size;
1480
1481		if (len < sizeof(struct xdp_statistics_v1)) {
1482			return -EINVAL;
1483		} else if (len < sizeof(stats)) {
1484			extra_stats = false;
1485			stats_size = sizeof(struct xdp_statistics_v1);
1486		} else {
1487			stats_size = sizeof(stats);
1488		}
1489
1490		mutex_lock(&xs->mutex);
1491		stats.rx_dropped = xs->rx_dropped;
1492		if (extra_stats) {
1493			stats.rx_ring_full = xs->rx_queue_full;
1494			stats.rx_fill_ring_empty_descs =
1495				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
1496			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
1497		} else {
1498			stats.rx_dropped += xs->rx_queue_full;
1499		}
1500		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1501		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1502		mutex_unlock(&xs->mutex);
1503
1504		if (copy_to_user(optval, &stats, stats_size))
1505			return -EFAULT;
1506		if (put_user(stats_size, optlen))
1507			return -EFAULT;
1508
1509		return 0;
1510	}
1511	case XDP_MMAP_OFFSETS:
1512	{
1513		struct xdp_mmap_offsets off;
1514		struct xdp_mmap_offsets_v1 off_v1;
1515		bool flags_supported = true;
1516		void *to_copy;
1517
1518		if (len < sizeof(off_v1))
1519			return -EINVAL;
1520		else if (len < sizeof(off))
1521			flags_supported = false;
1522
1523		if (flags_supported) {
1524			/* xdp_ring_offset is identical to xdp_ring_offset_v1
1525			 * except for the flags field added to the end.
1526			 */
1527			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1528					       &off.rx);
1529			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1530					       &off.tx);
1531			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1532					       &off.fr);
1533			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1534					       &off.cr);
1535			off.rx.flags = offsetof(struct xdp_rxtx_ring,
1536						ptrs.flags);
1537			off.tx.flags = offsetof(struct xdp_rxtx_ring,
1538						ptrs.flags);
1539			off.fr.flags = offsetof(struct xdp_umem_ring,
1540						ptrs.flags);
1541			off.cr.flags = offsetof(struct xdp_umem_ring,
1542						ptrs.flags);
1543
1544			len = sizeof(off);
1545			to_copy = &off;
1546		} else {
1547			xsk_enter_rxtx_offsets(&off_v1.rx);
1548			xsk_enter_rxtx_offsets(&off_v1.tx);
1549			xsk_enter_umem_offsets(&off_v1.fr);
1550			xsk_enter_umem_offsets(&off_v1.cr);
1551
1552			len = sizeof(off_v1);
1553			to_copy = &off_v1;
1554		}
1555
1556		if (copy_to_user(optval, to_copy, len))
1557			return -EFAULT;
1558		if (put_user(len, optlen))
1559			return -EFAULT;
1560
1561		return 0;
1562	}
1563	case XDP_OPTIONS:
1564	{
1565		struct xdp_options opts = {};
1566
1567		if (len < sizeof(opts))
1568			return -EINVAL;
1569
1570		mutex_lock(&xs->mutex);
1571		if (xs->zc)
1572			opts.flags |= XDP_OPTIONS_ZEROCOPY;
1573		mutex_unlock(&xs->mutex);
1574
1575		len = sizeof(opts);
1576		if (copy_to_user(optval, &opts, len))
1577			return -EFAULT;
1578		if (put_user(len, optlen))
1579			return -EFAULT;
1580
1581		return 0;
1582	}
1583	default:
1584		break;
1585	}
1586
1587	return -EOPNOTSUPP;
1588}
1589
1590static int xsk_mmap(struct file *file, struct socket *sock,
1591		    struct vm_area_struct *vma)
1592{
1593	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1594	unsigned long size = vma->vm_end - vma->vm_start;
1595	struct xdp_sock *xs = xdp_sk(sock->sk);
1596	int state = READ_ONCE(xs->state);
1597	struct xsk_queue *q = NULL;
1598
1599	if (state != XSK_READY && state != XSK_BOUND)
1600		return -EBUSY;
1601
1602	if (offset == XDP_PGOFF_RX_RING) {
1603		q = READ_ONCE(xs->rx);
1604	} else if (offset == XDP_PGOFF_TX_RING) {
1605		q = READ_ONCE(xs->tx);
1606	} else {
1607		/* Matches the smp_wmb() in XDP_UMEM_REG */
1608		smp_rmb();
1609		if (offset == XDP_UMEM_PGOFF_FILL_RING)
1610			q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
1611						 READ_ONCE(xs->pool->fq);
1612		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1613			q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
1614						 READ_ONCE(xs->pool->cq);
1615	}
1616
1617	if (!q)
1618		return -EINVAL;
1619
1620	/* Matches the smp_wmb() in xsk_init_queue */
1621	smp_rmb();
1622	if (size > q->ring_vmalloc_size)
1623		return -EINVAL;
1624
1625	return remap_vmalloc_range(vma, q->ring, 0);
1626}
1627
1628static int xsk_notifier(struct notifier_block *this,
1629			unsigned long msg, void *ptr)
1630{
1631	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1632	struct net *net = dev_net(dev);
1633	struct sock *sk;
1634
1635	switch (msg) {
1636	case NETDEV_UNREGISTER:
1637		mutex_lock(&net->xdp.lock);
1638		sk_for_each(sk, &net->xdp.list) {
1639			struct xdp_sock *xs = xdp_sk(sk);
1640
1641			mutex_lock(&xs->mutex);
1642			if (xs->dev == dev) {
1643				sk->sk_err = ENETDOWN;
1644				if (!sock_flag(sk, SOCK_DEAD))
1645					sk_error_report(sk);
1646
1647				xsk_unbind_dev(xs);
1648
1649				/* Clear device references. */
1650				xp_clear_dev(xs->pool);
1651			}
1652			mutex_unlock(&xs->mutex);
1653		}
1654		mutex_unlock(&net->xdp.lock);
1655		break;
1656	}
1657	return NOTIFY_DONE;
1658}
1659
1660static struct proto xsk_proto = {
1661	.name =		"XDP",
1662	.owner =	THIS_MODULE,
1663	.obj_size =	sizeof(struct xdp_sock),
1664};
1665
1666static const struct proto_ops xsk_proto_ops = {
1667	.family		= PF_XDP,
1668	.owner		= THIS_MODULE,
1669	.release	= xsk_release,
1670	.bind		= xsk_bind,
1671	.connect	= sock_no_connect,
1672	.socketpair	= sock_no_socketpair,
1673	.accept		= sock_no_accept,
1674	.getname	= sock_no_getname,
1675	.poll		= xsk_poll,
1676	.ioctl		= sock_no_ioctl,
1677	.listen		= sock_no_listen,
1678	.shutdown	= sock_no_shutdown,
1679	.setsockopt	= xsk_setsockopt,
1680	.getsockopt	= xsk_getsockopt,
1681	.sendmsg	= xsk_sendmsg,
1682	.recvmsg	= xsk_recvmsg,
1683	.mmap		= xsk_mmap,
1684};
1685
1686static void xsk_destruct(struct sock *sk)
1687{
1688	struct xdp_sock *xs = xdp_sk(sk);
1689
1690	if (!sock_flag(sk, SOCK_DEAD))
1691		return;
1692
1693	if (!xp_put_pool(xs->pool))
1694		xdp_put_umem(xs->umem, !xs->pool);
1695}
1696
1697static int xsk_create(struct net *net, struct socket *sock, int protocol,
1698		      int kern)
1699{
1700	struct xdp_sock *xs;
1701	struct sock *sk;
1702
1703	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1704		return -EPERM;
1705	if (sock->type != SOCK_RAW)
1706		return -ESOCKTNOSUPPORT;
1707
1708	if (protocol)
1709		return -EPROTONOSUPPORT;
1710
1711	sock->state = SS_UNCONNECTED;
1712
1713	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1714	if (!sk)
1715		return -ENOBUFS;
1716
1717	sock->ops = &xsk_proto_ops;
1718
1719	sock_init_data(sock, sk);
1720
1721	sk->sk_family = PF_XDP;
1722
1723	sk->sk_destruct = xsk_destruct;
1724
1725	sock_set_flag(sk, SOCK_RCU_FREE);
1726
1727	xs = xdp_sk(sk);
1728	xs->state = XSK_READY;
1729	mutex_init(&xs->mutex);
1730	spin_lock_init(&xs->rx_lock);
1731
1732	INIT_LIST_HEAD(&xs->map_list);
1733	spin_lock_init(&xs->map_list_lock);
1734
1735	mutex_lock(&net->xdp.lock);
1736	sk_add_node_rcu(sk, &net->xdp.list);
1737	mutex_unlock(&net->xdp.lock);
1738
1739	sock_prot_inuse_add(net, &xsk_proto, 1);
1740
1741	return 0;
1742}
1743
1744static const struct net_proto_family xsk_family_ops = {
1745	.family = PF_XDP,
1746	.create = xsk_create,
1747	.owner	= THIS_MODULE,
1748};
1749
1750static struct notifier_block xsk_netdev_notifier = {
1751	.notifier_call	= xsk_notifier,
1752};
1753
1754static int __net_init xsk_net_init(struct net *net)
1755{
1756	mutex_init(&net->xdp.lock);
1757	INIT_HLIST_HEAD(&net->xdp.list);
1758	return 0;
1759}
1760
1761static void __net_exit xsk_net_exit(struct net *net)
1762{
1763	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1764}
1765
1766static struct pernet_operations xsk_net_ops = {
1767	.init = xsk_net_init,
1768	.exit = xsk_net_exit,
1769};
1770
1771static int __init xsk_init(void)
1772{
1773	int err, cpu;
1774
1775	err = proto_register(&xsk_proto, 0 /* no slab */);
1776	if (err)
1777		goto out;
1778
1779	err = sock_register(&xsk_family_ops);
1780	if (err)
1781		goto out_proto;
1782
1783	err = register_pernet_subsys(&xsk_net_ops);
1784	if (err)
1785		goto out_sk;
1786
1787	err = register_netdevice_notifier(&xsk_netdev_notifier);
1788	if (err)
1789		goto out_pernet;
1790
1791	for_each_possible_cpu(cpu)
1792		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1793	return 0;
1794
1795out_pernet:
1796	unregister_pernet_subsys(&xsk_net_ops);
1797out_sk:
1798	sock_unregister(PF_XDP);
1799out_proto:
1800	proto_unregister(&xsk_proto);
1801out:
1802	return err;
1803}
1804
1805fs_initcall(xsk_init);
v5.14.15
   1// SPDX-License-Identifier: GPL-2.0
   2/* XDP sockets
   3 *
   4 * AF_XDP sockets allows a channel between XDP programs and userspace
   5 * applications.
   6 * Copyright(c) 2018 Intel Corporation.
   7 *
   8 * Author(s): Björn Töpel <bjorn.topel@intel.com>
   9 *	      Magnus Karlsson <magnus.karlsson@intel.com>
  10 */
  11
  12#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
  13
  14#include <linux/if_xdp.h>
  15#include <linux/init.h>
  16#include <linux/sched/mm.h>
  17#include <linux/sched/signal.h>
  18#include <linux/sched/task.h>
  19#include <linux/socket.h>
  20#include <linux/file.h>
  21#include <linux/uaccess.h>
  22#include <linux/net.h>
  23#include <linux/netdevice.h>
  24#include <linux/rculist.h>
 
  25#include <net/xdp_sock_drv.h>
  26#include <net/busy_poll.h>
 
  27#include <net/xdp.h>
  28
  29#include "xsk_queue.h"
  30#include "xdp_umem.h"
  31#include "xsk.h"
  32
  33#define TX_BATCH_SIZE 32
 
  34
  35static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
  36
  37void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
  38{
  39	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
  40		return;
  41
  42	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
  43	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
  44}
  45EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
  46
  47void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
  48{
  49	struct xdp_sock *xs;
  50
  51	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
  52		return;
  53
  54	rcu_read_lock();
  55	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
  56		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
  57	}
  58	rcu_read_unlock();
  59
  60	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
  61}
  62EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
  63
  64void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
  65{
  66	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
  67		return;
  68
  69	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
  70	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
  71}
  72EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
  73
  74void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
  75{
  76	struct xdp_sock *xs;
  77
  78	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
  79		return;
  80
  81	rcu_read_lock();
  82	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
  83		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
  84	}
  85	rcu_read_unlock();
  86
  87	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
  88}
  89EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
  90
  91bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
  92{
  93	return pool->uses_need_wakeup;
  94}
  95EXPORT_SYMBOL(xsk_uses_need_wakeup);
  96
  97struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
  98					    u16 queue_id)
  99{
 100	if (queue_id < dev->real_num_rx_queues)
 101		return dev->_rx[queue_id].pool;
 102	if (queue_id < dev->real_num_tx_queues)
 103		return dev->_tx[queue_id].pool;
 104
 105	return NULL;
 106}
 107EXPORT_SYMBOL(xsk_get_pool_from_qid);
 108
 109void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
 110{
 111	if (queue_id < dev->num_rx_queues)
 112		dev->_rx[queue_id].pool = NULL;
 113	if (queue_id < dev->num_tx_queues)
 114		dev->_tx[queue_id].pool = NULL;
 115}
 116
 117/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
 118 * not know if the device has more tx queues than rx, or the opposite.
 119 * This might also change during run time.
 120 */
 121int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
 122			u16 queue_id)
 123{
 124	if (queue_id >= max_t(unsigned int,
 125			      dev->real_num_rx_queues,
 126			      dev->real_num_tx_queues))
 127		return -EINVAL;
 128
 129	if (queue_id < dev->real_num_rx_queues)
 130		dev->_rx[queue_id].pool = pool;
 131	if (queue_id < dev->real_num_tx_queues)
 132		dev->_tx[queue_id].pool = pool;
 133
 134	return 0;
 135}
 136
 137void xp_release(struct xdp_buff_xsk *xskb)
 
 138{
 139	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
 140}
 141
 142static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
 143{
 144	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
 145
 146	offset += xskb->pool->headroom;
 147	if (!xskb->pool->unaligned)
 148		return xskb->orig_addr + offset;
 149	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
 150}
 151
 152static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 153{
 154	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
 155	u64 addr;
 156	int err;
 157
 158	addr = xp_get_handle(xskb);
 159	err = xskq_prod_reserve_desc(xs->rx, addr, len);
 160	if (err) {
 161		xs->rx_queue_full++;
 162		return err;
 163	}
 164
 165	xp_release(xskb);
 166	return 0;
 167}
 168
 169static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
 170{
 171	void *from_buf, *to_buf;
 172	u32 metalen;
 
 
 
 
 
 
 
 
 
 
 
 
 
 173
 174	if (unlikely(xdp_data_meta_unsupported(from))) {
 175		from_buf = from->data;
 176		to_buf = to->data;
 177		metalen = 0;
 178	} else {
 179		from_buf = from->data_meta;
 180		metalen = from->data - from->data_meta;
 181		to_buf = to->data - metalen;
 
 182	}
 183
 184	memcpy(to_buf, from_buf, len + metalen);
 
 
 
 185}
 186
 187static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 188{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 189	struct xdp_buff *xsk_xdp;
 190	int err;
 191	u32 len;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 192
 193	len = xdp->data_end - xdp->data;
 194	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
 195		xs->rx_dropped++;
 196		return -ENOSPC;
 197	}
 198
 199	xsk_xdp = xsk_buff_alloc(xs->pool);
 200	if (!xsk_xdp) {
 
 201		xs->rx_dropped++;
 202		return -ENOSPC;
 
 
 
 
 203	}
 204
 205	xsk_copy_xdp(xsk_xdp, xdp, len);
 206	err = __xsk_rcv_zc(xs, xsk_xdp, len);
 207	if (err) {
 208		xsk_buff_free(xsk_xdp);
 209		return err;
 210	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 211	return 0;
 212}
 213
 214static bool xsk_tx_writeable(struct xdp_sock *xs)
 215{
 216	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
 217		return false;
 218
 219	return true;
 220}
 221
 222static bool xsk_is_bound(struct xdp_sock *xs)
 223{
 224	if (READ_ONCE(xs->state) == XSK_BOUND) {
 225		/* Matches smp_wmb() in bind(). */
 226		smp_rmb();
 227		return true;
 228	}
 229	return false;
 230}
 231
 232static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
 233{
 234	if (!xsk_is_bound(xs))
 235		return -EINVAL;
 236
 237	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 238		return -EINVAL;
 239
 
 
 
 
 
 240	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
 241	return 0;
 242}
 243
 244static void xsk_flush(struct xdp_sock *xs)
 245{
 246	xskq_prod_submit(xs->rx);
 247	__xskq_cons_release(xs->pool->fq);
 248	sock_def_readable(&xs->sk);
 249}
 250
 251int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 252{
 
 253	int err;
 254
 255	spin_lock_bh(&xs->rx_lock);
 256	err = xsk_rcv_check(xs, xdp);
 257	if (!err) {
 258		err = __xsk_rcv(xs, xdp);
 259		xsk_flush(xs);
 260	}
 261	spin_unlock_bh(&xs->rx_lock);
 262	return err;
 263}
 264
 265static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 266{
 
 267	int err;
 268	u32 len;
 269
 270	err = xsk_rcv_check(xs, xdp);
 271	if (err)
 272		return err;
 273
 274	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
 275		len = xdp->data_end - xdp->data;
 276		return __xsk_rcv_zc(xs, xdp, len);
 277	}
 278
 279	err = __xsk_rcv(xs, xdp);
 280	if (!err)
 281		xdp_return_buff(xdp);
 282	return err;
 283}
 284
 285int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
 286{
 287	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
 288	int err;
 289
 290	err = xsk_rcv(xs, xdp);
 291	if (err)
 292		return err;
 293
 294	if (!xs->flush_node.prev)
 295		list_add(&xs->flush_node, flush_list);
 296
 297	return 0;
 298}
 299
 300void __xsk_map_flush(void)
 301{
 302	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
 303	struct xdp_sock *xs, *tmp;
 304
 305	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
 306		xsk_flush(xs);
 307		__list_del_clearprev(&xs->flush_node);
 308	}
 309}
 310
 
 
 
 
 
 
 
 
 
 
 311void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
 312{
 313	xskq_prod_submit_n(pool->cq, nb_entries);
 314}
 315EXPORT_SYMBOL(xsk_tx_completed);
 316
 317void xsk_tx_release(struct xsk_buff_pool *pool)
 318{
 319	struct xdp_sock *xs;
 320
 321	rcu_read_lock();
 322	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
 323		__xskq_cons_release(xs->tx);
 324		if (xsk_tx_writeable(xs))
 325			xs->sk.sk_write_space(&xs->sk);
 326	}
 327	rcu_read_unlock();
 328}
 329EXPORT_SYMBOL(xsk_tx_release);
 330
 331bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
 332{
 
 333	struct xdp_sock *xs;
 334
 335	rcu_read_lock();
 
 336	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
 
 
 
 
 
 337		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
 338			xs->tx->queue_empty_descs++;
 
 339			continue;
 340		}
 341
 
 
 342		/* This is the backpressure mechanism for the Tx path.
 343		 * Reserve space in the completion queue and only proceed
 344		 * if there is space in it. This avoids having to implement
 345		 * any buffering in the Tx path.
 346		 */
 347		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
 348			goto out;
 349
 350		xskq_cons_release(xs->tx);
 351		rcu_read_unlock();
 352		return true;
 353	}
 354
 355out:
 356	rcu_read_unlock();
 357	return false;
 358}
 359EXPORT_SYMBOL(xsk_tx_peek_desc);
 360
 361static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
 362					u32 max_entries)
 363{
 364	u32 nb_pkts = 0;
 365
 366	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
 367		nb_pkts++;
 368
 369	xsk_tx_release(pool);
 370	return nb_pkts;
 371}
 372
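/* Batched variant of xsk_tx_peek_desc(). The batched path is only taken when
 * a single socket is bound to the pool; shared pools fall back to the
 * one-at-a-time helper above.
 */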
 373u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
 374				   u32 max_entries)
 375{
 376	struct xdp_sock *xs;
 377	u32 nb_pkts;
 378
 379	rcu_read_lock();
 380	if (!list_is_singular(&pool->xsk_tx_list)) {
 381		/* Fallback to the non-batched version */
 382		rcu_read_unlock();
 383		return xsk_tx_peek_release_fallback(pool, descs, max_entries);
 384	}
 385
 386	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
 387	if (!xs) {
 388		nb_pkts = 0;
 389		goto out;
 390	}
 391
 392	nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
 393	if (!nb_pkts) {
 394		xs->tx->queue_empty_descs++;
 395		goto out;
 396	}
 397
 398	/* This is the backpressure mechanism for the Tx path. Try to
 399	 * reserve space in the completion queue for all packets, but
 400	 * if there are fewer slots available, just process that many
 401	 * packets. This avoids having to implement any buffering in
 402	 * the Tx path.
 403	 */
 404	nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
 405	if (!nb_pkts)
 406		goto out;
 407
 408	xskq_cons_release_n(xs->tx, nb_pkts);
 409	__xskq_cons_release(xs->tx);
 410	xs->sk.sk_write_space(&xs->sk);
 411
 412out:
 413	rcu_read_unlock();
 414	return nb_pkts;
 415}
 416EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
 417
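/* Ask the driver to run its wakeup handler for this queue; flags selects
 * whether the Rx (fill) and/or Tx rings need processing.
 */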
 418static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 419{
 420	struct net_device *dev = xs->dev;
 421	int err;
 422
 423	rcu_read_lock();
 424	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 425	rcu_read_unlock();
 426
 427	return err;
 428}
 429
 430static int xsk_zc_xmit(struct xdp_sock *xs)
 431{
 432	return xsk_wakeup(xs, XDP_WAKEUP_TX);
 433}
 434
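/* Copy-mode Tx completion: xsk_build_skb() stashes the descriptor address in
 * destructor_arg, and it is published to the completion queue here once the
 * skb is released.
 */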
 435static void xsk_destruct_skb(struct sk_buff *skb)
 436{
 437	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
 438	struct xdp_sock *xs = xdp_sk(skb->sk);
 439	unsigned long flags;
 440
 441	spin_lock_irqsave(&xs->pool->cq_lock, flags);
 442	xskq_prod_submit_addr(xs->pool->cq, addr);
 443	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 444
 445	sock_wfree(skb);
 446}
 447
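/* Build an skb for devices with IFF_TX_SKB_NO_LINEAR: the frame is not
 * copied, the skb keeps an empty linear area and its page fragments point
 * straight at the umem pages (each pinned with get_page()).
 */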
 448static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 449					      struct xdp_desc *desc)
 450{
 451	struct xsk_buff_pool *pool = xs->pool;
 452	u32 hr, len, ts, offset, copy, copied;
 453	struct sk_buff *skb;
 454	struct page *page;
 455	void *buffer;
 456	int err, i;
 457	u64 addr;
 458
 459	hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
 460
 461	skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
 462	if (unlikely(!skb))
 463		return ERR_PTR(err);
 464
 465	skb_reserve(skb, hr);
 466
 467	addr = desc->addr;
 468	len = desc->len;
 469	ts = pool->unaligned ? len : pool->chunk_size;
 470
 471	buffer = xsk_buff_raw_get_data(pool, addr);
 472	offset = offset_in_page(buffer);
 473	addr = buffer - pool->addrs;
 474
 475	for (copied = 0, i = 0; copied < len; i++) {
 476		page = pool->umem->pgs[addr >> PAGE_SHIFT];
 477		get_page(page);
 478
 479		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
 480		skb_fill_page_desc(skb, i, page, offset, copy);
 481
 482		copied += copy;
 483		addr += copy;
 484		offset = 0;
 485	}
 486
 487	skb->len += len;
 488	skb->data_len += len;
 489	skb->truesize += ts;
 490
 491	refcount_add(ts, &xs->sk.sk_wmem_alloc);
 492
 493	return skb;
 494}
 495
 496static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 497				     struct xdp_desc *desc)
 498{
 499	struct net_device *dev = xs->dev;
 500	struct sk_buff *skb;
 501
 502	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
 503		skb = xsk_build_skb_zerocopy(xs, desc);
 504		if (IS_ERR(skb))
 505			return skb;
 506	} else {
 507		u32 hr, tr, len;
 508		void *buffer;
 509		int err;
 510
 511		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
 512		tr = dev->needed_tailroom;
 513		len = desc->len;
 514
 515		skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
 516		if (unlikely(!skb))
 517			return ERR_PTR(err);
 518
 519		skb_reserve(skb, hr);
 520		skb_put(skb, len);
 521
 522		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
 523		err = skb_store_bits(skb, 0, buffer, len);
 524		if (unlikely(err)) {
 525			kfree_skb(skb);
 526			return ERR_PTR(err);
 527		}
 528	}
 529
 530	skb->dev = dev;
 531	skb->priority = xs->sk.sk_priority;
 532	skb->mark = xs->sk.sk_mark;
 533	skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
 534	skb->destructor = xsk_destruct_skb;
 535
 536	return skb;
 537}
 538
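/* Copy-mode transmit: drain up to TX_BATCH_SIZE descriptors from the Tx
 * ring, build an skb for each one, reserve a completion queue slot as
 * backpressure and hand the skb directly to the bound netdev queue via
 * __dev_direct_xmit(). NETDEV_TX_BUSY undoes the reservation and tells user
 * space to retry.
 */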
 539static int xsk_generic_xmit(struct sock *sk)
 540{
 541	struct xdp_sock *xs = xdp_sk(sk);
 542	u32 max_batch = TX_BATCH_SIZE;
 543	bool sent_frame = false;
 544	struct xdp_desc desc;
 545	struct sk_buff *skb;
 546	unsigned long flags;
 547	int err = 0;
 548
 549	mutex_lock(&xs->mutex);
 550
 551	if (xs->queue_id >= xs->dev->real_num_tx_queues)
 552		goto out;
 553
 554	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
 555		if (max_batch-- == 0) {
 556			err = -EAGAIN;
 557			goto out;
 558		}
 559
 560		skb = xsk_build_skb(xs, &desc);
 561		if (IS_ERR(skb)) {
 562			err = PTR_ERR(skb);
 563			goto out;
 564		}
 565
 566		/* This is the backpressure mechanism for the Tx path.
 567		 * Reserve space in the completion queue and only proceed
 568		 * if there is space in it. This avoids having to implement
 569		 * any buffering in the Tx path.
 570		 */
 571		spin_lock_irqsave(&xs->pool->cq_lock, flags);
 572		if (xskq_prod_reserve(xs->pool->cq)) {
 573			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 574			kfree_skb(skb);
 575			goto out;
 576		}
 577		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 578
 579		err = __dev_direct_xmit(skb, xs->queue_id);
 580		if (err == NETDEV_TX_BUSY) {
 581			/* Tell user-space to retry the send */
 582			skb->destructor = sock_wfree;
 583			spin_lock_irqsave(&xs->pool->cq_lock, flags);
 584			xskq_prod_cancel(xs->pool->cq);
 585			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 586			/* Free skb without triggering the perf drop trace */
 587			consume_skb(skb);
 588			err = -EAGAIN;
 589			goto out;
 590		}
 591
 592		xskq_cons_release(xs->tx);
 593		/* Ignore NET_XMIT_CN as packet might have been sent */
 594		if (err == NET_XMIT_DROP) {
 595			/* SKB completed but not sent */
 596			err = -EBUSY;
 597			goto out;
 598		}
 599
 600		sent_frame = true;
 601	}
 602
 603	xs->tx->queue_empty_descs++;
 604
 605out:
 606	if (sent_frame)
 607		if (xsk_tx_writeable(xs))
 608			sk->sk_write_space(sk);
 609
 610	mutex_unlock(&xs->mutex);
 611	return err;
 612}
 613
 614static int __xsk_sendmsg(struct sock *sk)
 615{
 616	struct xdp_sock *xs = xdp_sk(sk);
 617
 618	if (unlikely(!(xs->dev->flags & IFF_UP)))
 619		return -ENETDOWN;
 620	if (unlikely(!xs->tx))
 621		return -ENOBUFS;
 622
 623	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
 624}
 625
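/* With busy polling preferred on this socket, sendmsg()/recvmsg() skip the
 * driver wakeup entirely and leave ring processing to the busy-poll loop.
 */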
 626static bool xsk_no_wakeup(struct sock *sk)
 627{
 628#ifdef CONFIG_NET_RX_BUSY_POLL
 629	/* Prefer busy-polling, skip the wakeup. */
 630	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
 631		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
 632#else
 633	return false;
 634#endif
 635}
 636
 637static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 638{
 639	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
 640	struct sock *sk = sock->sk;
 641	struct xdp_sock *xs = xdp_sk(sk);
 642	struct xsk_buff_pool *pool;
 643
 644	if (unlikely(!xsk_is_bound(xs)))
 645		return -ENXIO;
 646	if (unlikely(need_wait))
 647		return -EOPNOTSUPP;
 648
 649	if (sk_can_busy_loop(sk))
 650		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
 651
 652	if (xsk_no_wakeup(sk))
 653		return 0;
 654
 655	pool = xs->pool;
 656	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
 657		return __xsk_sendmsg(sk);
 658	return 0;
 659}
 660
 661static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
 662{
 663	bool need_wait = !(flags & MSG_DONTWAIT);
 664	struct sock *sk = sock->sk;
 665	struct xdp_sock *xs = xdp_sk(sk);
 666
 667	if (unlikely(!xsk_is_bound(xs)))
 668		return -ENXIO;
 669	if (unlikely(!(xs->dev->flags & IFF_UP)))
 670		return -ENETDOWN;
 671	if (unlikely(!xs->rx))
 672		return -ENOBUFS;
 673	if (unlikely(need_wait))
 674		return -EOPNOTSUPP;
 675
 676	if (sk_can_busy_loop(sk))
 677		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
 678
 679	if (xsk_no_wakeup(sk))
 680		return 0;
 681
 682	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
 683		return xsk_wakeup(xs, XDP_WAKEUP_RX);
 684	return 0;
 685}
 686
 687static __poll_t xsk_poll(struct file *file, struct socket *sock,
 688			     struct poll_table_struct *wait)
 689{
 690	__poll_t mask = 0;
 691	struct sock *sk = sock->sk;
 692	struct xdp_sock *xs = xdp_sk(sk);
 693	struct xsk_buff_pool *pool;
 694
 695	sock_poll_wait(file, sock, wait);
 696
 697	if (unlikely(!xsk_is_bound(xs)))
 698		return mask;
 699
 700	pool = xs->pool;
 701
 702	if (pool->cached_need_wakeup) {
 703		if (xs->zc)
 704			xsk_wakeup(xs, pool->cached_need_wakeup);
 705		else
 706			/* Poll needs to drive Tx also in copy mode */
 707			__xsk_sendmsg(sk);
 708	}
 709
 710	if (xs->rx && !xskq_prod_is_empty(xs->rx))
 711		mask |= EPOLLIN | EPOLLRDNORM;
 712	if (xs->tx && xsk_tx_writeable(xs))
 713		mask |= EPOLLOUT | EPOLLWRNORM;
 714
 715	return mask;
 716}
 717
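/* Ring sizes must be non-zero powers of two and each ring can only be
 * created once. The smp_wmb() pairs with the smp_rmb() in xsk_mmap() so the
 * ring is fully initialised before user space can map it.
 */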
 718static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
 719			  bool umem_queue)
 720{
 721	struct xsk_queue *q;
 722
 723	if (entries == 0 || *queue || !is_power_of_2(entries))
 724		return -EINVAL;
 725
 726	q = xskq_create(entries, umem_queue);
 727	if (!q)
 728		return -ENOMEM;
 729
 730	/* Make sure queue is ready before it can be seen by others */
 731	smp_wmb();
 732	WRITE_ONCE(*queue, q);
 733	return 0;
 734}
 735
 736static void xsk_unbind_dev(struct xdp_sock *xs)
 737{
 738	struct net_device *dev = xs->dev;
 739
 740	if (xs->state != XSK_BOUND)
 741		return;
 742	WRITE_ONCE(xs->state, XSK_UNBOUND);
 743
 744	/* Wait for driver to stop using the xdp socket. */
 745	xp_del_xsk(xs->pool, xs);
 746	xs->dev = NULL;
 747	synchronize_net();
 748	dev_put(dev);
 749}
 750
 751static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
 752					      struct xdp_sock __rcu ***map_entry)
 753{
 754	struct xsk_map *map = NULL;
 755	struct xsk_map_node *node;
 756
 757	*map_entry = NULL;
 758
 759	spin_lock_bh(&xs->map_list_lock);
 760	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
 761					node);
 762	if (node) {
 763		bpf_map_inc(&node->map->map);
 764		map = node->map;
 765		*map_entry = node->map_entry;
 766	}
 767	spin_unlock_bh(&xs->map_list_lock);
 768	return map;
 769}
 770
 771static void xsk_delete_from_maps(struct xdp_sock *xs)
 772{
 773	/* This function removes the current XDP socket from all the
 774	 * maps it resides in. We need to take extra care here, due to
 775	 * the two locks involved. Each map has a lock synchronizing
 776	 * updates to the entries, and each socket has a lock that
 777	 * synchronizes access to the list of maps (map_list). For
 778	 * deadlock avoidance the locks need to be taken in the order
 779	 * "map lock"->"socket map list lock". We start off by
 780	 * accessing the socket map list, and take a reference to the
 781	 * map to guarantee existence between the
 782	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
 783	 * calls. Then we ask the map to remove the socket, which
 784	 * tries to remove the socket from the map. Note that there
 785	 * might be updates to the map between
 786	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
 787	 */
 788	struct xdp_sock __rcu **map_entry = NULL;
 789	struct xsk_map *map;
 790
 791	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
 792		xsk_map_try_sock_delete(map, xs, map_entry);
 793		bpf_map_put(&map->map);
 794	}
 795}
 796
 797static int xsk_release(struct socket *sock)
 798{
 799	struct sock *sk = sock->sk;
 800	struct xdp_sock *xs = xdp_sk(sk);
 801	struct net *net;
 802
 803	if (!sk)
 804		return 0;
 805
 806	net = sock_net(sk);
 807
 808	mutex_lock(&net->xdp.lock);
 809	sk_del_node_init_rcu(sk);
 810	mutex_unlock(&net->xdp.lock);
 811
 812	local_bh_disable();
 813	sock_prot_inuse_add(net, sk->sk_prot, -1);
 814	local_bh_enable();
 815
 816	xsk_delete_from_maps(xs);
 817	mutex_lock(&xs->mutex);
 818	xsk_unbind_dev(xs);
 819	mutex_unlock(&xs->mutex);
 820
 821	xskq_destroy(xs->rx);
 822	xskq_destroy(xs->tx);
 823	xskq_destroy(xs->fq_tmp);
 824	xskq_destroy(xs->cq_tmp);
 825
 826	sock_orphan(sk);
 827	sock->sk = NULL;
 828
 829	sk_refcnt_debug_release(sk);
 830	sock_put(sk);
 831
 832	return 0;
 833}
 834
 835static struct socket *xsk_lookup_xsk_from_fd(int fd)
 836{
 837	struct socket *sock;
 838	int err;
 839
 840	sock = sockfd_lookup(fd, &err);
 841	if (!sock)
 842		return ERR_PTR(-ENOTSOCK);
 843
 844	if (sock->sk->sk_family != PF_XDP) {
 845		sockfd_put(sock);
 846		return ERR_PTR(-ENOPROTOOPT);
 847	}
 848
 849	return sock;
 850}
 851
 852static bool xsk_validate_queues(struct xdp_sock *xs)
 853{
 854	return xs->fq_tmp && xs->cq_tmp;
 855}
 856
 857static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 858{
 859	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
 860	struct sock *sk = sock->sk;
 861	struct xdp_sock *xs = xdp_sk(sk);
 862	struct net_device *dev;
 863	u32 flags, qid;
 864	int err = 0;
 865
 866	if (addr_len < sizeof(struct sockaddr_xdp))
 867		return -EINVAL;
 868	if (sxdp->sxdp_family != AF_XDP)
 869		return -EINVAL;
 870
 871	flags = sxdp->sxdp_flags;
 872	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
 873		      XDP_USE_NEED_WAKEUP))
 874		return -EINVAL;
 875
 876	rtnl_lock();
 877	mutex_lock(&xs->mutex);
 878	if (xs->state != XSK_READY) {
 879		err = -EBUSY;
 880		goto out_release;
 881	}
 882
 883	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
 884	if (!dev) {
 885		err = -ENODEV;
 886		goto out_release;
 887	}
 888
 889	if (!xs->rx && !xs->tx) {
 890		err = -EINVAL;
 891		goto out_unlock;
 892	}
 893
 894	qid = sxdp->sxdp_queue_id;
 895
 896	if (flags & XDP_SHARED_UMEM) {
 897		struct xdp_sock *umem_xs;
 898		struct socket *sock;
 899
 900		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
 901		    (flags & XDP_USE_NEED_WAKEUP)) {
 902			/* Cannot specify flags for shared sockets. */
 903			err = -EINVAL;
 904			goto out_unlock;
 905		}
 906
 907		if (xs->umem) {
 908			/* We already have our own umem. */
 909			err = -EINVAL;
 910			goto out_unlock;
 911		}
 912
 913		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
 914		if (IS_ERR(sock)) {
 915			err = PTR_ERR(sock);
 916			goto out_unlock;
 917		}
 918
 919		umem_xs = xdp_sk(sock->sk);
 920		if (!xsk_is_bound(umem_xs)) {
 921			err = -EBADF;
 922			sockfd_put(sock);
 923			goto out_unlock;
 924		}
 925
 926		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
 927			/* Share the umem with another socket on another qid
 928			 * and/or device.
 929			 */
 930			xs->pool = xp_create_and_assign_umem(xs,
 931							     umem_xs->umem);
 932			if (!xs->pool) {
 933				err = -ENOMEM;
 934				sockfd_put(sock);
 935				goto out_unlock;
 936			}
 937
 938			err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
 939						   dev, qid);
 940			if (err) {
 941				xp_destroy(xs->pool);
 942				xs->pool = NULL;
 943				sockfd_put(sock);
 944				goto out_unlock;
 945			}
 946		} else {
 947			/* Share the buffer pool with the other socket. */
 948			if (xs->fq_tmp || xs->cq_tmp) {
 949				/* Do not allow setting your own fq or cq. */
 950				err = -EINVAL;
 951				sockfd_put(sock);
 952				goto out_unlock;
 953			}
 954
 955			xp_get_pool(umem_xs->pool);
 956			xs->pool = umem_xs->pool;
 957		}
 958
 959		xdp_get_umem(umem_xs->umem);
 960		WRITE_ONCE(xs->umem, umem_xs->umem);
 961		sockfd_put(sock);
 962	} else if (!xs->umem || !xsk_validate_queues(xs)) {
 963		err = -EINVAL;
 964		goto out_unlock;
 965	} else {
 966		/* This xsk has its own umem. */
 967		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
 968		if (!xs->pool) {
 969			err = -ENOMEM;
 970			goto out_unlock;
 971		}
 972
 973		err = xp_assign_dev(xs->pool, dev, qid, flags);
 974		if (err) {
 975			xp_destroy(xs->pool);
 976			xs->pool = NULL;
 977			goto out_unlock;
 978		}
 979	}
 980
 981	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
 982	xs->fq_tmp = NULL;
 983	xs->cq_tmp = NULL;
 984
 985	xs->dev = dev;
 986	xs->zc = xs->umem->zc;
 987	xs->queue_id = qid;
 988	xp_add_xsk(xs->pool, xs);
 989
 990out_unlock:
 991	if (err) {
 992		dev_put(dev);
 993	} else {
 994		/* Matches smp_rmb() in bind() for shared umem
 995		 * sockets, and xsk_is_bound().
 996		 */
 997		smp_wmb();
 998		WRITE_ONCE(xs->state, XSK_BOUND);
 999	}
1000out_release:
1001	mutex_unlock(&xs->mutex);
1002	rtnl_unlock();
1003	return err;
1004}
1005
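/* Older, shorter layout of the XDP_UMEM_REG argument. The XDP_UMEM_REG
 * handler below accepts either this size or the current struct xdp_umem_reg,
 * e.g. (illustrative user-space call):
 *
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 */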
1006struct xdp_umem_reg_v1 {
1007	__u64 addr; /* Start of packet data area */
1008	__u64 len; /* Length of packet data area */
1009	__u32 chunk_size;
1010	__u32 headroom;
1011};
1012
1013static int xsk_setsockopt(struct socket *sock, int level, int optname,
1014			  sockptr_t optval, unsigned int optlen)
1015{
1016	struct sock *sk = sock->sk;
1017	struct xdp_sock *xs = xdp_sk(sk);
1018	int err;
1019
1020	if (level != SOL_XDP)
1021		return -ENOPROTOOPT;
1022
1023	switch (optname) {
1024	case XDP_RX_RING:
1025	case XDP_TX_RING:
1026	{
1027		struct xsk_queue **q;
1028		int entries;
1029
1030		if (optlen < sizeof(entries))
1031			return -EINVAL;
1032		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1033			return -EFAULT;
1034
1035		mutex_lock(&xs->mutex);
1036		if (xs->state != XSK_READY) {
1037			mutex_unlock(&xs->mutex);
1038			return -EBUSY;
1039		}
1040		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1041		err = xsk_init_queue(entries, q, false);
1042		if (!err && optname == XDP_TX_RING)
1043			/* Tx needs to be explicitly woken up the first time */
1044			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1045		mutex_unlock(&xs->mutex);
1046		return err;
1047	}
1048	case XDP_UMEM_REG:
1049	{
1050		size_t mr_size = sizeof(struct xdp_umem_reg);
1051		struct xdp_umem_reg mr = {};
1052		struct xdp_umem *umem;
1053
1054		if (optlen < sizeof(struct xdp_umem_reg_v1))
1055			return -EINVAL;
1056		else if (optlen < sizeof(mr))
1057			mr_size = sizeof(struct xdp_umem_reg_v1);
1058
1059		if (copy_from_sockptr(&mr, optval, mr_size))
1060			return -EFAULT;
1061
1062		mutex_lock(&xs->mutex);
1063		if (xs->state != XSK_READY || xs->umem) {
1064			mutex_unlock(&xs->mutex);
1065			return -EBUSY;
1066		}
1067
1068		umem = xdp_umem_create(&mr);
1069		if (IS_ERR(umem)) {
1070			mutex_unlock(&xs->mutex);
1071			return PTR_ERR(umem);
1072		}
1073
1074		/* Make sure umem is ready before it can be seen by others */
1075		smp_wmb();
1076		WRITE_ONCE(xs->umem, umem);
1077		mutex_unlock(&xs->mutex);
1078		return 0;
1079	}
1080	case XDP_UMEM_FILL_RING:
1081	case XDP_UMEM_COMPLETION_RING:
1082	{
1083		struct xsk_queue **q;
1084		int entries;
1085
1086		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1087			return -EFAULT;
1088
1089		mutex_lock(&xs->mutex);
1090		if (xs->state != XSK_READY) {
1091			mutex_unlock(&xs->mutex);
1092			return -EBUSY;
1093		}
1094
1095		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1096			&xs->cq_tmp;
1097		err = xsk_init_queue(entries, q, true);
1098		mutex_unlock(&xs->mutex);
1099		return err;
1100	}
1101	default:
1102		break;
1103	}
1104
1105	return -ENOPROTOOPT;
1106}
1107
1108static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1109{
1110	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1111	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1112	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1113}
1114
1115static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1116{
1117	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1118	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1119	ring->desc = offsetof(struct xdp_umem_ring, desc);
1120}
1121
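/* Original layout of the XDP_STATISTICS result. When the caller passes this
 * smaller size, rx_queue_full is folded into rx_dropped and the newer
 * ring-empty counters are not reported (see XDP_STATISTICS below).
 */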
1122struct xdp_statistics_v1 {
1123	__u64 rx_dropped;
1124	__u64 rx_invalid_descs;
1125	__u64 tx_invalid_descs;
1126};
1127
1128static int xsk_getsockopt(struct socket *sock, int level, int optname,
1129			  char __user *optval, int __user *optlen)
1130{
1131	struct sock *sk = sock->sk;
1132	struct xdp_sock *xs = xdp_sk(sk);
1133	int len;
1134
1135	if (level != SOL_XDP)
1136		return -ENOPROTOOPT;
1137
1138	if (get_user(len, optlen))
1139		return -EFAULT;
1140	if (len < 0)
1141		return -EINVAL;
1142
1143	switch (optname) {
1144	case XDP_STATISTICS:
1145	{
1146		struct xdp_statistics stats = {};
1147		bool extra_stats = true;
1148		size_t stats_size;
1149
1150		if (len < sizeof(struct xdp_statistics_v1)) {
1151			return -EINVAL;
1152		} else if (len < sizeof(stats)) {
1153			extra_stats = false;
1154			stats_size = sizeof(struct xdp_statistics_v1);
1155		} else {
1156			stats_size = sizeof(stats);
1157		}
1158
1159		mutex_lock(&xs->mutex);
1160		stats.rx_dropped = xs->rx_dropped;
1161		if (extra_stats) {
1162			stats.rx_ring_full = xs->rx_queue_full;
1163			stats.rx_fill_ring_empty_descs =
1164				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
1165			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
1166		} else {
1167			stats.rx_dropped += xs->rx_queue_full;
1168		}
1169		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1170		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1171		mutex_unlock(&xs->mutex);
1172
1173		if (copy_to_user(optval, &stats, stats_size))
1174			return -EFAULT;
1175		if (put_user(stats_size, optlen))
1176			return -EFAULT;
1177
1178		return 0;
1179	}
1180	case XDP_MMAP_OFFSETS:
1181	{
1182		struct xdp_mmap_offsets off;
1183		struct xdp_mmap_offsets_v1 off_v1;
1184		bool flags_supported = true;
1185		void *to_copy;
1186
1187		if (len < sizeof(off_v1))
1188			return -EINVAL;
1189		else if (len < sizeof(off))
1190			flags_supported = false;
1191
1192		if (flags_supported) {
1193			/* xdp_ring_offset is identical to xdp_ring_offset_v1
1194			 * except for the flags field added to the end.
1195			 */
1196			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1197					       &off.rx);
1198			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1199					       &off.tx);
1200			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1201					       &off.fr);
1202			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1203					       &off.cr);
1204			off.rx.flags = offsetof(struct xdp_rxtx_ring,
1205						ptrs.flags);
1206			off.tx.flags = offsetof(struct xdp_rxtx_ring,
1207						ptrs.flags);
1208			off.fr.flags = offsetof(struct xdp_umem_ring,
1209						ptrs.flags);
1210			off.cr.flags = offsetof(struct xdp_umem_ring,
1211						ptrs.flags);
1212
1213			len = sizeof(off);
1214			to_copy = &off;
1215		} else {
1216			xsk_enter_rxtx_offsets(&off_v1.rx);
1217			xsk_enter_rxtx_offsets(&off_v1.tx);
1218			xsk_enter_umem_offsets(&off_v1.fr);
1219			xsk_enter_umem_offsets(&off_v1.cr);
1220
1221			len = sizeof(off_v1);
1222			to_copy = &off_v1;
1223		}
1224
1225		if (copy_to_user(optval, to_copy, len))
1226			return -EFAULT;
1227		if (put_user(len, optlen))
1228			return -EFAULT;
1229
1230		return 0;
1231	}
1232	case XDP_OPTIONS:
1233	{
1234		struct xdp_options opts = {};
1235
1236		if (len < sizeof(opts))
1237			return -EINVAL;
1238
1239		mutex_lock(&xs->mutex);
1240		if (xs->zc)
1241			opts.flags |= XDP_OPTIONS_ZEROCOPY;
1242		mutex_unlock(&xs->mutex);
1243
1244		len = sizeof(opts);
1245		if (copy_to_user(optval, &opts, len))
1246			return -EFAULT;
1247		if (put_user(len, optlen))
1248			return -EFAULT;
1249
1250		return 0;
1251	}
1252	default:
1253		break;
1254	}
1255
1256	return -EOPNOTSUPP;
1257}
1258
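/* mmap() backend for the rings. The page offset selects the ring
 * (XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING, XDP_UMEM_PGOFF_FILL_RING or
 * XDP_UMEM_PGOFF_COMPLETION_RING), e.g. from user space (illustrative):
 *
 *	mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
 *	     XDP_PGOFF_TX_RING);
 */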
1259static int xsk_mmap(struct file *file, struct socket *sock,
1260		    struct vm_area_struct *vma)
1261{
1262	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1263	unsigned long size = vma->vm_end - vma->vm_start;
1264	struct xdp_sock *xs = xdp_sk(sock->sk);
1265	struct xsk_queue *q = NULL;
1266	unsigned long pfn;
1267	struct page *qpg;
1268
1269	if (READ_ONCE(xs->state) != XSK_READY)
1270		return -EBUSY;
1271
1272	if (offset == XDP_PGOFF_RX_RING) {
1273		q = READ_ONCE(xs->rx);
1274	} else if (offset == XDP_PGOFF_TX_RING) {
1275		q = READ_ONCE(xs->tx);
1276	} else {
1277		/* Matches the smp_wmb() in XDP_UMEM_REG */
1278		smp_rmb();
1279		if (offset == XDP_UMEM_PGOFF_FILL_RING)
1280			q = READ_ONCE(xs->fq_tmp);
1281		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1282			q = READ_ONCE(xs->cq_tmp);
1283	}
1284
1285	if (!q)
1286		return -EINVAL;
1287
1288	/* Matches the smp_wmb() in xsk_init_queue */
1289	smp_rmb();
1290	qpg = virt_to_head_page(q->ring);
1291	if (size > page_size(qpg))
1292		return -EINVAL;
1293
1294	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
1295	return remap_pfn_range(vma, vma->vm_start, pfn,
1296			       size, vma->vm_page_prot);
1297}
1298
1299static int xsk_notifier(struct notifier_block *this,
1300			unsigned long msg, void *ptr)
1301{
1302	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1303	struct net *net = dev_net(dev);
1304	struct sock *sk;
1305
1306	switch (msg) {
1307	case NETDEV_UNREGISTER:
1308		mutex_lock(&net->xdp.lock);
1309		sk_for_each(sk, &net->xdp.list) {
1310			struct xdp_sock *xs = xdp_sk(sk);
1311
1312			mutex_lock(&xs->mutex);
1313			if (xs->dev == dev) {
1314				sk->sk_err = ENETDOWN;
1315				if (!sock_flag(sk, SOCK_DEAD))
1316					sk_error_report(sk);
1317
1318				xsk_unbind_dev(xs);
1319
1320				/* Clear device references. */
1321				xp_clear_dev(xs->pool);
1322			}
1323			mutex_unlock(&xs->mutex);
1324		}
1325		mutex_unlock(&net->xdp.lock);
1326		break;
1327	}
1328	return NOTIFY_DONE;
1329}
1330
1331static struct proto xsk_proto = {
1332	.name =		"XDP",
1333	.owner =	THIS_MODULE,
1334	.obj_size =	sizeof(struct xdp_sock),
1335};
1336
1337static const struct proto_ops xsk_proto_ops = {
1338	.family		= PF_XDP,
1339	.owner		= THIS_MODULE,
1340	.release	= xsk_release,
1341	.bind		= xsk_bind,
1342	.connect	= sock_no_connect,
1343	.socketpair	= sock_no_socketpair,
1344	.accept		= sock_no_accept,
1345	.getname	= sock_no_getname,
1346	.poll		= xsk_poll,
1347	.ioctl		= sock_no_ioctl,
1348	.listen		= sock_no_listen,
1349	.shutdown	= sock_no_shutdown,
1350	.setsockopt	= xsk_setsockopt,
1351	.getsockopt	= xsk_getsockopt,
1352	.sendmsg	= xsk_sendmsg,
1353	.recvmsg	= xsk_recvmsg,
1354	.mmap		= xsk_mmap,
1355	.sendpage	= sock_no_sendpage,
1356};
1357
1358static void xsk_destruct(struct sock *sk)
1359{
1360	struct xdp_sock *xs = xdp_sk(sk);
1361
1362	if (!sock_flag(sk, SOCK_DEAD))
1363		return;
1364
1365	if (!xp_put_pool(xs->pool))
1366		xdp_put_umem(xs->umem, !xs->pool);
1367
1368	sk_refcnt_debug_dec(sk);
1369}
1370
1371static int xsk_create(struct net *net, struct socket *sock, int protocol,
1372		      int kern)
1373{
1374	struct xdp_sock *xs;
1375	struct sock *sk;
1376
1377	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1378		return -EPERM;
1379	if (sock->type != SOCK_RAW)
1380		return -ESOCKTNOSUPPORT;
1381
1382	if (protocol)
1383		return -EPROTONOSUPPORT;
1384
1385	sock->state = SS_UNCONNECTED;
1386
1387	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1388	if (!sk)
1389		return -ENOBUFS;
1390
1391	sock->ops = &xsk_proto_ops;
1392
1393	sock_init_data(sock, sk);
1394
1395	sk->sk_family = PF_XDP;
1396
1397	sk->sk_destruct = xsk_destruct;
1398	sk_refcnt_debug_inc(sk);
1399
1400	sock_set_flag(sk, SOCK_RCU_FREE);
1401
1402	xs = xdp_sk(sk);
1403	xs->state = XSK_READY;
1404	mutex_init(&xs->mutex);
1405	spin_lock_init(&xs->rx_lock);
1406
1407	INIT_LIST_HEAD(&xs->map_list);
1408	spin_lock_init(&xs->map_list_lock);
1409
1410	mutex_lock(&net->xdp.lock);
1411	sk_add_node_rcu(sk, &net->xdp.list);
1412	mutex_unlock(&net->xdp.lock);
1413
1414	local_bh_disable();
1415	sock_prot_inuse_add(net, &xsk_proto, 1);
1416	local_bh_enable();
1417
1418	return 0;
1419}
1420
1421static const struct net_proto_family xsk_family_ops = {
1422	.family = PF_XDP,
1423	.create = xsk_create,
1424	.owner	= THIS_MODULE,
1425};
1426
1427static struct notifier_block xsk_netdev_notifier = {
1428	.notifier_call	= xsk_notifier,
1429};
1430
1431static int __net_init xsk_net_init(struct net *net)
1432{
1433	mutex_init(&net->xdp.lock);
1434	INIT_HLIST_HEAD(&net->xdp.list);
1435	return 0;
1436}
1437
1438static void __net_exit xsk_net_exit(struct net *net)
1439{
1440	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1441}
1442
1443static struct pernet_operations xsk_net_ops = {
1444	.init = xsk_net_init,
1445	.exit = xsk_net_exit,
1446};
1447
1448static int __init xsk_init(void)
1449{
1450	int err, cpu;
1451
1452	err = proto_register(&xsk_proto, 0 /* no slab */);
1453	if (err)
1454		goto out;
1455
1456	err = sock_register(&xsk_family_ops);
1457	if (err)
1458		goto out_proto;
1459
1460	err = register_pernet_subsys(&xsk_net_ops);
1461	if (err)
1462		goto out_sk;
1463
1464	err = register_netdevice_notifier(&xsk_netdev_notifier);
1465	if (err)
1466		goto out_pernet;
1467
1468	for_each_possible_cpu(cpu)
1469		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1470	return 0;
1471
1472out_pernet:
1473	unregister_pernet_subsys(&xsk_net_ops);
1474out_sk:
1475	sock_unregister(PF_XDP);
1476out_proto:
1477	proto_unregister(&xsk_proto);
1478out:
1479	return err;
1480}
1481
1482fs_initcall(xsk_init);