   1/* SPDX-License-Identifier: GPL-2.0
   2 *
   3 * page_pool.c
   4 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
   5 *	Copyright (C) 2016 Red Hat, Inc.
   6 */
   7
   8#include <linux/error-injection.h>
   9#include <linux/types.h>
  10#include <linux/kernel.h>
  11#include <linux/slab.h>
  12#include <linux/device.h>
  13
  14#include <net/netdev_rx_queue.h>
  15#include <net/page_pool/helpers.h>
  16#include <net/xdp.h>
  17
  18#include <linux/dma-direction.h>
  19#include <linux/dma-mapping.h>
  20#include <linux/page-flags.h>
  21#include <linux/mm.h> /* for put_page() */
  22#include <linux/poison.h>
  23#include <linux/ethtool.h>
  24#include <linux/netdevice.h>
  25
  26#include <trace/events/page_pool.h>
  27
  28#include "mp_dmabuf_devmem.h"
  29#include "netmem_priv.h"
  30#include "page_pool_priv.h"
  31
  32DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
  33
  34#define DEFER_TIME (msecs_to_jiffies(1000))
  35#define DEFER_WARN_INTERVAL (60 * HZ)
  36
  37#define BIAS_MAX	(LONG_MAX >> 1)
  38
  39#ifdef CONFIG_PAGE_POOL_STATS
  40static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
  41
  42/* alloc_stat_inc is intended to be used in softirq context */
  43#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
  44/* recycle_stat_inc is safe to use when preemption is possible. */
  45#define recycle_stat_inc(pool, __stat)							\
  46	do {										\
  47		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
  48		this_cpu_inc(s->__stat);						\
  49	} while (0)
  50
  51#define recycle_stat_add(pool, __stat, val)						\
  52	do {										\
  53		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
  54		this_cpu_add(s->__stat, val);						\
  55	} while (0)
  56
  57static const char pp_stats[][ETH_GSTRING_LEN] = {
  58	"rx_pp_alloc_fast",
  59	"rx_pp_alloc_slow",
  60	"rx_pp_alloc_slow_ho",
  61	"rx_pp_alloc_empty",
  62	"rx_pp_alloc_refill",
  63	"rx_pp_alloc_waive",
  64	"rx_pp_recycle_cached",
  65	"rx_pp_recycle_cache_full",
  66	"rx_pp_recycle_ring",
  67	"rx_pp_recycle_ring_full",
  68	"rx_pp_recycle_released_ref",
  69};
  70
  71/**
  72 * page_pool_get_stats() - fetch page pool stats
  73 * @pool:	pool from which page was allocated
  74 * @stats:	struct page_pool_stats to fill in
  75 *
  76 * Retrieve statistics about the page_pool. This API is only available
  77 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
  78 * A pointer to a caller-allocated struct page_pool_stats structure
  79 * is passed to this API, which fills it in. The caller can then report
  80 * those stats to the user (perhaps via ethtool, debugfs, etc.).
  81 */
  82bool page_pool_get_stats(const struct page_pool *pool,
  83			 struct page_pool_stats *stats)
  84{
  85	int cpu = 0;
  86
  87	if (!stats)
  88		return false;
  89
  90	/* The caller is responsible for initializing stats. */
  91	stats->alloc_stats.fast += pool->alloc_stats.fast;
  92	stats->alloc_stats.slow += pool->alloc_stats.slow;
  93	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
  94	stats->alloc_stats.empty += pool->alloc_stats.empty;
  95	stats->alloc_stats.refill += pool->alloc_stats.refill;
  96	stats->alloc_stats.waive += pool->alloc_stats.waive;
  97
  98	for_each_possible_cpu(cpu) {
  99		const struct page_pool_recycle_stats *pcpu =
 100			per_cpu_ptr(pool->recycle_stats, cpu);
 101
 102		stats->recycle_stats.cached += pcpu->cached;
 103		stats->recycle_stats.cache_full += pcpu->cache_full;
 104		stats->recycle_stats.ring += pcpu->ring;
 105		stats->recycle_stats.ring_full += pcpu->ring_full;
 106		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
 107	}
 108
 109	return true;
 110}
 111EXPORT_SYMBOL(page_pool_get_stats);
 112
 113u8 *page_pool_ethtool_stats_get_strings(u8 *data)
 114{
 115	int i;
 116
 117	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
 118		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
 119		data += ETH_GSTRING_LEN;
 120	}
 121
 122	return data;
 123}
 124EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
 125
 126int page_pool_ethtool_stats_get_count(void)
 127{
 128	return ARRAY_SIZE(pp_stats);
 129}
 130EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
 131
 132u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
 133{
 134	const struct page_pool_stats *pool_stats = stats;
 135
 136	*data++ = pool_stats->alloc_stats.fast;
 137	*data++ = pool_stats->alloc_stats.slow;
 138	*data++ = pool_stats->alloc_stats.slow_high_order;
 139	*data++ = pool_stats->alloc_stats.empty;
 140	*data++ = pool_stats->alloc_stats.refill;
 141	*data++ = pool_stats->alloc_stats.waive;
 142	*data++ = pool_stats->recycle_stats.cached;
 143	*data++ = pool_stats->recycle_stats.cache_full;
 144	*data++ = pool_stats->recycle_stats.ring;
 145	*data++ = pool_stats->recycle_stats.ring_full;
 146	*data++ = pool_stats->recycle_stats.released_refcnt;
 147
 148	return data;
 149}
 150EXPORT_SYMBOL(page_pool_ethtool_stats_get);
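
/*
 * Illustrative sketch, not part of the upstream file: how a driver's ethtool
 * callbacks might combine the three helpers above with page_pool_get_stats().
 * The helper name and the single-pool assumption are hypothetical; real
 * drivers usually sum the stats over all of their RX-queue pools.  Like the
 * code above, this only builds when CONFIG_PAGE_POOL_STATS=y.
 */
static inline u64 *example_pp_fill_ethtool_stats(struct page_pool *pool,
						 u64 *data)
{
	struct page_pool_stats stats = { };

	/* Accumulate this pool's counters into the zeroed stats struct. */
	if (page_pool_get_stats(pool, &stats))
		data = page_pool_ethtool_stats_get(data, &stats);

	return data;
}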
 151
 152#else
 153#define alloc_stat_inc(pool, __stat)
 154#define recycle_stat_inc(pool, __stat)
 155#define recycle_stat_add(pool, __stat, val)
 156#endif
 157
 158static bool page_pool_producer_lock(struct page_pool *pool)
 159	__acquires(&pool->ring.producer_lock)
 160{
 161	bool in_softirq = in_softirq();
 162
 163	if (in_softirq)
 164		spin_lock(&pool->ring.producer_lock);
 165	else
 166		spin_lock_bh(&pool->ring.producer_lock);
 167
 168	return in_softirq;
 169}
 170
 171static void page_pool_producer_unlock(struct page_pool *pool,
 172				      bool in_softirq)
 173	__releases(&pool->ring.producer_lock)
 174{
 175	if (in_softirq)
 176		spin_unlock(&pool->ring.producer_lock);
 177	else
 178		spin_unlock_bh(&pool->ring.producer_lock);
 179}
 180
 181static void page_pool_struct_check(void)
 182{
 183	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
 184	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
 185	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
 186	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
 187				    PAGE_POOL_FRAG_GROUP_ALIGN);
 188}
 189
 190static int page_pool_init(struct page_pool *pool,
 191			  const struct page_pool_params *params,
 192			  int cpuid)
 193{
 194	unsigned int ring_qsize = 1024; /* Default */
 195	struct netdev_rx_queue *rxq;
 196	int err;
 197
 198	page_pool_struct_check();
 199
 200	memcpy(&pool->p, &params->fast, sizeof(pool->p));
 201	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
 202
 203	pool->cpuid = cpuid;
 204
 205	/* Validate only known flags were used */
 206	if (pool->slow.flags & ~PP_FLAG_ALL)
 207		return -EINVAL;
 208
 209	if (pool->p.pool_size)
 210		ring_qsize = pool->p.pool_size;
 211
 212	/* Sanity limit mem that can be pinned down */
 213	if (ring_qsize > 32768)
 214		return -E2BIG;
 215
 216	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
 217	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA sending,
 218	 * which is the XDP_TX use-case.
 219	 */
 220	if (pool->slow.flags & PP_FLAG_DMA_MAP) {
 221		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
 222		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
 223			return -EINVAL;
 224
 225		pool->dma_map = true;
 226	}
 227
 228	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
 229		/* In order to request DMA-sync-for-device the page
 230		 * needs to be mapped
 231		 */
 232		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
 233			return -EINVAL;
 234
 235		if (!pool->p.max_len)
 236			return -EINVAL;
 237
 238		pool->dma_sync = true;
 239
 240		/* pool->p.offset has to be set according to the address
 241		 * offset used by the DMA engine to start copying rx data
 242		 */
 243	}
 244
 245	pool->has_init_callback = !!pool->slow.init_callback;
 246
 247#ifdef CONFIG_PAGE_POOL_STATS
 248	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
 249		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
 250		if (!pool->recycle_stats)
 251			return -ENOMEM;
 252	} else {
 253		/* For the system page pool instance we use a single stats object
 254		 * instead of allocating a separate percpu variable for each
 255		 * (also percpu) page pool instance.
 256		 */
 257		pool->recycle_stats = &pp_system_recycle_stats;
 258		pool->system = true;
 259	}
 260#endif
 261
 262	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
 263#ifdef CONFIG_PAGE_POOL_STATS
 264		if (!pool->system)
 265			free_percpu(pool->recycle_stats);
 266#endif
 267		return -ENOMEM;
 268	}
 269
 270	atomic_set(&pool->pages_state_release_cnt, 0);
 271
 272	/* The driver calling page_pool_create() must also call page_pool_destroy() */
 273	refcount_set(&pool->user_cnt, 1);
 274
 275	if (pool->dma_map)
 276		get_device(pool->p.dev);
 277
 278	if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
 279		/* We rely on rtnl_lock()ing to make sure netdev_rx_queue
 280		 * configuration doesn't change while we're initializing
 281		 * the page_pool.
 282		 */
 283		ASSERT_RTNL();
 284		rxq = __netif_get_rx_queue(pool->slow.netdev,
 285					   pool->slow.queue_idx);
 286		pool->mp_priv = rxq->mp_params.mp_priv;
 287	}
 288
 289	if (pool->mp_priv) {
 290		err = mp_dmabuf_devmem_init(pool);
 291		if (err) {
 292			pr_warn("%s() mem-provider init failed %d\n", __func__,
 293				err);
 294			goto free_ptr_ring;
 295		}
 296
 297		static_branch_inc(&page_pool_mem_providers);
 298	}
 299
 300	return 0;
 301
 302free_ptr_ring:
 303	ptr_ring_cleanup(&pool->ring, NULL);
 304#ifdef CONFIG_PAGE_POOL_STATS
 305	if (!pool->system)
 306		free_percpu(pool->recycle_stats);
 307#endif
 308	return err;
 309}
 310
 311static void page_pool_uninit(struct page_pool *pool)
 312{
 313	ptr_ring_cleanup(&pool->ring, NULL);
 314
 315	if (pool->dma_map)
 316		put_device(pool->p.dev);
 317
 318#ifdef CONFIG_PAGE_POOL_STATS
 319	if (!pool->system)
 320		free_percpu(pool->recycle_stats);
 321#endif
 322}
 323
 324/**
 325 * page_pool_create_percpu() - create a page pool for a given cpu.
 326 * @params: parameters, see struct page_pool_params
 327 * @cpuid: cpu identifier
 328 */
 329struct page_pool *
 330page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
 331{
 332	struct page_pool *pool;
 333	int err;
 334
 335	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
 336	if (!pool)
 337		return ERR_PTR(-ENOMEM);
 338
 339	err = page_pool_init(pool, params, cpuid);
 340	if (err < 0)
 341		goto err_free;
 342
 343	err = page_pool_list(pool);
 344	if (err)
 345		goto err_uninit;
 346
 347	return pool;
 348
 349err_uninit:
 350	page_pool_uninit(pool);
 351err_free:
 352	pr_warn("%s() gave up with errno %d\n", __func__, err);
 353	kfree(pool);
 354	return ERR_PTR(err);
 355}
 356EXPORT_SYMBOL(page_pool_create_percpu);
 357
 358/**
 359 * page_pool_create() - create a page pool
 360 * @params: parameters, see struct page_pool_params
 361 */
 362struct page_pool *page_pool_create(const struct page_pool_params *params)
 363{
 364	return page_pool_create_percpu(params, -1);
 365}
 366EXPORT_SYMBOL(page_pool_create);
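
/*
 * Illustrative sketch, not part of the upstream file: the parameters a typical
 * RX driver might pass to page_pool_create().  The function name and its
 * arguments are hypothetical placeholders; only the page_pool_params fields
 * and the PP_FLAG_* flags come from the real API.
 */
static inline struct page_pool *
example_create_rx_page_pool(struct device *dev, struct napi_struct *napi,
			    unsigned int rx_ring_size, int numa_node)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
		.order		= 0,			/* order-0 pages */
		.pool_size	= rx_ring_size,		/* capped at 32768 above */
		.nid		= numa_node,
		.dev		= dev,			/* device doing the DMA */
		.napi		= napi,			/* enables direct recycling */
		.dma_dir	= DMA_FROM_DEVICE,	/* RX only, no XDP_TX */
		.max_len	= PAGE_SIZE,		/* sync the whole page */
		.offset		= 0,			/* DMA starts at offset 0 */
	};

	/* Returns a valid pool, or an ERR_PTR() value on failure. */
	return page_pool_create(&pp_params);
}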
 367
 368static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
 369
 370static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
 371{
 372	struct ptr_ring *r = &pool->ring;
 373	netmem_ref netmem;
 374	int pref_nid; /* preferred NUMA node */
 375
 376	/* Quicker fallback, avoid locks when ring is empty */
 377	if (__ptr_ring_empty(r)) {
 378		alloc_stat_inc(pool, empty);
 379		return 0;
 380	}
 381
 382	/* Softirq guarantees the CPU and thus the NUMA node are stable. This
 383	 * assumes the CPU refilling the driver RX-ring will also run RX-NAPI.
 384	 */
 385#ifdef CONFIG_NUMA
 386	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
 387#else
 388	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
 389	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
 390#endif
 391
 392	/* Refill alloc array, but only if NUMA match */
 393	do {
 394		netmem = (__force netmem_ref)__ptr_ring_consume(r);
 395		if (unlikely(!netmem))
 396			break;
 397
 398		if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
 399			pool->alloc.cache[pool->alloc.count++] = netmem;
 400		} else {
 401			/* NUMA mismatch;
 402			 * (1) release 1 page to the page-allocator and
 403			 * (2) break out and fall through to alloc_pages_node().
 404			 * This limits stress on the page buddy allocator.
 405			 */
 406			page_pool_return_page(pool, netmem);
 407			alloc_stat_inc(pool, waive);
 408			netmem = 0;
 409			break;
 410		}
 411	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
 412
 413	/* Return last page */
 414	if (likely(pool->alloc.count > 0)) {
 415		netmem = pool->alloc.cache[--pool->alloc.count];
 416		alloc_stat_inc(pool, refill);
 417	}
 418
 419	return netmem;
 420}
 421
 422/* fast path */
 423static netmem_ref __page_pool_get_cached(struct page_pool *pool)
 424{
 425	netmem_ref netmem;
 426
 427	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
 428	if (likely(pool->alloc.count)) {
 429		/* Fast-path */
 430		netmem = pool->alloc.cache[--pool->alloc.count];
 431		alloc_stat_inc(pool, fast);
 432	} else {
 433		netmem = page_pool_refill_alloc_cache(pool);
 434	}
 435
 436	return netmem;
 437}
 438
 439static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
 440					    netmem_ref netmem,
 441					    u32 dma_sync_size)
 442{
 443#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
 444	dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
 445
 446	dma_sync_size = min(dma_sync_size, pool->p.max_len);
 447	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
 448				     dma_sync_size, pool->p.dma_dir);
 449#endif
 450}
 451
 452static __always_inline void
 453page_pool_dma_sync_for_device(const struct page_pool *pool,
 454			      netmem_ref netmem,
 455			      u32 dma_sync_size)
 456{
 457	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
 458		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
 459}
 460
 461static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
 462{
 463	dma_addr_t dma;
 464
 465	/* Set up the DMA mapping: use the 'struct page' area for storing the DMA
 466	 * address, since dma_addr_t can be either 32 or 64 bits and does not
 467	 * always fit into the page private data (i.e. 32-bit CPU with 64-bit DMA caps).
 468	 * This mapping is kept for the lifetime of the page, until it leaves the pool.
 469	 */
 470	dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
 471				 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
 472				 DMA_ATTR_SKIP_CPU_SYNC |
 473					 DMA_ATTR_WEAK_ORDERING);
 474	if (dma_mapping_error(pool->p.dev, dma))
 475		return false;
 476
 477	if (page_pool_set_dma_addr_netmem(netmem, dma))
 478		goto unmap_failed;
 479
 480	page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
 481
 482	return true;
 483
 484unmap_failed:
 485	WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
 486	dma_unmap_page_attrs(pool->p.dev, dma,
 487			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 488			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
 489	return false;
 490}
 491
 492static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 493						 gfp_t gfp)
 494{
 495	struct page *page;
 496
 497	gfp |= __GFP_COMP;
 498	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
 499	if (unlikely(!page))
 500		return NULL;
 501
 502	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
 503		put_page(page);
 504		return NULL;
 505	}
 506
 507	alloc_stat_inc(pool, slow_high_order);
 508	page_pool_set_pp_info(pool, page_to_netmem(page));
 509
 510	/* Track how many pages are held 'in-flight' */
 511	pool->pages_state_hold_cnt++;
 512	trace_page_pool_state_hold(pool, page_to_netmem(page),
 513				   pool->pages_state_hold_cnt);
 514	return page;
 515}
 516
 517/* slow path */
 518static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
 519							gfp_t gfp)
 520{
 521	const int bulk = PP_ALLOC_CACHE_REFILL;
 522	unsigned int pp_order = pool->p.order;
 523	bool dma_map = pool->dma_map;
 524	netmem_ref netmem;
 525	int i, nr_pages;
 526
 527	/* Don't support bulk alloc for high-order pages */
 528	if (unlikely(pp_order))
 529		return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
 530
 531	/* Unnecessary as alloc cache is empty, but guarantees zero count */
 532	if (unlikely(pool->alloc.count > 0))
 533		return pool->alloc.cache[--pool->alloc.count];
 534
 535	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
 536	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
 537
 538	nr_pages = alloc_pages_bulk_array_node(gfp,
 539					       pool->p.nid, bulk,
 540					       (struct page **)pool->alloc.cache);
 541	if (unlikely(!nr_pages))
 542		return 0;
 543
 544	/* Pages have been filled into the alloc.cache array, but the count is
 545	 * zero and the page elements have not been DMA mapped yet (if required).
 546	 */
 547	for (i = 0; i < nr_pages; i++) {
 548		netmem = pool->alloc.cache[i];
 549		if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
 550			put_page(netmem_to_page(netmem));
 551			continue;
 552		}
 553
 554		page_pool_set_pp_info(pool, netmem);
 555		pool->alloc.cache[pool->alloc.count++] = netmem;
 556		/* Track how many pages are held 'in-flight' */
 557		pool->pages_state_hold_cnt++;
 558		trace_page_pool_state_hold(pool, netmem,
 559					   pool->pages_state_hold_cnt);
 560	}
 561
 562	/* Return last page */
 563	if (likely(pool->alloc.count > 0)) {
 564		netmem = pool->alloc.cache[--pool->alloc.count];
 565		alloc_stat_inc(pool, slow);
 566	} else {
 567		netmem = 0;
 568	}
 569
 570	/* A page that was just allocated should/must have refcnt 1. */
 571	return netmem;
 572}
 573
 574/* Use page_pool to replace alloc_pages() API calls, but provide a
 575 * synchronization guarantee for the allocation side.
 576 */
 577netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
 578{
 579	netmem_ref netmem;
 580
 581	/* Fast-path: Get a page from cache */
 582	netmem = __page_pool_get_cached(pool);
 583	if (netmem)
 584		return netmem;
 585
 586	/* Slow-path: cache empty, do real allocation */
 587	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
 588		netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
 589	else
 590		netmem = __page_pool_alloc_pages_slow(pool, gfp);
 591	return netmem;
 592}
 593EXPORT_SYMBOL(page_pool_alloc_netmem);
 594
 595struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 596{
 597	return netmem_to_page(page_pool_alloc_netmem(pool, gfp));
 598}
 599EXPORT_SYMBOL(page_pool_alloc_pages);
 600ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
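
/*
 * Illustrative sketch, not part of the upstream file: an RX refill step
 * combining the allocation API above with the DMA address the pool stored at
 * map time.  The descriptor-posting callback is a hypothetical placeholder.
 */
static inline int
example_rx_refill_one(struct page_pool *pool,
		      void (*post_rx_desc)(dma_addr_t addr, void *priv),
		      void *priv)
{
	struct page *page;

	/* Served from the alloc cache or ptr_ring when possible. */
	page = page_pool_dev_alloc_pages(pool);
	if (unlikely(!page))
		return -ENOMEM;

	/* With PP_FLAG_DMA_MAP the pool already holds the DMA mapping. */
	post_rx_desc(page_pool_get_dma_addr(page), priv);
	return 0;
}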
 601
 602/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 603 *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 604 */
 605#define _distance(a, b)	(s32)((a) - (b))
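
/* For instance, if the u32 hold counter has wrapped around to 2 while
 * release_cnt is still U32_MAX, _distance(2, U32_MAX) evaluates to 3:
 * the true number of in-flight pages despite the wrap-around.
 */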
 606
 607s32 page_pool_inflight(const struct page_pool *pool, bool strict)
 608{
 609	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
 610	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
 611	s32 inflight;
 612
 613	inflight = _distance(hold_cnt, release_cnt);
 614
 615	if (strict) {
 616		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
 617		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
 618		     inflight);
 619	} else {
 620		inflight = max(0, inflight);
 621	}
 622
 623	return inflight;
 624}
 625
 626void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
 627{
 628	netmem_set_pp(netmem, pool);
 629	netmem_or_pp_magic(netmem, PP_SIGNATURE);
 630
 631	/* Ensuring all pages have been split into one fragment initially:
 632	 * page_pool_set_pp_info() is only called once for every page when it
 633	 * is allocated from the page allocator and page_pool_fragment_page()
 634	 * is dirtying the same cache line as the page->pp_magic above, so
 635	 * the overhead is negligible.
 636	 */
 637	page_pool_fragment_netmem(netmem, 1);
 638	if (pool->has_init_callback)
 639		pool->slow.init_callback(netmem, pool->slow.init_arg);
 640}
 641
 642void page_pool_clear_pp_info(netmem_ref netmem)
 643{
 644	netmem_clear_pp_magic(netmem);
 645	netmem_set_pp(netmem, NULL);
 646}
 647
 648static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
 649							 netmem_ref netmem)
 650{
 651	dma_addr_t dma;
 652
 653	if (!pool->dma_map)
 654		/* Always account for inflight pages, even if we didn't
 655		 * map them
 656		 */
 657		return;
 658
 659	dma = page_pool_get_dma_addr_netmem(netmem);
 660
 661	/* When page is unmapped, it cannot be returned to our pool */
 662	dma_unmap_page_attrs(pool->p.dev, dma,
 663			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 664			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
 665	page_pool_set_dma_addr_netmem(netmem, 0);
 666}
 667
 668/* Disconnects a page from a page_pool.  API users can have a need
 669 * to disconnect a page from its page_pool, to allow it to be used as
 670 * a regular page (that will eventually be returned to the normal
 671 * page-allocator via put_page()).
 672 */
 673void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
 674{
 675	int count;
 676	bool put;
 677
 678	put = true;
 679	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
 680		put = mp_dmabuf_devmem_release_page(pool, netmem);
 681	else
 682		__page_pool_release_page_dma(pool, netmem);
 683
 684	/* This may be the last page returned, releasing the pool, so
 685	 * it is not safe to reference pool afterwards.
 686	 */
 687	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
 688	trace_page_pool_state_release(pool, netmem, count);
 689
 690	if (put) {
 691		page_pool_clear_pp_info(netmem);
 692		put_page(netmem_to_page(netmem));
 693	}
 694	/* An optimization would be to call __free_pages(page, pool->p.order)
 695	 * knowing page is not part of page-cache (thus avoiding a
 696	 * __page_cache_release() call).
 697	 */
 698}
 699
 700static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
 701{
 702	int ret;
 703	/* BH protection not needed if the current context is softirq */
 704	if (in_softirq())
 705		ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
 706	else
 707		ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);
 708
 709	if (!ret) {
 710		recycle_stat_inc(pool, ring);
 711		return true;
 712	}
 713
 714	return false;
 715}
 716
 717/* Only allow direct recycling in special circumstances, into the
 718 * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
 719 *
 720 * Caller must provide appropriate safe context.
 721 */
 722static bool page_pool_recycle_in_cache(netmem_ref netmem,
 723				       struct page_pool *pool)
 724{
 725	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
 726		recycle_stat_inc(pool, cache_full);
 727		return false;
 728	}
 729
 730	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
 731	pool->alloc.cache[pool->alloc.count++] = netmem;
 732	recycle_stat_inc(pool, cached);
 733	return true;
 734}
 735
 736static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
 737{
 738	return netmem_is_net_iov(netmem) ||
 739	       (page_ref_count(netmem_to_page(netmem)) == 1 &&
 740		!page_is_pfmemalloc(netmem_to_page(netmem)));
 741}
 742
 743/* If the page refcnt == 1, this will try to recycle the page.
 744 * If pool->dma_sync is set, we'll try to sync the DMA area for
 745 * the configured size min(dma_sync_size, pool->max_len).
 746 * If the page refcnt != 1, then the page will be returned to the memory
 747 * subsystem.
 748 */
 749static __always_inline netmem_ref
 750__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
 751		     unsigned int dma_sync_size, bool allow_direct)
 752{
 753	lockdep_assert_no_hardirq();
 754
 755	/* This allocator is optimized for the XDP mode that uses
 756	 * one-frame-per-page, but has fallbacks that act like the
 757	 * regular page allocator APIs.
 758	 *
 759	 * refcnt == 1 means page_pool owns page, and can recycle it.
 760	 *
 761	 * A page is NOT reusable when it was allocated while the system was
 762	 * under memory pressure (page_is_pfmemalloc()).
 763	 */
 764	if (likely(__page_pool_page_can_be_recycled(netmem))) {
 765		/* Read barrier done in page_ref_count / READ_ONCE */
 766
 767		page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
 768
 769		if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
 770			return 0;
 771
 772		/* Page found as candidate for recycling */
 773		return netmem;
 774	}
 775
 776	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
 777	 *
 778	 * Many drivers split up the page into fragments, and some
 779	 * want to keep doing this to save memory and do refcnt based
 780	 * recycling. Support this use case too, to ease drivers
 781	 * switching between XDP/non-XDP.
 782	 *
 783	 * In case page_pool maintains the DMA mapping, the API user must
 784	 * call page_pool_put_page() once.  In this elevated refcnt
 785	 * case, the DMA mapping is unmapped/released, as the driver is
 786	 * likely doing refcnt-based recycle tricks, meaning another
 787	 * process will be invoking put_page().
 788	 */
 789	recycle_stat_inc(pool, released_refcnt);
 790	page_pool_return_page(pool, netmem);
 791
 792	return 0;
 793}
 794
 795static bool page_pool_napi_local(const struct page_pool *pool)
 796{
 797	const struct napi_struct *napi;
 798	u32 cpuid;
 799
 800	if (unlikely(!in_softirq()))
 801		return false;
 802
 803	/* Allow direct recycle if we have reasons to believe that we are
 804	 * in the same context as the consumer would run, so there's
 805	 * no possible race.
 806	 * __page_pool_put_page() makes sure we're not in hardirq context
 807	 * and interrupts are enabled prior to accessing the cache.
 808	 */
 809	cpuid = smp_processor_id();
 810	if (READ_ONCE(pool->cpuid) == cpuid)
 811		return true;
 812
 813	napi = READ_ONCE(pool->p.napi);
 814
 815	return napi && READ_ONCE(napi->list_owner) == cpuid;
 816}
 817
 818void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
 819				  unsigned int dma_sync_size, bool allow_direct)
 820{
 821	if (!allow_direct)
 822		allow_direct = page_pool_napi_local(pool);
 823
 824	netmem =
 825		__page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
 826	if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
 827		/* Recycle ring full, fall back to freeing the pages */
 828		recycle_stat_inc(pool, ring_full);
 829		page_pool_return_page(pool, netmem);
 830	}
 831}
 832EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
 833
 834void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
 835				unsigned int dma_sync_size, bool allow_direct)
 836{
 837	page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
 838				     allow_direct);
 839}
 840EXPORT_SYMBOL(page_pool_put_unrefed_page);
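
/*
 * Illustrative sketch, not part of the upstream file: the two common ways a
 * driver hands a full page back, using page_pool_recycle_direct() and
 * page_pool_put_full_page() from <net/page_pool/helpers.h>.  The function
 * name and the in_own_napi flag are hypothetical.
 */
static inline void example_rx_drop(struct page_pool *pool, struct page *page,
				   bool in_own_napi)
{
	if (in_own_napi)
		/* e.g. XDP_DROP in the pool's own RX-NAPI: direct recycle. */
		page_pool_recycle_direct(pool, page);
	else
		/* Generic context: let the core pick cache, ring or free. */
		page_pool_put_full_page(pool, page, false);
}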
 841
 842/**
 843 * page_pool_put_page_bulk() - release references on multiple pages
 844 * @pool:	pool from which pages were allocated
 845 * @data:	array holding page pointers
 846 * @count:	number of pages in @data
 847 *
 848 * Tries to refill a number of pages into the ptr_ring cache while holding
 849 * the ptr_ring producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
 850 * will release leftover pages to the page allocator.
 851 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
 852 * completion loop for the XDP_REDIRECT use case.
 853 *
 854 * Please note the caller must not use the data area after running
 855 * page_pool_put_page_bulk(), as this function overwrites it.
 856 */
 857void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 858			     int count)
 859{
 860	int i, bulk_len = 0;
 861	bool allow_direct;
 862	bool in_softirq;
 863
 864	allow_direct = page_pool_napi_local(pool);
 865
 866	for (i = 0; i < count; i++) {
 867		netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i]));
 868
 869		/* It is not the last user for the page frag case */
 870		if (!page_pool_is_last_ref(netmem))
 871			continue;
 872
 873		netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
 874		/* Approved for bulk recycling in ptr_ring cache */
 875		if (netmem)
 876			data[bulk_len++] = (__force void *)netmem;
 877	}
 878
 879	if (!bulk_len)
 880		return;
 881
 882	/* Bulk producer into ptr_ring page_pool cache */
 883	in_softirq = page_pool_producer_lock(pool);
 884	for (i = 0; i < bulk_len; i++) {
 885		if (__ptr_ring_produce(&pool->ring, data[i])) {
 886			/* ring full */
 887			recycle_stat_inc(pool, ring_full);
 888			break;
 889		}
 890	}
 891	recycle_stat_add(pool, ring, i);
 892	page_pool_producer_unlock(pool, in_softirq);
 893
 894	/* Hopefully all pages were returned into the ptr_ring */
 895	if (likely(i == bulk_len))
 896		return;
 897
 898	/* ptr_ring cache full, free remaining pages outside producer lock
 899	 * since put_page() with refcnt == 1 can be an expensive operation
 900	 */
 901	for (; i < bulk_len; i++)
 902		page_pool_return_page(pool, (__force netmem_ref)data[i]);
 903}
 904EXPORT_SYMBOL(page_pool_put_page_bulk);
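
/*
 * Illustrative sketch, not part of the upstream file: a TX-completion loop
 * batching page virtual addresses before a single bulk release, as the
 * kernel-doc above suggests for XDP_REDIRECT.  The batch size and function
 * name are hypothetical.
 */
#define EXAMPLE_PP_BULK	16

static inline void example_tx_complete(struct page_pool *pool,
				       void **frame_va, int completed)
{
	void *bulk[EXAMPLE_PP_BULK];
	int i, n = 0;

	for (i = 0; i < completed; i++) {
		bulk[n++] = frame_va[i];	/* page virtual address */
		if (n == EXAMPLE_PP_BULK) {
			page_pool_put_page_bulk(pool, bulk, n);
			n = 0;
		}
	}

	/* Flush the partially filled batch, if any. */
	if (n)
		page_pool_put_page_bulk(pool, bulk, n);
}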
 905
 906static netmem_ref page_pool_drain_frag(struct page_pool *pool,
 907				       netmem_ref netmem)
 908{
 909	long drain_count = BIAS_MAX - pool->frag_users;
 910
 911	/* Some user is still using the page frag */
 912	if (likely(page_pool_unref_netmem(netmem, drain_count)))
 913		return 0;
 914
 915	if (__page_pool_page_can_be_recycled(netmem)) {
 916		page_pool_dma_sync_for_device(pool, netmem, -1);
 917		return netmem;
 918	}
 919
 920	page_pool_return_page(pool, netmem);
 921	return 0;
 922}
 923
 924static void page_pool_free_frag(struct page_pool *pool)
 925{
 926	long drain_count = BIAS_MAX - pool->frag_users;
 927	netmem_ref netmem = pool->frag_page;
 928
 929	pool->frag_page = 0;
 930
 931	if (!netmem || page_pool_unref_netmem(netmem, drain_count))
 932		return;
 933
 934	page_pool_return_page(pool, netmem);
 935}
 936
 937netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
 938				       unsigned int *offset, unsigned int size,
 939				       gfp_t gfp)
 940{
 941	unsigned int max_size = PAGE_SIZE << pool->p.order;
 942	netmem_ref netmem = pool->frag_page;
 943
 944	if (WARN_ON(size > max_size))
 945		return 0;
 946
 947	size = ALIGN(size, dma_get_cache_alignment());
 948	*offset = pool->frag_offset;
 949
 950	if (netmem && *offset + size > max_size) {
 951		netmem = page_pool_drain_frag(pool, netmem);
 952		if (netmem) {
 953			recycle_stat_inc(pool, cached);
 954			alloc_stat_inc(pool, fast);
 955			goto frag_reset;
 956		}
 957	}
 958
 959	if (!netmem) {
 960		netmem = page_pool_alloc_netmem(pool, gfp);
 961		if (unlikely(!netmem)) {
 962			pool->frag_page = 0;
 963			return 0;
 964		}
 965
 966		pool->frag_page = netmem;
 967
 968frag_reset:
 969		pool->frag_users = 1;
 970		*offset = 0;
 971		pool->frag_offset = size;
 972		page_pool_fragment_netmem(netmem, BIAS_MAX);
 973		return netmem;
 974	}
 975
 976	pool->frag_users++;
 977	pool->frag_offset = *offset + size;
 978	return netmem;
 979}
 980EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
 981
 982struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
 983				  unsigned int size, gfp_t gfp)
 984{
 985	return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
 986							  gfp));
 987}
 988EXPORT_SYMBOL(page_pool_alloc_frag);
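
/*
 * Illustrative sketch, not part of the upstream file: carving a small RX
 * buffer out of a (possibly shared) page with the frag API above.  The 2048
 * byte size and the helper name are arbitrary examples.
 */
static inline void *example_alloc_frag_buf(struct page_pool *pool,
					   dma_addr_t *dma)
{
	unsigned int offset;
	struct page *page;

	page = page_pool_alloc_frag(pool, &offset, 2048, GFP_ATOMIC);
	if (unlikely(!page))
		return NULL;

	/* Both the DMA address and the CPU pointer need the frag offset. */
	*dma = page_pool_get_dma_addr(page) + offset;
	return page_address(page) + offset;
}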
 989
 990static void page_pool_empty_ring(struct page_pool *pool)
 991{
 992	netmem_ref netmem;
 993
 994	/* Empty recycle ring */
 995	while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
 996		/* Verify the refcnt invariant of cached pages */
 997		if (!(netmem_ref_count(netmem) == 1))
 998			pr_crit("%s() page_pool refcnt %d violation\n",
 999				__func__, netmem_ref_count(netmem));
1000
1001		page_pool_return_page(pool, netmem);
1002	}
1003}
1004
1005static void __page_pool_destroy(struct page_pool *pool)
1006{
1007	if (pool->disconnect)
1008		pool->disconnect(pool);
1009
1010	page_pool_unlist(pool);
1011	page_pool_uninit(pool);
1012
1013	if (pool->mp_priv) {
1014		mp_dmabuf_devmem_destroy(pool);
1015		static_branch_dec(&page_pool_mem_providers);
1016	}
1017
1018	kfree(pool);
1019}
1020
1021static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
1022{
1023	netmem_ref netmem;
1024
1025	if (pool->destroy_cnt)
1026		return;
1027
1028	/* Empty the alloc cache; assume the caller made sure this is
1029	 * no longer in use, and that page_pool_alloc_pages() cannot be
1030	 * called concurrently.
1031	 */
1032	while (pool->alloc.count) {
1033		netmem = pool->alloc.cache[--pool->alloc.count];
1034		page_pool_return_page(pool, netmem);
1035	}
1036}
1037
1038static void page_pool_scrub(struct page_pool *pool)
1039{
1040	page_pool_empty_alloc_cache_once(pool);
1041	pool->destroy_cnt++;
1042
1043	/* No more consumers should exist, but producers could still
1044	 * be in-flight.
1045	 */
1046	page_pool_empty_ring(pool);
1047}
1048
1049static int page_pool_release(struct page_pool *pool)
1050{
1051	int inflight;
1052
1053	page_pool_scrub(pool);
1054	inflight = page_pool_inflight(pool, true);
1055	if (!inflight)
1056		__page_pool_destroy(pool);
1057
1058	return inflight;
1059}
1060
1061static void page_pool_release_retry(struct work_struct *wq)
1062{
1063	struct delayed_work *dwq = to_delayed_work(wq);
1064	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
1065	void *netdev;
1066	int inflight;
1067
1068	inflight = page_pool_release(pool);
1069	if (!inflight)
1070		return;
1071
1072	/* Periodic warning for page pools the user can't see */
1073	netdev = READ_ONCE(pool->slow.netdev);
1074	if (time_after_eq(jiffies, pool->defer_warn) &&
1075	    (!netdev || netdev == NET_PTR_POISON)) {
1076		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
1077
1078		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
1079			__func__, pool->user.id, inflight, sec);
1080		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
1081	}
1082
1083	/* Still not ready to be disconnected, retry later */
1084	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1085}
1086
1087void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
1088			   const struct xdp_mem_info *mem)
1089{
1090	refcount_inc(&pool->user_cnt);
1091	pool->disconnect = disconnect;
1092	pool->xdp_mem_id = mem->id;
1093}
1094
1095void page_pool_disable_direct_recycling(struct page_pool *pool)
1096{
1097	/* Disable direct recycling based on pool->cpuid.
1098	 * Paired with READ_ONCE() in page_pool_napi_local().
1099	 */
1100	WRITE_ONCE(pool->cpuid, -1);
1101
1102	if (!pool->p.napi)
1103		return;
1104
1105	/* To avoid races with recycling, and to avoid additional barriers,
1106	 * make sure pool and NAPI are unlinked while NAPI is disabled.
1107	 */
1108	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
1109	WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
1110
1111	mutex_lock(&page_pools_lock);
1112	WRITE_ONCE(pool->p.napi, NULL);
1113	mutex_unlock(&page_pools_lock);
1114}
1115EXPORT_SYMBOL(page_pool_disable_direct_recycling);
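
/*
 * Illustrative sketch, not part of the upstream file: a driver that keeps a
 * pool alive across a queue reconfiguration first quiesces the NAPI instance,
 * then detaches the pool from it, satisfying the WARN_ON() checks above.
 * The function name is hypothetical.
 */
static inline void example_quiesce_pool(struct napi_struct *napi,
					struct page_pool *pool)
{
	napi_disable(napi);	/* NAPI can no longer be scheduled */
	page_pool_disable_direct_recycling(pool);
}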
1116
1117void page_pool_destroy(struct page_pool *pool)
1118{
1119	if (!pool)
1120		return;
1121
1122	if (!page_pool_put(pool))
1123		return;
1124
1125	page_pool_disable_direct_recycling(pool);
1126	page_pool_free_frag(pool);
1127
1128	if (!page_pool_release(pool))
1129		return;
1130
1131	page_pool_detached(pool);
1132	pool->defer_start = jiffies;
1133	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
1134
1135	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
1136	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1137}
1138EXPORT_SYMBOL(page_pool_destroy);
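
/*
 * Illustrative sketch, not part of the upstream file: a common teardown order
 * for a driver that registered the pool as an XDP memory model.  Unregistering
 * the rxq typically drops the reference taken via page_pool_use_xdp_mem();
 * the driver's own reference then goes through page_pool_destroy().  The
 * function name is hypothetical.
 */
static inline void example_rxq_teardown(struct xdp_rxq_info *xdp_rxq,
					struct page_pool *pool)
{
	xdp_rxq_info_unreg(xdp_rxq);
	page_pool_destroy(pool);	/* may defer if pages are in flight */
}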
1139
1140/* Caller must provide appropriate safe context, e.g. NAPI. */
1141void page_pool_update_nid(struct page_pool *pool, int new_nid)
1142{
1143	netmem_ref netmem;
1144
1145	trace_page_pool_update_nid(pool, new_nid);
1146	pool->p.nid = new_nid;
1147
1148	/* Flush pool alloc cache, as refill will check NUMA node */
1149	while (pool->alloc.count) {
1150		netmem = pool->alloc.cache[--pool->alloc.count];
1151		page_pool_return_page(pool, netmem);
1152	}
1153}
1154EXPORT_SYMBOL(page_pool_update_nid);
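
/*
 * Illustrative sketch, not part of the upstream file: keeping the pool's
 * preferred NUMA node in sync from a driver's NAPI poll routine via the
 * page_pool_nid_changed() helper in <net/page_pool/helpers.h>, which calls
 * page_pool_update_nid() only when the node actually changed.
 */
static inline void example_napi_poll_prologue(struct page_pool *pool)
{
	/* Safe context: we are running in the pool's own RX-NAPI (softirq). */
	page_pool_nid_changed(pool, numa_mem_id());
}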