1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * page_pool.c
4 * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
5 * Copyright (C) 2016 Red Hat, Inc.
6 */
7
8#include <linux/error-injection.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/slab.h>
12#include <linux/device.h>
13
14#include <net/netdev_rx_queue.h>
15#include <net/page_pool/helpers.h>
16#include <net/xdp.h>
17
18#include <linux/dma-direction.h>
19#include <linux/dma-mapping.h>
20#include <linux/page-flags.h>
21#include <linux/mm.h> /* for put_page() */
22#include <linux/poison.h>
23#include <linux/ethtool.h>
24#include <linux/netdevice.h>
25
26#include <trace/events/page_pool.h>
27
28#include "mp_dmabuf_devmem.h"
29#include "netmem_priv.h"
30#include "page_pool_priv.h"
31
32DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
33
34#define DEFER_TIME (msecs_to_jiffies(1000))
35#define DEFER_WARN_INTERVAL (60 * HZ)
36
37#define BIAS_MAX (LONG_MAX >> 1)
38
39#ifdef CONFIG_PAGE_POOL_STATS
40static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
41
42/* alloc_stat_inc is intended to be used in softirq context */
43#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++)
44/* recycle_stat_inc is safe to use when preemption is possible. */
45#define recycle_stat_inc(pool, __stat) \
46 do { \
47 struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
48 this_cpu_inc(s->__stat); \
49 } while (0)
50
51#define recycle_stat_add(pool, __stat, val) \
52 do { \
53 struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
54 this_cpu_add(s->__stat, val); \
55 } while (0)
56
57static const char pp_stats[][ETH_GSTRING_LEN] = {
58 "rx_pp_alloc_fast",
59 "rx_pp_alloc_slow",
60 "rx_pp_alloc_slow_ho",
61 "rx_pp_alloc_empty",
62 "rx_pp_alloc_refill",
63 "rx_pp_alloc_waive",
64 "rx_pp_recycle_cached",
65 "rx_pp_recycle_cache_full",
66 "rx_pp_recycle_ring",
67 "rx_pp_recycle_ring_full",
68 "rx_pp_recycle_released_ref",
69};
70
71/**
72 * page_pool_get_stats() - fetch page pool stats
73 * @pool: pool from which page was allocated
74 * @stats: struct page_pool_stats to fill in
75 *
76 * Retrieve statistics about the page_pool. This API is only available
77 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
78 * A pointer to a caller allocated struct page_pool_stats structure
79 * is passed to this API which is filled in. The caller can then report
80 * those stats to the user (perhaps via ethtool, debugfs, etc.).
81 */
82bool page_pool_get_stats(const struct page_pool *pool,
83 struct page_pool_stats *stats)
84{
85 int cpu = 0;
86
87 if (!stats)
88 return false;
89
 90	/* The caller is responsible for initializing stats. */
91 stats->alloc_stats.fast += pool->alloc_stats.fast;
92 stats->alloc_stats.slow += pool->alloc_stats.slow;
93 stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
94 stats->alloc_stats.empty += pool->alloc_stats.empty;
95 stats->alloc_stats.refill += pool->alloc_stats.refill;
96 stats->alloc_stats.waive += pool->alloc_stats.waive;
97
98 for_each_possible_cpu(cpu) {
99 const struct page_pool_recycle_stats *pcpu =
100 per_cpu_ptr(pool->recycle_stats, cpu);
101
102 stats->recycle_stats.cached += pcpu->cached;
103 stats->recycle_stats.cache_full += pcpu->cache_full;
104 stats->recycle_stats.ring += pcpu->ring;
105 stats->recycle_stats.ring_full += pcpu->ring_full;
106 stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
107 }
108
109 return true;
110}
111EXPORT_SYMBOL(page_pool_get_stats);
112
113u8 *page_pool_ethtool_stats_get_strings(u8 *data)
114{
115 int i;
116
117 for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
118 memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
119 data += ETH_GSTRING_LEN;
120 }
121
122 return data;
123}
124EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
125
126int page_pool_ethtool_stats_get_count(void)
127{
128 return ARRAY_SIZE(pp_stats);
129}
130EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
131
132u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
133{
134 const struct page_pool_stats *pool_stats = stats;
135
136 *data++ = pool_stats->alloc_stats.fast;
137 *data++ = pool_stats->alloc_stats.slow;
138 *data++ = pool_stats->alloc_stats.slow_high_order;
139 *data++ = pool_stats->alloc_stats.empty;
140 *data++ = pool_stats->alloc_stats.refill;
141 *data++ = pool_stats->alloc_stats.waive;
142 *data++ = pool_stats->recycle_stats.cached;
143 *data++ = pool_stats->recycle_stats.cache_full;
144 *data++ = pool_stats->recycle_stats.ring;
145 *data++ = pool_stats->recycle_stats.ring_full;
146 *data++ = pool_stats->recycle_stats.released_refcnt;
147
148 return data;
149}
150EXPORT_SYMBOL(page_pool_ethtool_stats_get);
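/* Editorial example (not part of the kernel sources): a minimal sketch of how
 * a driver might wire the helpers above into its ethtool callbacks. The
 * my_priv structure and callback names are illustrative assumptions; only the
 * page_pool_* helpers come from this file.
 *
 *	static void my_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static void my_get_ethtool_stats(struct net_device *dev,
 *					 struct ethtool_stats *st, u64 *data)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats pp_stats = {};
 *
 *		// caller zero-initializes; the helper accumulates into it
 *		if (page_pool_get_stats(priv->rx_page_pool, &pp_stats))
 *			data = page_pool_ethtool_stats_get(data, &pp_stats);
 *	}
 */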
151
152#else
153#define alloc_stat_inc(pool, __stat)
154#define recycle_stat_inc(pool, __stat)
155#define recycle_stat_add(pool, __stat, val)
156#endif
157
158static bool page_pool_producer_lock(struct page_pool *pool)
159 __acquires(&pool->ring.producer_lock)
160{
161 bool in_softirq = in_softirq();
162
163 if (in_softirq)
164 spin_lock(&pool->ring.producer_lock);
165 else
166 spin_lock_bh(&pool->ring.producer_lock);
167
168 return in_softirq;
169}
170
171static void page_pool_producer_unlock(struct page_pool *pool,
172 bool in_softirq)
173 __releases(&pool->ring.producer_lock)
174{
175 if (in_softirq)
176 spin_unlock(&pool->ring.producer_lock);
177 else
178 spin_unlock_bh(&pool->ring.producer_lock);
179}
180
181static void page_pool_struct_check(void)
182{
183 CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
184 CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
185 CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
186 CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
187 PAGE_POOL_FRAG_GROUP_ALIGN);
188}
189
190static int page_pool_init(struct page_pool *pool,
191 const struct page_pool_params *params,
192 int cpuid)
193{
194 unsigned int ring_qsize = 1024; /* Default */
195 struct netdev_rx_queue *rxq;
196 int err;
197
198 page_pool_struct_check();
199
 200	memcpy(&pool->p, &params->fast, sizeof(pool->p));
 201	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
202
203 pool->cpuid = cpuid;
204
205 /* Validate only known flags were used */
206 if (pool->slow.flags & ~PP_FLAG_ALL)
207 return -EINVAL;
208
209 if (pool->p.pool_size)
210 ring_qsize = pool->p.pool_size;
211
212 /* Sanity limit mem that can be pinned down */
213 if (ring_qsize > 32768)
214 return -E2BIG;
215
216 /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
 217	 * DMA_BIDIRECTIONAL allows the page to be used for DMA sending,
218 * which is the XDP_TX use-case.
219 */
220 if (pool->slow.flags & PP_FLAG_DMA_MAP) {
221 if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
222 (pool->p.dma_dir != DMA_BIDIRECTIONAL))
223 return -EINVAL;
224
225 pool->dma_map = true;
226 }
227
228 if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
229 /* In order to request DMA-sync-for-device the page
230 * needs to be mapped
231 */
232 if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
233 return -EINVAL;
234
235 if (!pool->p.max_len)
236 return -EINVAL;
237
238 pool->dma_sync = true;
239
240 /* pool->p.offset has to be set according to the address
241 * offset used by the DMA engine to start copying rx data
242 */
243 }
244
245 pool->has_init_callback = !!pool->slow.init_callback;
246
247#ifdef CONFIG_PAGE_POOL_STATS
248 if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
249 pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
250 if (!pool->recycle_stats)
251 return -ENOMEM;
252 } else {
253 /* For system page pool instance we use a singular stats object
254 * instead of allocating a separate percpu variable for each
255 * (also percpu) page pool instance.
256 */
257 pool->recycle_stats = &pp_system_recycle_stats;
258 pool->system = true;
259 }
260#endif
261
262 if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
263#ifdef CONFIG_PAGE_POOL_STATS
264 if (!pool->system)
265 free_percpu(pool->recycle_stats);
266#endif
267 return -ENOMEM;
268 }
269
270 atomic_set(&pool->pages_state_release_cnt, 0);
271
 272	/* Driver calling page_pool_create() must also call page_pool_destroy() */
273 refcount_set(&pool->user_cnt, 1);
274
275 if (pool->dma_map)
276 get_device(pool->p.dev);
277
278 if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
279 /* We rely on rtnl_lock()ing to make sure netdev_rx_queue
280 * configuration doesn't change while we're initializing
281 * the page_pool.
282 */
283 ASSERT_RTNL();
284 rxq = __netif_get_rx_queue(pool->slow.netdev,
285 pool->slow.queue_idx);
286 pool->mp_priv = rxq->mp_params.mp_priv;
287 }
288
289 if (pool->mp_priv) {
290 err = mp_dmabuf_devmem_init(pool);
291 if (err) {
292 pr_warn("%s() mem-provider init failed %d\n", __func__,
293 err);
294 goto free_ptr_ring;
295 }
296
297 static_branch_inc(&page_pool_mem_providers);
298 }
299
300 return 0;
301
302free_ptr_ring:
303 ptr_ring_cleanup(&pool->ring, NULL);
304#ifdef CONFIG_PAGE_POOL_STATS
305 if (!pool->system)
306 free_percpu(pool->recycle_stats);
307#endif
308 return err;
309}
310
311static void page_pool_uninit(struct page_pool *pool)
312{
313 ptr_ring_cleanup(&pool->ring, NULL);
314
315 if (pool->dma_map)
316 put_device(pool->p.dev);
317
318#ifdef CONFIG_PAGE_POOL_STATS
319 if (!pool->system)
320 free_percpu(pool->recycle_stats);
321#endif
322}
323
324/**
325 * page_pool_create_percpu() - create a page pool for a given cpu.
326 * @params: parameters, see struct page_pool_params
327 * @cpuid: cpu identifier
328 */
329struct page_pool *
330page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
331{
332 struct page_pool *pool;
333 int err;
334
335 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
336 if (!pool)
337 return ERR_PTR(-ENOMEM);
338
339 err = page_pool_init(pool, params, cpuid);
340 if (err < 0)
341 goto err_free;
342
343 err = page_pool_list(pool);
344 if (err)
345 goto err_uninit;
346
347 return pool;
348
349err_uninit:
350 page_pool_uninit(pool);
351err_free:
352 pr_warn("%s() gave up with errno %d\n", __func__, err);
353 kfree(pool);
354 return ERR_PTR(err);
355}
356EXPORT_SYMBOL(page_pool_create_percpu);
357
358/**
359 * page_pool_create() - create a page pool
360 * @params: parameters, see struct page_pool_params
361 */
362struct page_pool *page_pool_create(const struct page_pool_params *params)
363{
364 return page_pool_create_percpu(params, -1);
365}
366EXPORT_SYMBOL(page_pool_create);
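/* Editorial example: a hedged sketch of typical driver setup for an RX queue.
 * The pdev/rxq variables are placeholders; the fields shown are the
 * struct page_pool_params members consumed by page_pool_init() above.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= 1024,		// 0 would fall back to the default
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,	// device that performs the DMA
 *		.napi		= &rxq->napi,	// enables lockless direct recycling
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.max_len	= PAGE_SIZE,	// largest area synced for_device
 *		.offset		= 0,		// where RX data starts in the page
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */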
367
368static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
369
370static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
371{
372 struct ptr_ring *r = &pool->ring;
373 netmem_ref netmem;
374 int pref_nid; /* preferred NUMA node */
375
376 /* Quicker fallback, avoid locks when ring is empty */
377 if (__ptr_ring_empty(r)) {
378 alloc_stat_inc(pool, empty);
379 return 0;
380 }
381
 382	/* Softirq guarantees the CPU and thus the NUMA node are stable. This
 383	 * assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
384 */
385#ifdef CONFIG_NUMA
386 pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
387#else
388 /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
389 pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
390#endif
391
392 /* Refill alloc array, but only if NUMA match */
393 do {
394 netmem = (__force netmem_ref)__ptr_ring_consume(r);
395 if (unlikely(!netmem))
396 break;
397
398 if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
399 pool->alloc.cache[pool->alloc.count++] = netmem;
400 } else {
401 /* NUMA mismatch;
402 * (1) release 1 page to page-allocator and
403 * (2) break out to fallthrough to alloc_pages_node.
 404			 * This limits stress on the page buddy allocator.
405 */
406 page_pool_return_page(pool, netmem);
407 alloc_stat_inc(pool, waive);
408 netmem = 0;
409 break;
410 }
411 } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
412
413 /* Return last page */
414 if (likely(pool->alloc.count > 0)) {
415 netmem = pool->alloc.cache[--pool->alloc.count];
416 alloc_stat_inc(pool, refill);
417 }
418
419 return netmem;
420}
421
422/* fast path */
423static netmem_ref __page_pool_get_cached(struct page_pool *pool)
424{
425 netmem_ref netmem;
426
427 /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
428 if (likely(pool->alloc.count)) {
429 /* Fast-path */
430 netmem = pool->alloc.cache[--pool->alloc.count];
431 alloc_stat_inc(pool, fast);
432 } else {
433 netmem = page_pool_refill_alloc_cache(pool);
434 }
435
436 return netmem;
437}
438
439static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
440 netmem_ref netmem,
441 u32 dma_sync_size)
442{
443#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
444 dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
445
446 dma_sync_size = min(dma_sync_size, pool->p.max_len);
447 __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
448 dma_sync_size, pool->p.dma_dir);
449#endif
450}
451
452static __always_inline void
453page_pool_dma_sync_for_device(const struct page_pool *pool,
454 netmem_ref netmem,
455 u32 dma_sync_size)
456{
457 if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
458 __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
459}
460
461static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
462{
463 dma_addr_t dma;
464
465 /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
466 * since dma_addr_t can be either 32 or 64 bits and does not always fit
 467	 * into page private data (i.e. 32-bit CPU with 64-bit DMA caps).
 468	 * This mapping is kept for the lifetime of the page, until it leaves the pool.
469 */
470 dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
471 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
472 DMA_ATTR_SKIP_CPU_SYNC |
473 DMA_ATTR_WEAK_ORDERING);
474 if (dma_mapping_error(pool->p.dev, dma))
475 return false;
476
477 if (page_pool_set_dma_addr_netmem(netmem, dma))
478 goto unmap_failed;
479
480 page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
481
482 return true;
483
484unmap_failed:
485 WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
486 dma_unmap_page_attrs(pool->p.dev, dma,
487 PAGE_SIZE << pool->p.order, pool->p.dma_dir,
488 DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
489 return false;
490}
491
492static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
493 gfp_t gfp)
494{
495 struct page *page;
496
497 gfp |= __GFP_COMP;
498 page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
499 if (unlikely(!page))
500 return NULL;
501
502 if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
503 put_page(page);
504 return NULL;
505 }
506
507 alloc_stat_inc(pool, slow_high_order);
508 page_pool_set_pp_info(pool, page_to_netmem(page));
509
510 /* Track how many pages are held 'in-flight' */
511 pool->pages_state_hold_cnt++;
512 trace_page_pool_state_hold(pool, page_to_netmem(page),
513 pool->pages_state_hold_cnt);
514 return page;
515}
516
517/* slow path */
518static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
519 gfp_t gfp)
520{
521 const int bulk = PP_ALLOC_CACHE_REFILL;
522 unsigned int pp_order = pool->p.order;
523 bool dma_map = pool->dma_map;
524 netmem_ref netmem;
525 int i, nr_pages;
526
527 /* Don't support bulk alloc for high-order pages */
528 if (unlikely(pp_order))
529 return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
530
531 /* Unnecessary as alloc cache is empty, but guarantees zero count */
532 if (unlikely(pool->alloc.count > 0))
533 return pool->alloc.cache[--pool->alloc.count];
534
535 /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
536 memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
537
538 nr_pages = alloc_pages_bulk_array_node(gfp,
539 pool->p.nid, bulk,
540 (struct page **)pool->alloc.cache);
541 if (unlikely(!nr_pages))
542 return 0;
543
544 /* Pages have been filled into alloc.cache array, but count is zero and
 545	 * page elements have not yet been (possibly) DMA mapped.
546 */
547 for (i = 0; i < nr_pages; i++) {
548 netmem = pool->alloc.cache[i];
549 if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
550 put_page(netmem_to_page(netmem));
551 continue;
552 }
553
554 page_pool_set_pp_info(pool, netmem);
555 pool->alloc.cache[pool->alloc.count++] = netmem;
556 /* Track how many pages are held 'in-flight' */
557 pool->pages_state_hold_cnt++;
558 trace_page_pool_state_hold(pool, netmem,
559 pool->pages_state_hold_cnt);
560 }
561
562 /* Return last page */
563 if (likely(pool->alloc.count > 0)) {
564 netmem = pool->alloc.cache[--pool->alloc.count];
565 alloc_stat_inc(pool, slow);
566 } else {
567 netmem = 0;
568 }
569
 570	/* A page just alloc'ed should/must have refcnt 1. */
571 return netmem;
572}
573
574/* Use page_pool to replace alloc_pages() API calls, but provide a
575 * synchronization guarantee for the allocation side.
576 */
577netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
578{
579 netmem_ref netmem;
580
581 /* Fast-path: Get a page from cache */
582 netmem = __page_pool_get_cached(pool);
583 if (netmem)
584 return netmem;
585
586 /* Slow-path: cache empty, do real allocation */
587 if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
588 netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
589 else
590 netmem = __page_pool_alloc_pages_slow(pool, gfp);
591 return netmem;
592}
593EXPORT_SYMBOL(page_pool_alloc_netmem);
594
595struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
596{
597 return netmem_to_page(page_pool_alloc_netmem(pool, gfp));
598}
599EXPORT_SYMBOL(page_pool_alloc_pages);
600ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
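/* Editorial example: a hedged sketch of an RX-ring refill step using the
 * allocation API above. rx_desc is a placeholder descriptor; with
 * PP_FLAG_DMA_MAP set, page_pool_get_dma_addr() returns the mapping created
 * in page_pool_dma_map().
 *
 *	struct page *page = page_pool_alloc_pages(pool, GFP_ATOMIC);
 *
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	rx_desc->addr = cpu_to_le64(page_pool_get_dma_addr(page));
 */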
601
602/* Calculate distance between two u32 values, valid if distance is below 2^(31)
603 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
604 */
605#define _distance(a, b) (s32)((a) - (b))
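/* Worked example (editorial note): with u32 wraparound, hold_cnt = 3 (after
 * wrapping past U32_MAX) and release_cnt = 0xfffffffe give
 * _distance(3, 0xfffffffe) = (s32)(3 - 0xfffffffe) = 5, i.e. five pages are
 * still in flight even though hold_cnt is numerically smaller.
 */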
606
607s32 page_pool_inflight(const struct page_pool *pool, bool strict)
608{
609 u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
610 u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
611 s32 inflight;
612
613 inflight = _distance(hold_cnt, release_cnt);
614
615 if (strict) {
616 trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
617 WARN(inflight < 0, "Negative(%d) inflight packet-pages",
618 inflight);
619 } else {
620 inflight = max(0, inflight);
621 }
622
623 return inflight;
624}
625
626void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
627{
628 netmem_set_pp(netmem, pool);
629 netmem_or_pp_magic(netmem, PP_SIGNATURE);
630
631 /* Ensuring all pages have been split into one fragment initially:
632 * page_pool_set_pp_info() is only called once for every page when it
633 * is allocated from the page allocator and page_pool_fragment_page()
634 * is dirtying the same cache line as the page->pp_magic above, so
635 * the overhead is negligible.
636 */
637 page_pool_fragment_netmem(netmem, 1);
638 if (pool->has_init_callback)
639 pool->slow.init_callback(netmem, pool->slow.init_arg);
640}
641
642void page_pool_clear_pp_info(netmem_ref netmem)
643{
644 netmem_clear_pp_magic(netmem);
645 netmem_set_pp(netmem, NULL);
646}
647
648static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
649 netmem_ref netmem)
650{
651 dma_addr_t dma;
652
653 if (!pool->dma_map)
654 /* Always account for inflight pages, even if we didn't
655 * map them
656 */
657 return;
658
659 dma = page_pool_get_dma_addr_netmem(netmem);
660
661 /* When page is unmapped, it cannot be returned to our pool */
662 dma_unmap_page_attrs(pool->p.dev, dma,
663 PAGE_SIZE << pool->p.order, pool->p.dma_dir,
664 DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
665 page_pool_set_dma_addr_netmem(netmem, 0);
666}
667
668/* Disconnects a page (from a page_pool). API users can have a need
669 * to disconnect a page (from a page_pool), to allow it to be used as
670 * a regular page (that will eventually be returned to the normal
671 * page-allocator via put_page).
672 */
673void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
674{
675 int count;
676 bool put;
677
678 put = true;
679 if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
680 put = mp_dmabuf_devmem_release_page(pool, netmem);
681 else
682 __page_pool_release_page_dma(pool, netmem);
683
684 /* This may be the last page returned, releasing the pool, so
685 * it is not safe to reference pool afterwards.
686 */
687 count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
688 trace_page_pool_state_release(pool, netmem, count);
689
690 if (put) {
691 page_pool_clear_pp_info(netmem);
692 put_page(netmem_to_page(netmem));
693 }
694 /* An optimization would be to call __free_pages(page, pool->p.order)
695 * knowing page is not part of page-cache (thus avoiding a
696 * __page_cache_release() call).
697 */
698}
699
700static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
701{
702 int ret;
703 /* BH protection not needed if current is softirq */
704 if (in_softirq())
705 ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
706 else
707 ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);
708
709 if (!ret) {
710 recycle_stat_inc(pool, ring);
711 return true;
712 }
713
714 return false;
715}
716
717/* Only allow direct recycling in special circumstances, into the
718 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
719 *
720 * Caller must provide appropriate safe context.
721 */
722static bool page_pool_recycle_in_cache(netmem_ref netmem,
723 struct page_pool *pool)
724{
725 if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
726 recycle_stat_inc(pool, cache_full);
727 return false;
728 }
729
730 /* Caller MUST have verified/know (page_ref_count(page) == 1) */
731 pool->alloc.cache[pool->alloc.count++] = netmem;
732 recycle_stat_inc(pool, cached);
733 return true;
734}
735
736static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
737{
738 return netmem_is_net_iov(netmem) ||
739 (page_ref_count(netmem_to_page(netmem)) == 1 &&
740 !page_is_pfmemalloc(netmem_to_page(netmem)));
741}
742
743/* If the page refcnt == 1, this will try to recycle the page.
744 * If pool->dma_sync is set, we'll try to sync the DMA area for
745 * the configured size min(dma_sync_size, pool->max_len).
746 * If the page refcnt != 1, then the page will be returned to memory
747 * subsystem.
748 */
749static __always_inline netmem_ref
750__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
751 unsigned int dma_sync_size, bool allow_direct)
752{
753 lockdep_assert_no_hardirq();
754
755 /* This allocator is optimized for the XDP mode that uses
 756	 * one-frame-per-page, but has fallbacks that act like the
757 * regular page allocator APIs.
758 *
759 * refcnt == 1 means page_pool owns page, and can recycle it.
760 *
 761	 * page is NOT reusable when allocated while the system is under
762 * some pressure. (page_is_pfmemalloc)
763 */
764 if (likely(__page_pool_page_can_be_recycled(netmem))) {
765 /* Read barrier done in page_ref_count / READ_ONCE */
766
767 page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
768
769 if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
770 return 0;
771
772 /* Page found as candidate for recycling */
773 return netmem;
774 }
775
 776	/* Fallback/non-XDP mode: API user has an elevated refcnt.
777 *
778 * Many drivers split up the page into fragments, and some
779 * want to keep doing this to save memory and do refcnt based
780 * recycling. Support this use case too, to ease drivers
781 * switching between XDP/non-XDP.
782 *
 783	 * In case page_pool maintains the DMA mapping, the API user must
784 * call page_pool_put_page once. In this elevated refcnt
785 * case, the DMA is unmapped/released, as driver is likely
786 * doing refcnt based recycle tricks, meaning another process
787 * will be invoking put_page.
788 */
789 recycle_stat_inc(pool, released_refcnt);
790 page_pool_return_page(pool, netmem);
791
792 return 0;
793}
794
795static bool page_pool_napi_local(const struct page_pool *pool)
796{
797 const struct napi_struct *napi;
798 u32 cpuid;
799
800 if (unlikely(!in_softirq()))
801 return false;
802
803 /* Allow direct recycle if we have reasons to believe that we are
804 * in the same context as the consumer would run, so there's
805 * no possible race.
806 * __page_pool_put_page() makes sure we're not in hardirq context
807 * and interrupts are enabled prior to accessing the cache.
808 */
809 cpuid = smp_processor_id();
810 if (READ_ONCE(pool->cpuid) == cpuid)
811 return true;
812
813 napi = READ_ONCE(pool->p.napi);
814
815 return napi && READ_ONCE(napi->list_owner) == cpuid;
816}
817
818void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
819 unsigned int dma_sync_size, bool allow_direct)
820{
821 if (!allow_direct)
822 allow_direct = page_pool_napi_local(pool);
823
824 netmem =
825 __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
826 if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
827 /* Cache full, fallback to free pages */
828 recycle_stat_inc(pool, ring_full);
829 page_pool_return_page(pool, netmem);
830 }
831}
832EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
833
834void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
835 unsigned int dma_sync_size, bool allow_direct)
836{
837 page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
838 allow_direct);
839}
840EXPORT_SYMBOL(page_pool_put_unrefed_page);
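/* Editorial example: drivers normally go through the page_pool_put_page() /
 * page_pool_put_full_page() helpers, which drop one fragment reference and
 * only call the functions above for the last reference. A hedged sketch of
 * an XDP_DROP fast path running in the RX NAPI poller:
 *
 *	case XDP_DROP:
 *		// allow_direct = true: we are in the pool's NAPI context
 *		page_pool_put_full_page(pool, page, true);
 *		break;
 */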
841
842/**
843 * page_pool_put_page_bulk() - release references on multiple pages
844 * @pool: pool from which pages were allocated
845 * @data: array holding page pointers
846 * @count: number of pages in @data
847 *
 848 * Tries to refill a number of pages into the ptr_ring cache while holding the
 849 * ptr_ring producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
850 * will release leftover pages to the page allocator.
851 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
852 * completion loop for the XDP_REDIRECT use case.
853 *
854 * Please note the caller must not use data area after running
855 * page_pool_put_page_bulk(), as this function overwrites it.
856 */
857void page_pool_put_page_bulk(struct page_pool *pool, void **data,
858 int count)
859{
860 int i, bulk_len = 0;
861 bool allow_direct;
862 bool in_softirq;
863
864 allow_direct = page_pool_napi_local(pool);
865
866 for (i = 0; i < count; i++) {
867 netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i]));
868
869 /* It is not the last user for the page frag case */
870 if (!page_pool_is_last_ref(netmem))
871 continue;
872
873 netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
874 /* Approved for bulk recycling in ptr_ring cache */
875 if (netmem)
876 data[bulk_len++] = (__force void *)netmem;
877 }
878
879 if (!bulk_len)
880 return;
881
882 /* Bulk producer into ptr_ring page_pool cache */
883 in_softirq = page_pool_producer_lock(pool);
884 for (i = 0; i < bulk_len; i++) {
885 if (__ptr_ring_produce(&pool->ring, data[i])) {
886 /* ring full */
887 recycle_stat_inc(pool, ring_full);
888 break;
889 }
890 }
891 recycle_stat_add(pool, ring, i);
892 page_pool_producer_unlock(pool, in_softirq);
893
 894	/* Hopefully all pages were returned into ptr_ring */
895 if (likely(i == bulk_len))
896 return;
897
898 /* ptr_ring cache full, free remaining pages outside producer lock
899 * since put_page() with refcnt == 1 can be an expensive operation
900 */
901 for (; i < bulk_len; i++)
902 page_pool_return_page(pool, (__force netmem_ref)data[i]);
903}
904EXPORT_SYMBOL(page_pool_put_page_bulk);
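/* Editorial example: a hedged sketch of bulk freeing from a TX completion
 * loop handling XDP_REDIRECT'ed frames. tx_cq_pop() and the array size are
 * placeholders; data[] holds kernel virtual addresses of the buffers, as
 * expected by the virt_to_head_page() call above.
 *
 *	void *data[16];
 *	int n = 0;
 *
 *	while (n < ARRAY_SIZE(data) && (data[n] = tx_cq_pop(txq)))
 *		n++;
 *	if (n)
 *		page_pool_put_page_bulk(pool, data, n);
 */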
905
906static netmem_ref page_pool_drain_frag(struct page_pool *pool,
907 netmem_ref netmem)
908{
909 long drain_count = BIAS_MAX - pool->frag_users;
910
911 /* Some user is still using the page frag */
912 if (likely(page_pool_unref_netmem(netmem, drain_count)))
913 return 0;
914
915 if (__page_pool_page_can_be_recycled(netmem)) {
916 page_pool_dma_sync_for_device(pool, netmem, -1);
917 return netmem;
918 }
919
920 page_pool_return_page(pool, netmem);
921 return 0;
922}
923
924static void page_pool_free_frag(struct page_pool *pool)
925{
926 long drain_count = BIAS_MAX - pool->frag_users;
927 netmem_ref netmem = pool->frag_page;
928
929 pool->frag_page = 0;
930
931 if (!netmem || page_pool_unref_netmem(netmem, drain_count))
932 return;
933
934 page_pool_return_page(pool, netmem);
935}
936
937netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
938 unsigned int *offset, unsigned int size,
939 gfp_t gfp)
940{
941 unsigned int max_size = PAGE_SIZE << pool->p.order;
942 netmem_ref netmem = pool->frag_page;
943
944 if (WARN_ON(size > max_size))
945 return 0;
946
947 size = ALIGN(size, dma_get_cache_alignment());
948 *offset = pool->frag_offset;
949
950 if (netmem && *offset + size > max_size) {
951 netmem = page_pool_drain_frag(pool, netmem);
952 if (netmem) {
953 recycle_stat_inc(pool, cached);
954 alloc_stat_inc(pool, fast);
955 goto frag_reset;
956 }
957 }
958
959 if (!netmem) {
960 netmem = page_pool_alloc_netmem(pool, gfp);
961 if (unlikely(!netmem)) {
962 pool->frag_page = 0;
963 return 0;
964 }
965
966 pool->frag_page = netmem;
967
968frag_reset:
969 pool->frag_users = 1;
970 *offset = 0;
971 pool->frag_offset = size;
972 page_pool_fragment_netmem(netmem, BIAS_MAX);
973 return netmem;
974 }
975
976 pool->frag_users++;
977 pool->frag_offset = *offset + size;
978 return netmem;
979}
980EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
981
982struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
983 unsigned int size, gfp_t gfp)
984{
985 return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
986 gfp));
987}
988EXPORT_SYMBOL(page_pool_alloc_frag);
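/* Editorial example: a hedged sketch of the frag API for sub-page RX buffers.
 * Several buffers share one page of pool->p.order; each returned fragment is
 * later released through the usual page_pool_put_*() path, just like a full
 * page. rx_buf is a placeholder.
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_alloc_frag(pool, &offset, 2048, GFP_ATOMIC);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	rx_buf->page   = page;
 *	rx_buf->offset = offset;	// buffer data starts at this offset
 */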
989
990static void page_pool_empty_ring(struct page_pool *pool)
991{
992 netmem_ref netmem;
993
994 /* Empty recycle ring */
995 while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
996 /* Verify the refcnt invariant of cached pages */
997 if (!(netmem_ref_count(netmem) == 1))
998 pr_crit("%s() page_pool refcnt %d violation\n",
999 __func__, netmem_ref_count(netmem));
1000
1001 page_pool_return_page(pool, netmem);
1002 }
1003}
1004
1005static void __page_pool_destroy(struct page_pool *pool)
1006{
1007 if (pool->disconnect)
1008 pool->disconnect(pool);
1009
1010 page_pool_unlist(pool);
1011 page_pool_uninit(pool);
1012
1013 if (pool->mp_priv) {
1014 mp_dmabuf_devmem_destroy(pool);
1015 static_branch_dec(&page_pool_mem_providers);
1016 }
1017
1018 kfree(pool);
1019}
1020
1021static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
1022{
1023 netmem_ref netmem;
1024
1025 if (pool->destroy_cnt)
1026 return;
1027
1028 /* Empty alloc cache, assume caller made sure this is
 1029	 * no longer in use, and page_pool_alloc_pages() cannot be
 1030	 * called concurrently.
1031 */
1032 while (pool->alloc.count) {
1033 netmem = pool->alloc.cache[--pool->alloc.count];
1034 page_pool_return_page(pool, netmem);
1035 }
1036}
1037
1038static void page_pool_scrub(struct page_pool *pool)
1039{
1040 page_pool_empty_alloc_cache_once(pool);
1041 pool->destroy_cnt++;
1042
1043 /* No more consumers should exist, but producers could still
1044 * be in-flight.
1045 */
1046 page_pool_empty_ring(pool);
1047}
1048
1049static int page_pool_release(struct page_pool *pool)
1050{
1051 int inflight;
1052
1053 page_pool_scrub(pool);
1054 inflight = page_pool_inflight(pool, true);
1055 if (!inflight)
1056 __page_pool_destroy(pool);
1057
1058 return inflight;
1059}
1060
1061static void page_pool_release_retry(struct work_struct *wq)
1062{
1063 struct delayed_work *dwq = to_delayed_work(wq);
1064 struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
1065 void *netdev;
1066 int inflight;
1067
1068 inflight = page_pool_release(pool);
1069 if (!inflight)
1070 return;
1071
1072 /* Periodic warning for page pools the user can't see */
1073 netdev = READ_ONCE(pool->slow.netdev);
1074 if (time_after_eq(jiffies, pool->defer_warn) &&
1075 (!netdev || netdev == NET_PTR_POISON)) {
1076 int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
1077
1078 pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
1079 __func__, pool->user.id, inflight, sec);
1080 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
1081 }
1082
1083 /* Still not ready to be disconnected, retry later */
1084 schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1085}
1086
1087void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
1088 const struct xdp_mem_info *mem)
1089{
1090 refcount_inc(&pool->user_cnt);
1091 pool->disconnect = disconnect;
1092 pool->xdp_mem_id = mem->id;
1093}
1094
1095void page_pool_disable_direct_recycling(struct page_pool *pool)
1096{
1097 /* Disable direct recycling based on pool->cpuid.
1098 * Paired with READ_ONCE() in page_pool_napi_local().
1099 */
1100 WRITE_ONCE(pool->cpuid, -1);
1101
1102 if (!pool->p.napi)
1103 return;
1104
1105 /* To avoid races with recycling and additional barriers make sure
1106 * pool and NAPI are unlinked when NAPI is disabled.
1107 */
1108 WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
1109 WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
1110
1111 mutex_lock(&page_pools_lock);
1112 WRITE_ONCE(pool->p.napi, NULL);
1113 mutex_unlock(&page_pools_lock);
1114}
1115EXPORT_SYMBOL(page_pool_disable_direct_recycling);
1116
1117void page_pool_destroy(struct page_pool *pool)
1118{
1119 if (!pool)
1120 return;
1121
1122 if (!page_pool_put(pool))
1123 return;
1124
1125 page_pool_disable_direct_recycling(pool);
1126 page_pool_free_frag(pool);
1127
1128 if (!page_pool_release(pool))
1129 return;
1130
1131 page_pool_detached(pool);
1132 pool->defer_start = jiffies;
1133 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
1134
1135 INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
1136 schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1137}
1138EXPORT_SYMBOL(page_pool_destroy);
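/* Editorial example: a hedged sketch of driver teardown ordering. NAPI is
 * disabled first so no direct recycling can race, the XDP memory model is
 * unregistered (which drops the reference taken in page_pool_use_xdp_mem()),
 * and page_pool_destroy() then waits for in-flight pages via the deferred
 * work above. rxq is a placeholder.
 *
 *	napi_disable(&rxq->napi);
 *	xdp_rxq_info_unreg(&rxq->xdp_rxq);
 *	page_pool_destroy(rxq->page_pool);
 *	rxq->page_pool = NULL;
 */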
1139
1140/* Caller must provide appropriate safe context, e.g. NAPI. */
1141void page_pool_update_nid(struct page_pool *pool, int new_nid)
1142{
1143 netmem_ref netmem;
1144
1145 trace_page_pool_update_nid(pool, new_nid);
1146 pool->p.nid = new_nid;
1147
1148 /* Flush pool alloc cache, as refill will check NUMA node */
1149 while (pool->alloc.count) {
1150 netmem = pool->alloc.cache[--pool->alloc.count];
1151 page_pool_return_page(pool, netmem);
1152 }
1153}
1154EXPORT_SYMBOL(page_pool_update_nid);