page_pool.c - net/core/page_pool.c - Linux diff v6.13.7

   1/* SPDX-License-Identifier: GPL-2.0
   2 *
   3 * page_pool.c
   4 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
   5 *	Copyright (C) 2016 Red Hat, Inc.
   6 */
   7
   8#include <linux/error-injection.h>
   9#include <linux/types.h>
  10#include <linux/kernel.h>
  11#include <linux/slab.h>
  12#include <linux/device.h>
  13
  14#include <net/netdev_rx_queue.h>
  15#include <net/page_pool/helpers.h>
  16#include <net/xdp.h>
  17
  18#include <linux/dma-direction.h>
  19#include <linux/dma-mapping.h>
  20#include <linux/page-flags.h>
  21#include <linux/mm.h> /* for put_page() */
  22#include <linux/poison.h>
  23#include <linux/ethtool.h>
  24#include <linux/netdevice.h>
  25
  26#include <trace/events/page_pool.h>
  27
  28#include "mp_dmabuf_devmem.h"
  29#include "netmem_priv.h"
  30#include "page_pool_priv.h"
  31
  32DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
  33
  34#define DEFER_TIME (msecs_to_jiffies(1000))
  35#define DEFER_WARN_INTERVAL (60 * HZ)
  36
  37#define BIAS_MAX	(LONG_MAX >> 1)
  38
  39#ifdef CONFIG_PAGE_POOL_STATS
  40static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
  41
  42/* alloc_stat_inc is intended to be used in softirq context */
  43#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
  44/* recycle_stat_inc is safe to use when preemption is possible. */
  45#define recycle_stat_inc(pool, __stat)							\
  46	do {										\
  47		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
  48		this_cpu_inc(s->__stat);						\
  49	} while (0)
  50
  51#define recycle_stat_add(pool, __stat, val)						\
  52	do {										\
  53		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
  54		this_cpu_add(s->__stat, val);						\
  55	} while (0)
  56
  57static const char pp_stats[][ETH_GSTRING_LEN] = {
  58	"rx_pp_alloc_fast",
  59	"rx_pp_alloc_slow",
  60	"rx_pp_alloc_slow_ho",
  61	"rx_pp_alloc_empty",
  62	"rx_pp_alloc_refill",
  63	"rx_pp_alloc_waive",
  64	"rx_pp_recycle_cached",
  65	"rx_pp_recycle_cache_full",
  66	"rx_pp_recycle_ring",
  67	"rx_pp_recycle_ring_full",
  68	"rx_pp_recycle_released_ref",
  69};
  70
  71/**
  72 * page_pool_get_stats() - fetch page pool stats
  73 * @pool:	pool from which page was allocated
  74 * @stats:	struct page_pool_stats to fill in
  75 *
  76 * Retrieve statistics about the page_pool. This API is only available
  77 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
  78 * A pointer to a caller allocated struct page_pool_stats structure
  79 * is passed to this API which is filled in. The caller can then report
  80 * those stats to the user (perhaps via ethtool, debugfs, etc.).
  81 */
  82bool page_pool_get_stats(const struct page_pool *pool,
  83			 struct page_pool_stats *stats)
  84{
  85	int cpu = 0;
  86
  87	if (!stats)
  88		return false;
  89
  90	/* The caller is responsible to initialize stats. */
  91	stats->alloc_stats.fast += pool->alloc_stats.fast;
  92	stats->alloc_stats.slow += pool->alloc_stats.slow;
  93	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
  94	stats->alloc_stats.empty += pool->alloc_stats.empty;
  95	stats->alloc_stats.refill += pool->alloc_stats.refill;
  96	stats->alloc_stats.waive += pool->alloc_stats.waive;
  97
  98	for_each_possible_cpu(cpu) {
  99		const struct page_pool_recycle_stats *pcpu =
 100			per_cpu_ptr(pool->recycle_stats, cpu);
 101
 102		stats->recycle_stats.cached += pcpu->cached;
 103		stats->recycle_stats.cache_full += pcpu->cache_full;
 104		stats->recycle_stats.ring += pcpu->ring;
 105		stats->recycle_stats.ring_full += pcpu->ring_full;
 106		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
 107	}
 108
 109	return true;
 110}
 111EXPORT_SYMBOL(page_pool_get_stats);
 112
 113u8 *page_pool_ethtool_stats_get_strings(u8 *data)
 114{
 115	int i;
 116
 117	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
 118		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
 119		data += ETH_GSTRING_LEN;
 120	}
 121
 122	return data;
 123}
 124EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
 125
 126int page_pool_ethtool_stats_get_count(void)
 127{
 128	return ARRAY_SIZE(pp_stats);
 129}
 130EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
 131
 132u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
 133{
 134	const struct page_pool_stats *pool_stats = stats;
 135
 136	*data++ = pool_stats->alloc_stats.fast;
 137	*data++ = pool_stats->alloc_stats.slow;
 138	*data++ = pool_stats->alloc_stats.slow_high_order;
 139	*data++ = pool_stats->alloc_stats.empty;
 140	*data++ = pool_stats->alloc_stats.refill;
 141	*data++ = pool_stats->alloc_stats.waive;
 142	*data++ = pool_stats->recycle_stats.cached;
 143	*data++ = pool_stats->recycle_stats.cache_full;
 144	*data++ = pool_stats->recycle_stats.ring;
 145	*data++ = pool_stats->recycle_stats.ring_full;
 146	*data++ = pool_stats->recycle_stats.released_refcnt;
 147
 148	return data;
 149}
 150EXPORT_SYMBOL(page_pool_ethtool_stats_get);
 151
 152#else
 153#define alloc_stat_inc(pool, __stat)
 154#define recycle_stat_inc(pool, __stat)
 155#define recycle_stat_add(pool, __stat, val)
 156#endif
 157
 158static bool page_pool_producer_lock(struct page_pool *pool)
 159	__acquires(&pool->ring.producer_lock)
 160{
 161	bool in_softirq = in_softirq();
 162
 163	if (in_softirq)
 164		spin_lock(&pool->ring.producer_lock);
 165	else
 166		spin_lock_bh(&pool->ring.producer_lock);
 167
 168	return in_softirq;
 169}
 170
 171static void page_pool_producer_unlock(struct page_pool *pool,
 172				      bool in_softirq)
 173	__releases(&pool->ring.producer_lock)
 174{
 175	if (in_softirq)
 176		spin_unlock(&pool->ring.producer_lock);
 177	else
 178		spin_unlock_bh(&pool->ring.producer_lock);
 179}
 180
 181static void page_pool_struct_check(void)
 182{
 183	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
 184	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
 185	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
 186	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
 187				    PAGE_POOL_FRAG_GROUP_ALIGN);
 188}
 189
 190static int page_pool_init(struct page_pool *pool,
 191			  const struct page_pool_params *params,
 192			  int cpuid)
 193{
 194	unsigned int ring_qsize = 1024; /* Default */
 195	struct netdev_rx_queue *rxq;
 196	int err;
 197
 198	page_pool_struct_check();
 199
 200	memcpy(&pool->p, &params->fast, sizeof(pool->p));
 201	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
 202
 203	pool->cpuid = cpuid;
 204
 205	/* Validate only known flags were used */
 206	if (pool->slow.flags & ~PP_FLAG_ALL)
 207		return -EINVAL;
 208
 209	if (pool->p.pool_size)
 210		ring_qsize = pool->p.pool_size;
 211
 212	/* Sanity limit mem that can be pinned down */
 213	if (ring_qsize > 32768)
 214		return -E2BIG;
 215
 216	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
 217	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
 218	 * which is the XDP_TX use-case.
 219	 */
 220	if (pool->slow.flags & PP_FLAG_DMA_MAP) {
 221		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
 222		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
 223			return -EINVAL;
 224
 225		pool->dma_map = true;
 226	}
 227
 228	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
 229		/* In order to request DMA-sync-for-device the page
 230		 * needs to be mapped
 231		 */
 232		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
 233			return -EINVAL;
 234
 235		if (!pool->p.max_len)
 236			return -EINVAL;
 237
 238		pool->dma_sync = true;
 239
 240		/* pool->p.offset has to be set according to the address
 241		 * offset used by the DMA engine to start copying rx data
 242		 */
 243	}
 244
 245	pool->has_init_callback = !!pool->slow.init_callback;
 246
 247#ifdef CONFIG_PAGE_POOL_STATS
 248	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
 249		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
 250		if (!pool->recycle_stats)
 251			return -ENOMEM;
 252	} else {
 253		/* For system page pool instance we use a singular stats object
 254		 * instead of allocating a separate percpu variable for each
 255		 * (also percpu) page pool instance.
 256		 */
 257		pool->recycle_stats = &pp_system_recycle_stats;
 258		pool->system = true;
 259	}
 260#endif
 261
 262	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
 263#ifdef CONFIG_PAGE_POOL_STATS
 264		if (!pool->system)
 265			free_percpu(pool->recycle_stats);
 266#endif
 267		return -ENOMEM;
 268	}
 269
 270	atomic_set(&pool->pages_state_release_cnt, 0);
 271
 272	/* Driver calling page_pool_create() also call page_pool_destroy() */
 273	refcount_set(&pool->user_cnt, 1);
 274
 275	if (pool->dma_map)
 276		get_device(pool->p.dev);
 277
 278	if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
 279		/* We rely on rtnl_lock()ing to make sure netdev_rx_queue
 280		 * configuration doesn't change while we're initializing
 281		 * the page_pool.
 282		 */
 283		ASSERT_RTNL();
 284		rxq = __netif_get_rx_queue(pool->slow.netdev,
 285					   pool->slow.queue_idx);
 286		pool->mp_priv = rxq->mp_params.mp_priv;
 287	}
 288
 289	if (pool->mp_priv) {
 290		err = mp_dmabuf_devmem_init(pool);
 291		if (err) {
 292			pr_warn("%s() mem-provider init failed %d\n", __func__,
 293				err);
 294			goto free_ptr_ring;
 295		}
 296
 297		static_branch_inc(&page_pool_mem_providers);
 298	}
 299
 300	return 0;
 301
 302free_ptr_ring:
 303	ptr_ring_cleanup(&pool->ring, NULL);
 304#ifdef CONFIG_PAGE_POOL_STATS
 305	if (!pool->system)
 306		free_percpu(pool->recycle_stats);
 307#endif
 308	return err;
 309}
 310
 311static void page_pool_uninit(struct page_pool *pool)
 312{
 313	ptr_ring_cleanup(&pool->ring, NULL);
 314
 315	if (pool->dma_map)
 316		put_device(pool->p.dev);
 317
 318#ifdef CONFIG_PAGE_POOL_STATS
 319	if (!pool->system)
 320		free_percpu(pool->recycle_stats);
 321#endif
 322}
 323
 324/**
 325 * page_pool_create_percpu() - create a page pool for a given cpu.
 326 * @params: parameters, see struct page_pool_params
 327 * @cpuid: cpu identifier
 328 */
 329struct page_pool *
 330page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
 331{
 332	struct page_pool *pool;
 333	int err;
 334
 335	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
 336	if (!pool)
 337		return ERR_PTR(-ENOMEM);
 338
 339	err = page_pool_init(pool, params, cpuid);
 340	if (err < 0)
 341		goto err_free;
 342
 343	err = page_pool_list(pool);
 344	if (err)
 345		goto err_uninit;
 346
 347	return pool;
 348
 349err_uninit:
 350	page_pool_uninit(pool);
 351err_free:
 352	pr_warn("%s() gave up with errno %d\n", __func__, err);
 353	kfree(pool);
 354	return ERR_PTR(err);
 355}
 356EXPORT_SYMBOL(page_pool_create_percpu);
 357
 358/**
 359 * page_pool_create() - create a page pool
 360 * @params: parameters, see struct page_pool_params
 361 */
 362struct page_pool *page_pool_create(const struct page_pool_params *params)
 363{
 364	return page_pool_create_percpu(params, -1);
 365}
 366EXPORT_SYMBOL(page_pool_create);
 367
 368static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
 369
 370static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
 
 371{
 372	struct ptr_ring *r = &pool->ring;
 373	netmem_ref netmem;
 374	int pref_nid; /* preferred NUMA node */
 375
 376	/* Quicker fallback, avoid locks when ring is empty */
 377	if (__ptr_ring_empty(r)) {
 378		alloc_stat_inc(pool, empty);
 379		return 0;
 380	}
 381
 382	/* Softirq guarantee CPU and thus NUMA node is stable. This,
 383	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
 384	 */
 385#ifdef CONFIG_NUMA
 386	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
 387#else
 388	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
 389	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
 390#endif
 391
 392	/* Refill alloc array, but only if NUMA match */
 393	do {
 394		netmem = (__force netmem_ref)__ptr_ring_consume(r);
 395		if (unlikely(!netmem))
 396			break;
 397
 398		if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
 399			pool->alloc.cache[pool->alloc.count++] = netmem;
 400		} else {
 401			/* NUMA mismatch;
 402			 * (1) release 1 page to page-allocator and
 403			 * (2) break out to fallthrough to alloc_pages_node.
 404			 * This limit stress on page buddy alloactor.
 405			 */
 406			page_pool_return_page(pool, netmem);
 407			alloc_stat_inc(pool, waive);
 408			netmem = 0;
 409			break;
 410		}
 411	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
 412
 413	/* Return last page */
 414	if (likely(pool->alloc.count > 0)) {
 415		netmem = pool->alloc.cache[--pool->alloc.count];
 416		alloc_stat_inc(pool, refill);
 417	}
 418
 419	return netmem;
 420}
 421
 422/* fast path */
 423static netmem_ref __page_pool_get_cached(struct page_pool *pool)
 424{
 425	netmem_ref netmem;
 426
 427	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
 428	if (likely(pool->alloc.count)) {
 429		/* Fast-path */
 430		netmem = pool->alloc.cache[--pool->alloc.count];
 431		alloc_stat_inc(pool, fast);
 432	} else {
 433		netmem = page_pool_refill_alloc_cache(pool);
 434	}
 435
 436	return netmem;
 437}
 438
 439static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
 440					    netmem_ref netmem,
 441					    u32 dma_sync_size)
 442{
 443#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
 444	dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
 445
 446	dma_sync_size = min(dma_sync_size, pool->p.max_len);
 447	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
 448				     dma_sync_size, pool->p.dma_dir);
 449#endif
 450}
 451
 452static __always_inline void
 453page_pool_dma_sync_for_device(const struct page_pool *pool,
 454			      netmem_ref netmem,
 455			      u32 dma_sync_size)
 456{
 457	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
 458		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
 459}
 460
 461static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
 462{
 463	dma_addr_t dma;
 464
 465	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
 466	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
 467	 * into page private data (i.e 32bit cpu with 64bit DMA caps)
 468	 * This mapping is kept for lifetime of page, until leaving pool.
 469	 */
 470	dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
 471				 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
 472				 DMA_ATTR_SKIP_CPU_SYNC |
 473					 DMA_ATTR_WEAK_ORDERING);
 474	if (dma_mapping_error(pool->p.dev, dma))
 475		return false;
 476
 477	if (page_pool_set_dma_addr_netmem(netmem, dma))
 478		goto unmap_failed;
 479
 480	page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
 
 481
 482	return true;
 483
 484unmap_failed:
 485	WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
 486	dma_unmap_page_attrs(pool->p.dev, dma,
 487			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 488			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
 489	return false;
 490}
 491
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 492static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 493						 gfp_t gfp)
 494{
 495	struct page *page;
 496
 497	gfp |= __GFP_COMP;
 498	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
 499	if (unlikely(!page))
 500		return NULL;
 501
 502	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
 
 503		put_page(page);
 504		return NULL;
 505	}
 506
 507	alloc_stat_inc(pool, slow_high_order);
 508	page_pool_set_pp_info(pool, page_to_netmem(page));
 509
 510	/* Track how many pages are held 'in-flight' */
 511	pool->pages_state_hold_cnt++;
 512	trace_page_pool_state_hold(pool, page_to_netmem(page),
 513				   pool->pages_state_hold_cnt);
 514	return page;
 515}
 516
 517/* slow path */
 518static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
 519							gfp_t gfp)
 
 520{
 521	const int bulk = PP_ALLOC_CACHE_REFILL;
 
 522	unsigned int pp_order = pool->p.order;
 523	bool dma_map = pool->dma_map;
 524	netmem_ref netmem;
 525	int i, nr_pages;
 526
 527	/* Don't support bulk alloc for high-order pages */
 528	if (unlikely(pp_order))
 529		return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
 530
 531	/* Unnecessary as alloc cache is empty, but guarantees zero count */
 532	if (unlikely(pool->alloc.count > 0))
 533		return pool->alloc.cache[--pool->alloc.count];
 534
 535	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
 536	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
 537
 538	nr_pages = alloc_pages_bulk_array_node(gfp,
 539					       pool->p.nid, bulk,
 540					       (struct page **)pool->alloc.cache);
 541	if (unlikely(!nr_pages))
 542		return 0;
 543
 544	/* Pages have been filled into alloc.cache array, but count is zero and
 545	 * page element have not been (possibly) DMA mapped.
 546	 */
 547	for (i = 0; i < nr_pages; i++) {
 548		netmem = pool->alloc.cache[i];
 549		if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
 550			put_page(netmem_to_page(netmem));
 
 551			continue;
 552		}
 553
 554		page_pool_set_pp_info(pool, netmem);
 555		pool->alloc.cache[pool->alloc.count++] = netmem;
 556		/* Track how many pages are held 'in-flight' */
 557		pool->pages_state_hold_cnt++;
 558		trace_page_pool_state_hold(pool, netmem,
 559					   pool->pages_state_hold_cnt);
 560	}
 561
 562	/* Return last page */
 563	if (likely(pool->alloc.count > 0)) {
 564		netmem = pool->alloc.cache[--pool->alloc.count];
 565		alloc_stat_inc(pool, slow);
 566	} else {
 567		netmem = 0;
 568	}
 569
 570	/* When page just alloc'ed is should/must have refcnt 1. */
 571	return netmem;
 572}
 573
 574/* For using page_pool replace: alloc_pages() API calls, but provide
 575 * synchronization guarantee for allocation side.
 576 */
 577netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
 578{
 579	netmem_ref netmem;
 580
 581	/* Fast-path: Get a page from cache */
 582	netmem = __page_pool_get_cached(pool);
 583	if (netmem)
 584		return netmem;
 585
 586	/* Slow-path: cache empty, do real allocation */
 587	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
 588		netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
 589	else
 590		netmem = __page_pool_alloc_pages_slow(pool, gfp);
 591	return netmem;
 592}
 593EXPORT_SYMBOL(page_pool_alloc_netmem);
 594
 595struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 596{
 597	return netmem_to_page(page_pool_alloc_netmem(pool, gfp));
 598}
 599EXPORT_SYMBOL(page_pool_alloc_pages);
 600ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
 601
 602/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 603 *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 604 */
 605#define _distance(a, b)	(s32)((a) - (b))
 606
 607s32 page_pool_inflight(const struct page_pool *pool, bool strict)
 608{
 609	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
 610	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
 611	s32 inflight;
 612
 613	inflight = _distance(hold_cnt, release_cnt);
 614
 615	if (strict) {
 616		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
 617		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
 618		     inflight);
 619	} else {
 620		inflight = max(0, inflight);
 621	}
 622
 623	return inflight;
 624}
 625
 626void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
 627{
 628	netmem_set_pp(netmem, pool);
 629	netmem_or_pp_magic(netmem, PP_SIGNATURE);
 630
 631	/* Ensuring all pages have been split into one fragment initially:
 632	 * page_pool_set_pp_info() is only called once for every page when it
 633	 * is allocated from the page allocator and page_pool_fragment_page()
 634	 * is dirtying the same cache line as the page->pp_magic above, so
 635	 * the overhead is negligible.
 636	 */
 637	page_pool_fragment_netmem(netmem, 1);
 638	if (pool->has_init_callback)
 639		pool->slow.init_callback(netmem, pool->slow.init_arg);
 640}
 641
 642void page_pool_clear_pp_info(netmem_ref netmem)
 643{
 644	netmem_clear_pp_magic(netmem);
 645	netmem_set_pp(netmem, NULL);
 646}
 647
 648static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
 649							 netmem_ref netmem)
 650{
 651	dma_addr_t dma;
 652
 653	if (!pool->dma_map)
 654		/* Always account for inflight pages, even if we didn't
 655		 * map them
 656		 */
 657		return;
 658
 659	dma = page_pool_get_dma_addr_netmem(netmem);
 660
 661	/* When page is unmapped, it cannot be returned to our pool */
 662	dma_unmap_page_attrs(pool->p.dev, dma,
 663			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 664			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
 665	page_pool_set_dma_addr_netmem(netmem, 0);
 666}
 667
 668/* Disconnects a page (from a page_pool).  API users can have a need
 669 * to disconnect a page (from a page_pool), to allow it to be used as
 670 * a regular page (that will eventually be returned to the normal
 671 * page-allocator via put_page).
 672 */
 673void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
 674{
 675	int count;
 676	bool put;
 677
 678	put = true;
 679	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
 680		put = mp_dmabuf_devmem_release_page(pool, netmem);
 681	else
 682		__page_pool_release_page_dma(pool, netmem);
 683
 684	/* This may be the last page returned, releasing the pool, so
 685	 * it is not safe to reference pool afterwards.
 686	 */
 687	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
 688	trace_page_pool_state_release(pool, netmem, count);
 689
 690	if (put) {
 691		page_pool_clear_pp_info(netmem);
 692		put_page(netmem_to_page(netmem));
 693	}
 694	/* An optimization would be to call __free_pages(page, pool->p.order)
 695	 * knowing page is not part of page-cache (thus avoiding a
 696	 * __page_cache_release() call).
 697	 */
 698}
 699
 700static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
 701{
 702	int ret;
 703	/* BH protection not needed if current is softirq */
 704	if (in_softirq())
 705		ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
 706	else
 707		ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);
 708
 709	if (!ret) {
 710		recycle_stat_inc(pool, ring);
 711		return true;
 712	}
 713
 714	return false;
 715}
 716
 717/* Only allow direct recycling in special circumstances, into the
 718 * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
 719 *
 720 * Caller must provide appropriate safe context.
 721 */
 722static bool page_pool_recycle_in_cache(netmem_ref netmem,
 723				       struct page_pool *pool)
 724{
 725	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
 726		recycle_stat_inc(pool, cache_full);
 727		return false;
 728	}
 729
 730	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
 731	pool->alloc.cache[pool->alloc.count++] = netmem;
 732	recycle_stat_inc(pool, cached);
 733	return true;
 734}
 735
 736static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
 737{
 738	return netmem_is_net_iov(netmem) ||
 739	       (page_ref_count(netmem_to_page(netmem)) == 1 &&
 740		!page_is_pfmemalloc(netmem_to_page(netmem)));
 741}
 742
 743/* If the page refcnt == 1, this will try to recycle the page.
 744 * If pool->dma_sync is set, we'll try to sync the DMA area for
 745 * the configured size min(dma_sync_size, pool->max_len).
 746 * If the page refcnt != 1, then the page will be returned to memory
 747 * subsystem.
 748 */
 749static __always_inline netmem_ref
 750__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
 751		     unsigned int dma_sync_size, bool allow_direct)
 752{
 753	lockdep_assert_no_hardirq();
 754
 755	/* This allocator is optimized for the XDP mode that uses
 756	 * one-frame-per-page, but have fallbacks that act like the
 757	 * regular page allocator APIs.
 758	 *
 759	 * refcnt == 1 means page_pool owns page, and can recycle it.
 760	 *
 761	 * page is NOT reusable when allocated when system is under
 762	 * some pressure. (page_is_pfmemalloc)
 763	 */
 764	if (likely(__page_pool_page_can_be_recycled(netmem))) {
 765		/* Read barrier done in page_ref_count / READ_ONCE */
 766
 767		page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
 768
 769		if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
 770			return 0;
 
 
 
 771
 772		/* Page found as candidate for recycling */
 773		return netmem;
 774	}
 775
 776	/* Fallback/non-XDP mode: API user have elevated refcnt.
 777	 *
 778	 * Many drivers split up the page into fragments, and some
 779	 * want to keep doing this to save memory and do refcnt based
 780	 * recycling. Support this use case too, to ease drivers
 781	 * switching between XDP/non-XDP.
 782	 *
 783	 * In-case page_pool maintains the DMA mapping, API user must
 784	 * call page_pool_put_page once.  In this elevated refcnt
 785	 * case, the DMA is unmapped/released, as driver is likely
 786	 * doing refcnt based recycle tricks, meaning another process
 787	 * will be invoking put_page.
 788	 */
 789	recycle_stat_inc(pool, released_refcnt);
 790	page_pool_return_page(pool, netmem);
 791
 792	return 0;
 793}
 794
 795static bool page_pool_napi_local(const struct page_pool *pool)
 796{
 797	const struct napi_struct *napi;
 798	u32 cpuid;
 799
 800	if (unlikely(!in_softirq()))
 801		return false;
 802
 803	/* Allow direct recycle if we have reasons to believe that we are
 804	 * in the same context as the consumer would run, so there's
 805	 * no possible race.
 806	 * __page_pool_put_page() makes sure we're not in hardirq context
 807	 * and interrupts are enabled prior to accessing the cache.
 808	 */
 809	cpuid = smp_processor_id();
 810	if (READ_ONCE(pool->cpuid) == cpuid)
 811		return true;
 812
 813	napi = READ_ONCE(pool->p.napi);
 814
 815	return napi && READ_ONCE(napi->list_owner) == cpuid;
 816}
 817
 818void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
 819				  unsigned int dma_sync_size, bool allow_direct)
 820{
 821	if (!allow_direct)
 822		allow_direct = page_pool_napi_local(pool);
 823
 824	netmem =
 825		__page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
 826	if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
 827		/* Cache full, fallback to free pages */
 828		recycle_stat_inc(pool, ring_full);
 829		page_pool_return_page(pool, netmem);
 830	}
 831}
 832EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
 833
 834void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
 835				unsigned int dma_sync_size, bool allow_direct)
 836{
 837	page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
 838				     allow_direct);
 839}
 840EXPORT_SYMBOL(page_pool_put_unrefed_page);
 841
 842/**
 843 * page_pool_put_page_bulk() - release references on multiple pages
 844 * @pool:	pool from which pages were allocated
 845 * @data:	array holding page pointers
 846 * @count:	number of pages in @data
 847 *
 848 * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
 849 * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
 850 * will release leftover pages to the page allocator.
 851 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
 852 * completion loop for the XDP_REDIRECT use case.
 853 *
 854 * Please note the caller must not use data area after running
 855 * page_pool_put_page_bulk(), as this function overwrites it.
 856 */
 857void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 858			     int count)
 859{
 860	int i, bulk_len = 0;
 861	bool allow_direct;
 862	bool in_softirq;
 863
 864	allow_direct = page_pool_napi_local(pool);
 865
 866	for (i = 0; i < count; i++) {
 867		netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i]));
 868
 869		/* It is not the last user for the page frag case */
 870		if (!page_pool_is_last_ref(netmem))
 871			continue;
 872
 873		netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
 874		/* Approved for bulk recycling in ptr_ring cache */
 875		if (netmem)
 876			data[bulk_len++] = (__force void *)netmem;
 877	}
 878
 879	if (!bulk_len)
 880		return;
 881
 882	/* Bulk producer into ptr_ring page_pool cache */
 883	in_softirq = page_pool_producer_lock(pool);
 884	for (i = 0; i < bulk_len; i++) {
 885		if (__ptr_ring_produce(&pool->ring, data[i])) {
 886			/* ring full */
 887			recycle_stat_inc(pool, ring_full);
 888			break;
 889		}
 890	}
 891	recycle_stat_add(pool, ring, i);
 892	page_pool_producer_unlock(pool, in_softirq);
 893
 894	/* Hopefully all pages was return into ptr_ring */
 895	if (likely(i == bulk_len))
 896		return;
 897
 898	/* ptr_ring cache full, free remaining pages outside producer lock
 899	 * since put_page() with refcnt == 1 can be an expensive operation
 900	 */
 901	for (; i < bulk_len; i++)
 902		page_pool_return_page(pool, (__force netmem_ref)data[i]);
 903}
 904EXPORT_SYMBOL(page_pool_put_page_bulk);
 905
 906static netmem_ref page_pool_drain_frag(struct page_pool *pool,
 907				       netmem_ref netmem)
 908{
 909	long drain_count = BIAS_MAX - pool->frag_users;
 910
 911	/* Some user is still using the page frag */
 912	if (likely(page_pool_unref_netmem(netmem, drain_count)))
 913		return 0;
 
 
 
 
 914
 915	if (__page_pool_page_can_be_recycled(netmem)) {
 916		page_pool_dma_sync_for_device(pool, netmem, -1);
 917		return netmem;
 918	}
 919
 920	page_pool_return_page(pool, netmem);
 921	return 0;
 922}
 923
 924static void page_pool_free_frag(struct page_pool *pool)
 925{
 926	long drain_count = BIAS_MAX - pool->frag_users;
 927	netmem_ref netmem = pool->frag_page;
 928
 929	pool->frag_page = 0;
 930
 931	if (!netmem || page_pool_unref_netmem(netmem, drain_count))
 932		return;
 933
 934	page_pool_return_page(pool, netmem);
 935}
 936
 937netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
 938				       unsigned int *offset, unsigned int size,
 939				       gfp_t gfp)
 940{
 941	unsigned int max_size = PAGE_SIZE << pool->p.order;
 942	netmem_ref netmem = pool->frag_page;
 943
 944	if (WARN_ON(size > max_size))
 945		return 0;
 946
 947	size = ALIGN(size, dma_get_cache_alignment());
 948	*offset = pool->frag_offset;
 949
 950	if (netmem && *offset + size > max_size) {
 951		netmem = page_pool_drain_frag(pool, netmem);
 952		if (netmem) {
 953			recycle_stat_inc(pool, cached);
 954			alloc_stat_inc(pool, fast);
 955			goto frag_reset;
 956		}
 957	}
 958
 959	if (!netmem) {
 960		netmem = page_pool_alloc_netmem(pool, gfp);
 961		if (unlikely(!netmem)) {
 962			pool->frag_page = 0;
 963			return 0;
 964		}
 965
 966		pool->frag_page = netmem;
 967
 968frag_reset:
 969		pool->frag_users = 1;
 970		*offset = 0;
 971		pool->frag_offset = size;
 972		page_pool_fragment_netmem(netmem, BIAS_MAX);
 973		return netmem;
 974	}
 975
 976	pool->frag_users++;
 977	pool->frag_offset = *offset + size;
 978	return netmem;
 979}
 980EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
 981
 982struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
 983				  unsigned int size, gfp_t gfp)
 984{
 985	return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
 986							  gfp));
 987}
 988EXPORT_SYMBOL(page_pool_alloc_frag);
 989
 990static void page_pool_empty_ring(struct page_pool *pool)
 991{
 992	netmem_ref netmem;
 993
 994	/* Empty recycle ring */
 995	while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
 996		/* Verify the refcnt invariant of cached pages */
 997		if (!(netmem_ref_count(netmem) == 1))
 998			pr_crit("%s() page_pool refcnt %d violation\n",
 999				__func__, netmem_ref_count(netmem));
1000
1001		page_pool_return_page(pool, netmem);
1002	}
1003}
1004
1005static void __page_pool_destroy(struct page_pool *pool)
1006{
1007	if (pool->disconnect)
1008		pool->disconnect(pool);
1009
1010	page_pool_unlist(pool);
1011	page_pool_uninit(pool);
1012
1013	if (pool->mp_priv) {
1014		mp_dmabuf_devmem_destroy(pool);
1015		static_branch_dec(&page_pool_mem_providers);
1016	}
1017
1018	kfree(pool);
1019}
1020
1021static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
1022{
1023	netmem_ref netmem;
1024
1025	if (pool->destroy_cnt)
1026		return;
1027
1028	/* Empty alloc cache, assume caller made sure this is
1029	 * no-longer in use, and page_pool_alloc_pages() cannot be
1030	 * call concurrently.
1031	 */
1032	while (pool->alloc.count) {
1033		netmem = pool->alloc.cache[--pool->alloc.count];
1034		page_pool_return_page(pool, netmem);
1035	}
1036}
1037
1038static void page_pool_scrub(struct page_pool *pool)
1039{
1040	page_pool_empty_alloc_cache_once(pool);
1041	pool->destroy_cnt++;
1042
1043	/* No more consumers should exist, but producers could still
1044	 * be in-flight.
1045	 */
1046	page_pool_empty_ring(pool);
1047}
1048
1049static int page_pool_release(struct page_pool *pool)
1050{
1051	int inflight;
1052
1053	page_pool_scrub(pool);
1054	inflight = page_pool_inflight(pool, true);
1055	if (!inflight)
1056		__page_pool_destroy(pool);
1057
1058	return inflight;
1059}
1060
1061static void page_pool_release_retry(struct work_struct *wq)
1062{
1063	struct delayed_work *dwq = to_delayed_work(wq);
1064	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
1065	void *netdev;
1066	int inflight;
1067
1068	inflight = page_pool_release(pool);
1069	if (!inflight)
1070		return;
1071
1072	/* Periodic warning for page pools the user can't see */
1073	netdev = READ_ONCE(pool->slow.netdev);
1074	if (time_after_eq(jiffies, pool->defer_warn) &&
1075	    (!netdev || netdev == NET_PTR_POISON)) {
1076		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
1077
1078		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
1079			__func__, pool->user.id, inflight, sec);
1080		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
1081	}
1082
1083	/* Still not ready to be disconnected, retry later */
1084	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1085}
1086
1087void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
1088			   const struct xdp_mem_info *mem)
1089{
1090	refcount_inc(&pool->user_cnt);
1091	pool->disconnect = disconnect;
1092	pool->xdp_mem_id = mem->id;
1093}
1094
1095void page_pool_disable_direct_recycling(struct page_pool *pool)
1096{
1097	/* Disable direct recycling based on pool->cpuid.
1098	 * Paired with READ_ONCE() in page_pool_napi_local().
1099	 */
1100	WRITE_ONCE(pool->cpuid, -1);
1101
1102	if (!pool->p.napi)
1103		return;
1104
1105	/* To avoid races with recycling and additional barriers make sure
1106	 * pool and NAPI are unlinked when NAPI is disabled.
1107	 */
1108	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
1109	WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
1110
1111	mutex_lock(&page_pools_lock);
1112	WRITE_ONCE(pool->p.napi, NULL);
1113	mutex_unlock(&page_pools_lock);
1114}
1115EXPORT_SYMBOL(page_pool_disable_direct_recycling);
1116
1117void page_pool_destroy(struct page_pool *pool)
1118{
1119	if (!pool)
1120		return;
1121
1122	if (!page_pool_put(pool))
1123		return;
1124
1125	page_pool_disable_direct_recycling(pool);
1126	page_pool_free_frag(pool);
1127
1128	if (!page_pool_release(pool))
1129		return;
1130
1131	page_pool_detached(pool);
1132	pool->defer_start = jiffies;
1133	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
1134
1135	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
1136	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1137}
1138EXPORT_SYMBOL(page_pool_destroy);
1139
1140/* Caller must provide appropriate safe context, e.g. NAPI. */
1141void page_pool_update_nid(struct page_pool *pool, int new_nid)
1142{
1143	netmem_ref netmem;
1144
1145	trace_page_pool_update_nid(pool, new_nid);
1146	pool->p.nid = new_nid;
1147
1148	/* Flush pool alloc cache, as refill will check NUMA node */
1149	while (pool->alloc.count) {
1150		netmem = pool->alloc.cache[--pool->alloc.count];
1151		page_pool_return_page(pool, netmem);
1152	}
1153}
1154EXPORT_SYMBOL(page_pool_update_nid);

   1/* SPDX-License-Identifier: GPL-2.0
   2 *
   3 * page_pool.c
   4 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
   5 *	Copyright (C) 2016 Red Hat, Inc.
   6 */
   7
 
   8#include <linux/types.h>
   9#include <linux/kernel.h>
  10#include <linux/slab.h>
  11#include <linux/device.h>
  12
 
  13#include <net/page_pool/helpers.h>
  14#include <net/xdp.h>
  15
  16#include <linux/dma-direction.h>
  17#include <linux/dma-mapping.h>
  18#include <linux/page-flags.h>
  19#include <linux/mm.h> /* for put_page() */
  20#include <linux/poison.h>
  21#include <linux/ethtool.h>
  22#include <linux/netdevice.h>
  23
  24#include <trace/events/page_pool.h>
  25
 
 
  26#include "page_pool_priv.h"
  27
 
 
  28#define DEFER_TIME (msecs_to_jiffies(1000))
  29#define DEFER_WARN_INTERVAL (60 * HZ)
  30
  31#define BIAS_MAX	(LONG_MAX >> 1)
  32
  33#ifdef CONFIG_PAGE_POOL_STATS
  34static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
  35
  36/* alloc_stat_inc is intended to be used in softirq context */
  37#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
  38/* recycle_stat_inc is safe to use when preemption is possible. */
  39#define recycle_stat_inc(pool, __stat)							\
  40	do {										\
  41		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
  42		this_cpu_inc(s->__stat);						\
  43	} while (0)
  44
  45#define recycle_stat_add(pool, __stat, val)						\
  46	do {										\
  47		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
  48		this_cpu_add(s->__stat, val);						\
  49	} while (0)
  50
  51static const char pp_stats[][ETH_GSTRING_LEN] = {
  52	"rx_pp_alloc_fast",
  53	"rx_pp_alloc_slow",
  54	"rx_pp_alloc_slow_ho",
  55	"rx_pp_alloc_empty",
  56	"rx_pp_alloc_refill",
  57	"rx_pp_alloc_waive",
  58	"rx_pp_recycle_cached",
  59	"rx_pp_recycle_cache_full",
  60	"rx_pp_recycle_ring",
  61	"rx_pp_recycle_ring_full",
  62	"rx_pp_recycle_released_ref",
  63};
  64
  65/**
  66 * page_pool_get_stats() - fetch page pool stats
  67 * @pool:	pool from which page was allocated
  68 * @stats:	struct page_pool_stats to fill in
  69 *
  70 * Retrieve statistics about the page_pool. This API is only available
  71 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
  72 * A pointer to a caller allocated struct page_pool_stats structure
  73 * is passed to this API which is filled in. The caller can then report
  74 * those stats to the user (perhaps via ethtool, debugfs, etc.).
  75 */
  76bool page_pool_get_stats(const struct page_pool *pool,
  77			 struct page_pool_stats *stats)
  78{
  79	int cpu = 0;
  80
  81	if (!stats)
  82		return false;
  83
  84	/* The caller is responsible to initialize stats. */
  85	stats->alloc_stats.fast += pool->alloc_stats.fast;
  86	stats->alloc_stats.slow += pool->alloc_stats.slow;
  87	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
  88	stats->alloc_stats.empty += pool->alloc_stats.empty;
  89	stats->alloc_stats.refill += pool->alloc_stats.refill;
  90	stats->alloc_stats.waive += pool->alloc_stats.waive;
  91
  92	for_each_possible_cpu(cpu) {
  93		const struct page_pool_recycle_stats *pcpu =
  94			per_cpu_ptr(pool->recycle_stats, cpu);
  95
  96		stats->recycle_stats.cached += pcpu->cached;
  97		stats->recycle_stats.cache_full += pcpu->cache_full;
  98		stats->recycle_stats.ring += pcpu->ring;
  99		stats->recycle_stats.ring_full += pcpu->ring_full;
 100		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
 101	}
 102
 103	return true;
 104}
 105EXPORT_SYMBOL(page_pool_get_stats);
 106
 107u8 *page_pool_ethtool_stats_get_strings(u8 *data)
 108{
 109	int i;
 110
 111	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
 112		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
 113		data += ETH_GSTRING_LEN;
 114	}
 115
 116	return data;
 117}
 118EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
 119
 120int page_pool_ethtool_stats_get_count(void)
 121{
 122	return ARRAY_SIZE(pp_stats);
 123}
 124EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
 125
 126u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
 127{
 128	struct page_pool_stats *pool_stats = stats;
 129
 130	*data++ = pool_stats->alloc_stats.fast;
 131	*data++ = pool_stats->alloc_stats.slow;
 132	*data++ = pool_stats->alloc_stats.slow_high_order;
 133	*data++ = pool_stats->alloc_stats.empty;
 134	*data++ = pool_stats->alloc_stats.refill;
 135	*data++ = pool_stats->alloc_stats.waive;
 136	*data++ = pool_stats->recycle_stats.cached;
 137	*data++ = pool_stats->recycle_stats.cache_full;
 138	*data++ = pool_stats->recycle_stats.ring;
 139	*data++ = pool_stats->recycle_stats.ring_full;
 140	*data++ = pool_stats->recycle_stats.released_refcnt;
 141
 142	return data;
 143}
 144EXPORT_SYMBOL(page_pool_ethtool_stats_get);
 145
 146#else
 147#define alloc_stat_inc(pool, __stat)
 148#define recycle_stat_inc(pool, __stat)
 149#define recycle_stat_add(pool, __stat, val)
 150#endif
 151
 152static bool page_pool_producer_lock(struct page_pool *pool)
 153	__acquires(&pool->ring.producer_lock)
 154{
 155	bool in_softirq = in_softirq();
 156
 157	if (in_softirq)
 158		spin_lock(&pool->ring.producer_lock);
 159	else
 160		spin_lock_bh(&pool->ring.producer_lock);
 161
 162	return in_softirq;
 163}
 164
 165static void page_pool_producer_unlock(struct page_pool *pool,
 166				      bool in_softirq)
 167	__releases(&pool->ring.producer_lock)
 168{
 169	if (in_softirq)
 170		spin_unlock(&pool->ring.producer_lock);
 171	else
 172		spin_unlock_bh(&pool->ring.producer_lock);
 173}
 174
 
 
 
 
 
 
 
 
 
 175static int page_pool_init(struct page_pool *pool,
 176			  const struct page_pool_params *params,
 177			  int cpuid)
 178{
 179	unsigned int ring_qsize = 1024; /* Default */
 
 
 
 
 180
 181	memcpy(&pool->p, &params->fast, sizeof(pool->p));
 182	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
 183
 184	pool->cpuid = cpuid;
 185
 186	/* Validate only known flags were used */
 187	if (pool->p.flags & ~(PP_FLAG_ALL))
 188		return -EINVAL;
 189
 190	if (pool->p.pool_size)
 191		ring_qsize = pool->p.pool_size;
 192
 193	/* Sanity limit mem that can be pinned down */
 194	if (ring_qsize > 32768)
 195		return -E2BIG;
 196
 197	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
 198	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
 199	 * which is the XDP_TX use-case.
 200	 */
 201	if (pool->p.flags & PP_FLAG_DMA_MAP) {
 202		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
 203		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
 204			return -EINVAL;
 
 
 205	}
 206
 207	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
 208		/* In order to request DMA-sync-for-device the page
 209		 * needs to be mapped
 210		 */
 211		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
 212			return -EINVAL;
 213
 214		if (!pool->p.max_len)
 215			return -EINVAL;
 216
 
 
 217		/* pool->p.offset has to be set according to the address
 218		 * offset used by the DMA engine to start copying rx data
 219		 */
 220	}
 221
 222	pool->has_init_callback = !!pool->slow.init_callback;
 223
 224#ifdef CONFIG_PAGE_POOL_STATS
 225	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) {
 226		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
 227		if (!pool->recycle_stats)
 228			return -ENOMEM;
 229	} else {
 230		/* For system page pool instance we use a singular stats object
 231		 * instead of allocating a separate percpu variable for each
 232		 * (also percpu) page pool instance.
 233		 */
 234		pool->recycle_stats = &pp_system_recycle_stats;
 
 235	}
 236#endif
 237
 238	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
 239#ifdef CONFIG_PAGE_POOL_STATS
 240		if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
 241			free_percpu(pool->recycle_stats);
 242#endif
 243		return -ENOMEM;
 244	}
 245
 246	atomic_set(&pool->pages_state_release_cnt, 0);
 247
 248	/* Driver calling page_pool_create() also call page_pool_destroy() */
 249	refcount_set(&pool->user_cnt, 1);
 250
 251	if (pool->p.flags & PP_FLAG_DMA_MAP)
 252		get_device(pool->p.dev);
 253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 254	return 0;
 
 
 
 
 
 
 
 
 255}
 256
 257static void page_pool_uninit(struct page_pool *pool)
 258{
 259	ptr_ring_cleanup(&pool->ring, NULL);
 260
 261	if (pool->p.flags & PP_FLAG_DMA_MAP)
 262		put_device(pool->p.dev);
 263
 264#ifdef CONFIG_PAGE_POOL_STATS
 265	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
 266		free_percpu(pool->recycle_stats);
 267#endif
 268}
 269
 270/**
 271 * page_pool_create_percpu() - create a page pool for a given cpu.
 272 * @params: parameters, see struct page_pool_params
 273 * @cpuid: cpu identifier
 274 */
 275struct page_pool *
 276page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
 277{
 278	struct page_pool *pool;
 279	int err;
 280
 281	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
 282	if (!pool)
 283		return ERR_PTR(-ENOMEM);
 284
 285	err = page_pool_init(pool, params, cpuid);
 286	if (err < 0)
 287		goto err_free;
 288
 289	err = page_pool_list(pool);
 290	if (err)
 291		goto err_uninit;
 292
 293	return pool;
 294
 295err_uninit:
 296	page_pool_uninit(pool);
 297err_free:
 298	pr_warn("%s() gave up with errno %d\n", __func__, err);
 299	kfree(pool);
 300	return ERR_PTR(err);
 301}
 302EXPORT_SYMBOL(page_pool_create_percpu);
 303
 304/**
 305 * page_pool_create() - create a page pool
 306 * @params: parameters, see struct page_pool_params
 307 */
 308struct page_pool *page_pool_create(const struct page_pool_params *params)
 309{
 310	return page_pool_create_percpu(params, -1);
 311}
 312EXPORT_SYMBOL(page_pool_create);
 313
 314static void page_pool_return_page(struct page_pool *pool, struct page *page);
 315
 316noinline
 317static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
 318{
 319	struct ptr_ring *r = &pool->ring;
 320	struct page *page;
 321	int pref_nid; /* preferred NUMA node */
 322
 323	/* Quicker fallback, avoid locks when ring is empty */
 324	if (__ptr_ring_empty(r)) {
 325		alloc_stat_inc(pool, empty);
 326		return NULL;
 327	}
 328
 329	/* Softirq guarantee CPU and thus NUMA node is stable. This,
 330	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
 331	 */
 332#ifdef CONFIG_NUMA
 333	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
 334#else
 335	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
 336	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
 337#endif
 338
 339	/* Refill alloc array, but only if NUMA match */
 340	do {
 341		page = __ptr_ring_consume(r);
 342		if (unlikely(!page))
 343			break;
 344
 345		if (likely(page_to_nid(page) == pref_nid)) {
 346			pool->alloc.cache[pool->alloc.count++] = page;
 347		} else {
 348			/* NUMA mismatch;
 349			 * (1) release 1 page to page-allocator and
 350			 * (2) break out to fallthrough to alloc_pages_node.
 351			 * This limit stress on page buddy alloactor.
 352			 */
 353			page_pool_return_page(pool, page);
 354			alloc_stat_inc(pool, waive);
 355			page = NULL;
 356			break;
 357		}
 358	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
 359
 360	/* Return last page */
 361	if (likely(pool->alloc.count > 0)) {
 362		page = pool->alloc.cache[--pool->alloc.count];
 363		alloc_stat_inc(pool, refill);
 364	}
 365
 366	return page;
 367}
 368
 369/* fast path */
 370static struct page *__page_pool_get_cached(struct page_pool *pool)
 371{
 372	struct page *page;
 373
 374	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
 375	if (likely(pool->alloc.count)) {
 376		/* Fast-path */
 377		page = pool->alloc.cache[--pool->alloc.count];
 378		alloc_stat_inc(pool, fast);
 379	} else {
 380		page = page_pool_refill_alloc_cache(pool);
 381	}
 382
 383	return page;
 384}
 385
 386static void page_pool_dma_sync_for_device(struct page_pool *pool,
 387					  struct page *page,
 388					  unsigned int dma_sync_size)
 389{
 390	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
 
 391
 392	dma_sync_size = min(dma_sync_size, pool->p.max_len);
 393	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
 394					 pool->p.offset, dma_sync_size,
 395					 pool->p.dma_dir);
 396}
 397
 398static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 
 
 
 
 
 
 
 
 
 399{
 400	dma_addr_t dma;
 401
 402	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
 403	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
 404	 * into page private data (i.e 32bit cpu with 64bit DMA caps)
 405	 * This mapping is kept for lifetime of page, until leaving pool.
 406	 */
 407	dma = dma_map_page_attrs(pool->p.dev, page, 0,
 408				 (PAGE_SIZE << pool->p.order),
 409				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
 410						  DMA_ATTR_WEAK_ORDERING);
 411	if (dma_mapping_error(pool->p.dev, dma))
 412		return false;
 413
 414	if (page_pool_set_dma_addr(page, dma))
 415		goto unmap_failed;
 416
 417	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 418		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
 419
 420	return true;
 421
 422unmap_failed:
 423	WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
 424	dma_unmap_page_attrs(pool->p.dev, dma,
 425			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 426			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
 427	return false;
 428}
 429
 430static void page_pool_set_pp_info(struct page_pool *pool,
 431				  struct page *page)
 432{
 433	page->pp = pool;
 434	page->pp_magic |= PP_SIGNATURE;
 435
 436	/* Ensuring all pages have been split into one fragment initially:
 437	 * page_pool_set_pp_info() is only called once for every page when it
 438	 * is allocated from the page allocator and page_pool_fragment_page()
 439	 * is dirtying the same cache line as the page->pp_magic above, so
 440	 * the overhead is negligible.
 441	 */
 442	page_pool_fragment_page(page, 1);
 443	if (pool->has_init_callback)
 444		pool->slow.init_callback(page, pool->slow.init_arg);
 445}
 446
 447static void page_pool_clear_pp_info(struct page *page)
 448{
 449	page->pp_magic = 0;
 450	page->pp = NULL;
 451}
 452
 453static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 454						 gfp_t gfp)
 455{
 456	struct page *page;
 457
 458	gfp |= __GFP_COMP;
 459	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
 460	if (unlikely(!page))
 461		return NULL;
 462
 463	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
 464	    unlikely(!page_pool_dma_map(pool, page))) {
 465		put_page(page);
 466		return NULL;
 467	}
 468
 469	alloc_stat_inc(pool, slow_high_order);
 470	page_pool_set_pp_info(pool, page);
 471
 472	/* Track how many pages are held 'in-flight' */
 473	pool->pages_state_hold_cnt++;
 474	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
 
 475	return page;
 476}
 477
 478/* slow path */
 479noinline
 480static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 481						 gfp_t gfp)
 482{
 483	const int bulk = PP_ALLOC_CACHE_REFILL;
 484	unsigned int pp_flags = pool->p.flags;
 485	unsigned int pp_order = pool->p.order;
 486	struct page *page;
 
 487	int i, nr_pages;
 488
 489	/* Don't support bulk alloc for high-order pages */
 490	if (unlikely(pp_order))
 491		return __page_pool_alloc_page_order(pool, gfp);
 492
 493	/* Unnecessary as alloc cache is empty, but guarantees zero count */
 494	if (unlikely(pool->alloc.count > 0))
 495		return pool->alloc.cache[--pool->alloc.count];
 496
 497	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
 498	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
 499
 500	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
 501					       pool->alloc.cache);
 
 502	if (unlikely(!nr_pages))
 503		return NULL;
 504
 505	/* Pages have been filled into alloc.cache array, but count is zero and
 506	 * page element have not been (possibly) DMA mapped.
 507	 */
 508	for (i = 0; i < nr_pages; i++) {
 509		page = pool->alloc.cache[i];
 510		if ((pp_flags & PP_FLAG_DMA_MAP) &&
 511		    unlikely(!page_pool_dma_map(pool, page))) {
 512			put_page(page);
 513			continue;
 514		}
 515
 516		page_pool_set_pp_info(pool, page);
 517		pool->alloc.cache[pool->alloc.count++] = page;
 518		/* Track how many pages are held 'in-flight' */
 519		pool->pages_state_hold_cnt++;
 520		trace_page_pool_state_hold(pool, page,
 521					   pool->pages_state_hold_cnt);
 522	}
 523
 524	/* Return last page */
 525	if (likely(pool->alloc.count > 0)) {
 526		page = pool->alloc.cache[--pool->alloc.count];
 527		alloc_stat_inc(pool, slow);
 528	} else {
 529		page = NULL;
 530	}
 531
 532	/* When page just alloc'ed is should/must have refcnt 1. */
 533	return page;
 534}
 535
 536/* For using page_pool replace: alloc_pages() API calls, but provide
 537 * synchronization guarantee for allocation side.
 538 */
 539struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 540{
 541	struct page *page;
 542
 543	/* Fast-path: Get a page from cache */
 544	page = __page_pool_get_cached(pool);
 545	if (page)
 546		return page;
 547
 548	/* Slow-path: cache empty, do real allocation */
 549	page = __page_pool_alloc_pages_slow(pool, gfp);
 550	return page;
 
 
 
 
 
 
 
 
 
 551}
 552EXPORT_SYMBOL(page_pool_alloc_pages);
 
 553
 554/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 555 *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 556 */
 557#define _distance(a, b)	(s32)((a) - (b))
 558
 559s32 page_pool_inflight(const struct page_pool *pool, bool strict)
 560{
 561	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
 562	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
 563	s32 inflight;
 564
 565	inflight = _distance(hold_cnt, release_cnt);
 566
 567	if (strict) {
 568		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
 569		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
 570		     inflight);
 571	} else {
 572		inflight = max(0, inflight);
 573	}
 574
 575	return inflight;
 576}
 577
 578static __always_inline
 579void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 580{
 581	dma_addr_t dma;
 582
 583	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
 584		/* Always account for inflight pages, even if we didn't
 585		 * map them
 586		 */
 587		return;
 588
 589	dma = page_pool_get_dma_addr(page);
 590
 591	/* When page is unmapped, it cannot be returned to our pool */
 592	dma_unmap_page_attrs(pool->p.dev, dma,
 593			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 594			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
 595	page_pool_set_dma_addr(page, 0);
 596}
 597
 598/* Disconnects a page (from a page_pool).  API users can have a need
 599 * to disconnect a page (from a page_pool), to allow it to be used as
 600 * a regular page (that will eventually be returned to the normal
 601 * page-allocator via put_page).
 602 */
 603void page_pool_return_page(struct page_pool *pool, struct page *page)
 604{
 605	int count;
 
 606
 607	__page_pool_release_page_dma(pool, page);
 608
 609	page_pool_clear_pp_info(page);
 
 
 610
 611	/* This may be the last page returned, releasing the pool, so
 612	 * it is not safe to reference pool afterwards.
 613	 */
 614	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
 615	trace_page_pool_state_release(pool, page, count);
 616
 617	put_page(page);
 
 
 
 618	/* An optimization would be to call __free_pages(page, pool->p.order)
 619	 * knowing page is not part of page-cache (thus avoiding a
 620	 * __page_cache_release() call).
 621	 */
 622}
 623
 624static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
 625{
 626	int ret;
 627	/* BH protection not needed if current is softirq */
 628	if (in_softirq())
 629		ret = ptr_ring_produce(&pool->ring, page);
 630	else
 631		ret = ptr_ring_produce_bh(&pool->ring, page);
 632
 633	if (!ret) {
 634		recycle_stat_inc(pool, ring);
 635		return true;
 636	}
 637
 638	return false;
 639}
 640
 641/* Only allow direct recycling in special circumstances, into the
 642 * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
 643 *
 644 * Caller must provide appropriate safe context.
 645 */
 646static bool page_pool_recycle_in_cache(struct page *page,
 647				       struct page_pool *pool)
 648{
 649	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
 650		recycle_stat_inc(pool, cache_full);
 651		return false;
 652	}
 653
 654	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
 655	pool->alloc.cache[pool->alloc.count++] = page;
 656	recycle_stat_inc(pool, cached);
 657	return true;
 658}
 659
 660static bool __page_pool_page_can_be_recycled(const struct page *page)
 661{
 662	return page_ref_count(page) == 1 && !page_is_pfmemalloc(page);
 
 
 663}
 664
 665/* If the page refcnt == 1, this will try to recycle the page.
 666 * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 667 * the configured size min(dma_sync_size, pool->max_len).
 668 * If the page refcnt != 1, then the page will be returned to memory
 669 * subsystem.
 670 */
 671static __always_inline struct page *
 672__page_pool_put_page(struct page_pool *pool, struct page *page,
 673		     unsigned int dma_sync_size, bool allow_direct)
 674{
 675	lockdep_assert_no_hardirq();
 676
 677	/* This allocator is optimized for the XDP mode that uses
 678	 * one-frame-per-page, but have fallbacks that act like the
 679	 * regular page allocator APIs.
 680	 *
 681	 * refcnt == 1 means page_pool owns page, and can recycle it.
 682	 *
 683	 * page is NOT reusable when allocated when system is under
 684	 * some pressure. (page_is_pfmemalloc)
 685	 */
 686	if (likely(__page_pool_page_can_be_recycled(page))) {
 687		/* Read barrier done in page_ref_count / READ_ONCE */
 688
 689		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 690			page_pool_dma_sync_for_device(pool, page,
 691						      dma_sync_size);
 692
 693		if (allow_direct && in_softirq() &&
 694		    page_pool_recycle_in_cache(page, pool))
 695			return NULL;
 696
 697		/* Page found as candidate for recycling */
 698		return page;
 699	}
 
 700	/* Fallback/non-XDP mode: API user have elevated refcnt.
 701	 *
 702	 * Many drivers split up the page into fragments, and some
 703	 * want to keep doing this to save memory and do refcnt based
 704	 * recycling. Support this use case too, to ease drivers
 705	 * switching between XDP/non-XDP.
 706	 *
 707	 * In-case page_pool maintains the DMA mapping, API user must
 708	 * call page_pool_put_page once.  In this elevated refcnt
 709	 * case, the DMA is unmapped/released, as driver is likely
 710	 * doing refcnt based recycle tricks, meaning another process
 711	 * will be invoking put_page.
 712	 */
 713	recycle_stat_inc(pool, released_refcnt);
 714	page_pool_return_page(pool, page);
 715
 716	return NULL;
 717}
 718
 719void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
 720				unsigned int dma_sync_size, bool allow_direct)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 721{
 722	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
 723	if (page && !page_pool_recycle_in_ring(pool, page)) {
 
 
 
 
 724		/* Cache full, fallback to free pages */
 725		recycle_stat_inc(pool, ring_full);
 726		page_pool_return_page(pool, page);
 727	}
 728}
 
 
 
 
 
 
 
 
 729EXPORT_SYMBOL(page_pool_put_unrefed_page);
 730
 731/**
 732 * page_pool_put_page_bulk() - release references on multiple pages
 733 * @pool:	pool from which pages were allocated
 734 * @data:	array holding page pointers
 735 * @count:	number of pages in @data
 736 *
 737 * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
 738 * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
 739 * will release leftover pages to the page allocator.
 740 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
 741 * completion loop for the XDP_REDIRECT use case.
 742 *
 743 * Please note the caller must not use data area after running
 744 * page_pool_put_page_bulk(), as this function overwrites it.
 745 */
 746void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 747			     int count)
 748{
 749	int i, bulk_len = 0;
 
 750	bool in_softirq;
 751
 
 
 752	for (i = 0; i < count; i++) {
 753		struct page *page = virt_to_head_page(data[i]);
 754
 755		/* It is not the last user for the page frag case */
 756		if (!page_pool_is_last_ref(page))
 757			continue;
 758
 759		page = __page_pool_put_page(pool, page, -1, false);
 760		/* Approved for bulk recycling in ptr_ring cache */
 761		if (page)
 762			data[bulk_len++] = page;
 763	}
 764
 765	if (unlikely(!bulk_len))
 766		return;
 767
 768	/* Bulk producer into ptr_ring page_pool cache */
 769	in_softirq = page_pool_producer_lock(pool);
 770	for (i = 0; i < bulk_len; i++) {
 771		if (__ptr_ring_produce(&pool->ring, data[i])) {
 772			/* ring full */
 773			recycle_stat_inc(pool, ring_full);
 774			break;
 775		}
 776	}
 777	recycle_stat_add(pool, ring, i);
 778	page_pool_producer_unlock(pool, in_softirq);
 779
 780	/* Hopefully all pages was return into ptr_ring */
 781	if (likely(i == bulk_len))
 782		return;
 783
 784	/* ptr_ring cache full, free remaining pages outside producer lock
 785	 * since put_page() with refcnt == 1 can be an expensive operation
 786	 */
 787	for (; i < bulk_len; i++)
 788		page_pool_return_page(pool, data[i]);
 789}
 790EXPORT_SYMBOL(page_pool_put_page_bulk);
 791
 792static struct page *page_pool_drain_frag(struct page_pool *pool,
 793					 struct page *page)
 794{
 795	long drain_count = BIAS_MAX - pool->frag_users;
 796
 797	/* Some user is still using the page frag */
 798	if (likely(page_pool_unref_page(page, drain_count)))
 799		return NULL;
 800
 801	if (__page_pool_page_can_be_recycled(page)) {
 802		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 803			page_pool_dma_sync_for_device(pool, page, -1);
 804
 805		return page;
 
 
 806	}
 807
 808	page_pool_return_page(pool, page);
 809	return NULL;
 810}
 811
 812static void page_pool_free_frag(struct page_pool *pool)
 813{
 814	long drain_count = BIAS_MAX - pool->frag_users;
 815	struct page *page = pool->frag_page;
 816
 817	pool->frag_page = NULL;
 818
 819	if (!page || page_pool_unref_page(page, drain_count))
 820		return;
 821
 822	page_pool_return_page(pool, page);
 823}
 824
 825struct page *page_pool_alloc_frag(struct page_pool *pool,
 826				  unsigned int *offset,
 827				  unsigned int size, gfp_t gfp)
 828{
 829	unsigned int max_size = PAGE_SIZE << pool->p.order;
 830	struct page *page = pool->frag_page;
 831
 832	if (WARN_ON(size > max_size))
 833		return NULL;
 834
 835	size = ALIGN(size, dma_get_cache_alignment());
 836	*offset = pool->frag_offset;
 837
 838	if (page && *offset + size > max_size) {
 839		page = page_pool_drain_frag(pool, page);
 840		if (page) {
 
 841			alloc_stat_inc(pool, fast);
 842			goto frag_reset;
 843		}
 844	}
 845
 846	if (!page) {
 847		page = page_pool_alloc_pages(pool, gfp);
 848		if (unlikely(!page)) {
 849			pool->frag_page = NULL;
 850			return NULL;
 851		}
 852
 853		pool->frag_page = page;
 854
 855frag_reset:
 856		pool->frag_users = 1;
 857		*offset = 0;
 858		pool->frag_offset = size;
 859		page_pool_fragment_page(page, BIAS_MAX);
 860		return page;
 861	}
 862
 863	pool->frag_users++;
 864	pool->frag_offset = *offset + size;
 865	alloc_stat_inc(pool, fast);
 866	return page;
 
 
 
 
 
 
 
 867}
 868EXPORT_SYMBOL(page_pool_alloc_frag);
 869
 870static void page_pool_empty_ring(struct page_pool *pool)
 871{
 872	struct page *page;
 873
 874	/* Empty recycle ring */
 875	while ((page = ptr_ring_consume_bh(&pool->ring))) {
 876		/* Verify the refcnt invariant of cached pages */
 877		if (!(page_ref_count(page) == 1))
 878			pr_crit("%s() page_pool refcnt %d violation\n",
 879				__func__, page_ref_count(page));
 880
 881		page_pool_return_page(pool, page);
 882	}
 883}
 884
 885static void __page_pool_destroy(struct page_pool *pool)
 886{
 887	if (pool->disconnect)
 888		pool->disconnect(pool);
 889
 890	page_pool_unlist(pool);
 891	page_pool_uninit(pool);
 
 
 
 
 
 
 892	kfree(pool);
 893}
 894
 895static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
 896{
 897	struct page *page;
 898
 899	if (pool->destroy_cnt)
 900		return;
 901
 902	/* Empty alloc cache, assume caller made sure this is
 903	 * no-longer in use, and page_pool_alloc_pages() cannot be
 904	 * call concurrently.
 905	 */
 906	while (pool->alloc.count) {
 907		page = pool->alloc.cache[--pool->alloc.count];
 908		page_pool_return_page(pool, page);
 909	}
 910}
 911
 912static void page_pool_scrub(struct page_pool *pool)
 913{
 914	page_pool_empty_alloc_cache_once(pool);
 915	pool->destroy_cnt++;
 916
 917	/* No more consumers should exist, but producers could still
 918	 * be in-flight.
 919	 */
 920	page_pool_empty_ring(pool);
 921}
 922
 923static int page_pool_release(struct page_pool *pool)
 924{
 925	int inflight;
 926
 927	page_pool_scrub(pool);
 928	inflight = page_pool_inflight(pool, true);
 929	if (!inflight)
 930		__page_pool_destroy(pool);
 931
 932	return inflight;
 933}
 934
 935static void page_pool_release_retry(struct work_struct *wq)
 936{
 937	struct delayed_work *dwq = to_delayed_work(wq);
 938	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
 939	void *netdev;
 940	int inflight;
 941
 942	inflight = page_pool_release(pool);
 943	if (!inflight)
 944		return;
 945
 946	/* Periodic warning for page pools the user can't see */
 947	netdev = READ_ONCE(pool->slow.netdev);
 948	if (time_after_eq(jiffies, pool->defer_warn) &&
 949	    (!netdev || netdev == NET_PTR_POISON)) {
 950		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
 951
 952		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
 953			__func__, pool->user.id, inflight, sec);
 954		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
 955	}
 956
 957	/* Still not ready to be disconnected, retry later */
 958	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
 959}
 960
 961void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 962			   struct xdp_mem_info *mem)
 963{
 964	refcount_inc(&pool->user_cnt);
 965	pool->disconnect = disconnect;
 966	pool->xdp_mem_id = mem->id;
 967}
 968
 969static void page_pool_disable_direct_recycling(struct page_pool *pool)
 970{
 971	/* Disable direct recycling based on pool->cpuid.
 972	 * Paired with READ_ONCE() in napi_pp_put_page().
 973	 */
 974	WRITE_ONCE(pool->cpuid, -1);
 975
 976	if (!pool->p.napi)
 977		return;
 978
 979	/* To avoid races with recycling and additional barriers make sure
 980	 * pool and NAPI are unlinked when NAPI is disabled.
 981	 */
 982	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
 983		READ_ONCE(pool->p.napi->list_owner) != -1);
 984
 
 985	WRITE_ONCE(pool->p.napi, NULL);
 
 986}
 
 987
 988void page_pool_destroy(struct page_pool *pool)
 989{
 990	if (!pool)
 991		return;
 992
 993	if (!page_pool_put(pool))
 994		return;
 995
 996	page_pool_disable_direct_recycling(pool);
 997	page_pool_free_frag(pool);
 998
 999	if (!page_pool_release(pool))
1000		return;
1001
1002	page_pool_detached(pool);
1003	pool->defer_start = jiffies;
1004	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
1005
1006	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
1007	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1008}
1009EXPORT_SYMBOL(page_pool_destroy);
1010
1011/* Caller must provide appropriate safe context, e.g. NAPI. */
1012void page_pool_update_nid(struct page_pool *pool, int new_nid)
1013{
1014	struct page *page;
1015
1016	trace_page_pool_update_nid(pool, new_nid);
1017	pool->p.nid = new_nid;
1018
1019	/* Flush pool alloc cache, as refill will check NUMA node */
1020	while (pool->alloc.count) {
1021		page = pool->alloc.cache[--pool->alloc.count];
1022		page_pool_return_page(pool, page);
1023	}
1024}
1025EXPORT_SYMBOL(page_pool_update_nid);