page_pool.c - net/core/page_pool.c - Linux diff v6.13.7

   1/* SPDX-License-Identifier: GPL-2.0
   2 *
   3 * page_pool.c
   4 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
   5 *	Copyright (C) 2016 Red Hat, Inc.
   6 */
   7
   8#include <linux/error-injection.h>
   9#include <linux/types.h>
  10#include <linux/kernel.h>
  11#include <linux/slab.h>
  12#include <linux/device.h>
  13
  14#include <net/netdev_rx_queue.h>
  15#include <net/page_pool/helpers.h>
  16#include <net/xdp.h>
  17
  18#include <linux/dma-direction.h>
  19#include <linux/dma-mapping.h>
  20#include <linux/page-flags.h>
  21#include <linux/mm.h> /* for put_page() */
  22#include <linux/poison.h>
  23#include <linux/ethtool.h>
  24#include <linux/netdevice.h>
  25
  26#include <trace/events/page_pool.h>
  27
  28#include "mp_dmabuf_devmem.h"
  29#include "netmem_priv.h"
  30#include "page_pool_priv.h"
  31
  32DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
  33
  34#define DEFER_TIME (msecs_to_jiffies(1000))
  35#define DEFER_WARN_INTERVAL (60 * HZ)
  36
  37#define BIAS_MAX	(LONG_MAX >> 1)
  38
  39#ifdef CONFIG_PAGE_POOL_STATS
  40static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
  41
  42/* alloc_stat_inc is intended to be used in softirq context */
  43#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
  44/* recycle_stat_inc is safe to use when preemption is possible. */
  45#define recycle_stat_inc(pool, __stat)							\
  46	do {										\
  47		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
  48		this_cpu_inc(s->__stat);						\
  49	} while (0)
  50
  51#define recycle_stat_add(pool, __stat, val)						\
  52	do {										\
  53		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
  54		this_cpu_add(s->__stat, val);						\
  55	} while (0)
  56
  57static const char pp_stats[][ETH_GSTRING_LEN] = {
  58	"rx_pp_alloc_fast",
  59	"rx_pp_alloc_slow",
  60	"rx_pp_alloc_slow_ho",
  61	"rx_pp_alloc_empty",
  62	"rx_pp_alloc_refill",
  63	"rx_pp_alloc_waive",
  64	"rx_pp_recycle_cached",
  65	"rx_pp_recycle_cache_full",
  66	"rx_pp_recycle_ring",
  67	"rx_pp_recycle_ring_full",
  68	"rx_pp_recycle_released_ref",
  69};
  70
  71/**
  72 * page_pool_get_stats() - fetch page pool stats
  73 * @pool:	pool from which page was allocated
  74 * @stats:	struct page_pool_stats to fill in
  75 *
  76 * Retrieve statistics about the page_pool. This API is only available
  77 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
  78 * A pointer to a caller allocated struct page_pool_stats structure
  79 * is passed to this API which is filled in. The caller can then report
  80 * those stats to the user (perhaps via ethtool, debugfs, etc.).
  81 */
  82bool page_pool_get_stats(const struct page_pool *pool,
  83			 struct page_pool_stats *stats)
  84{
  85	int cpu = 0;
  86
  87	if (!stats)
  88		return false;
  89
  90	/* The caller is responsible to initialize stats. */
  91	stats->alloc_stats.fast += pool->alloc_stats.fast;
  92	stats->alloc_stats.slow += pool->alloc_stats.slow;
  93	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
  94	stats->alloc_stats.empty += pool->alloc_stats.empty;
  95	stats->alloc_stats.refill += pool->alloc_stats.refill;
  96	stats->alloc_stats.waive += pool->alloc_stats.waive;
  97
  98	for_each_possible_cpu(cpu) {
  99		const struct page_pool_recycle_stats *pcpu =
 100			per_cpu_ptr(pool->recycle_stats, cpu);
 101
 102		stats->recycle_stats.cached += pcpu->cached;
 103		stats->recycle_stats.cache_full += pcpu->cache_full;
 104		stats->recycle_stats.ring += pcpu->ring;
 105		stats->recycle_stats.ring_full += pcpu->ring_full;
 106		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
 107	}
 108
 109	return true;
 110}
 111EXPORT_SYMBOL(page_pool_get_stats);
 112
 113u8 *page_pool_ethtool_stats_get_strings(u8 *data)
 114{
 115	int i;
 116
 117	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
 118		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
 119		data += ETH_GSTRING_LEN;
 120	}
 121
 122	return data;
 123}
 124EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
 125
 126int page_pool_ethtool_stats_get_count(void)
 127{
 128	return ARRAY_SIZE(pp_stats);
 129}
 130EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
 131
 132u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
 133{
 134	const struct page_pool_stats *pool_stats = stats;
 135
 136	*data++ = pool_stats->alloc_stats.fast;
 137	*data++ = pool_stats->alloc_stats.slow;
 138	*data++ = pool_stats->alloc_stats.slow_high_order;
 139	*data++ = pool_stats->alloc_stats.empty;
 140	*data++ = pool_stats->alloc_stats.refill;
 141	*data++ = pool_stats->alloc_stats.waive;
 142	*data++ = pool_stats->recycle_stats.cached;
 143	*data++ = pool_stats->recycle_stats.cache_full;
 144	*data++ = pool_stats->recycle_stats.ring;
 145	*data++ = pool_stats->recycle_stats.ring_full;
 146	*data++ = pool_stats->recycle_stats.released_refcnt;
 147
 148	return data;
 149}
 150EXPORT_SYMBOL(page_pool_ethtool_stats_get);
 151
 152#else
 153#define alloc_stat_inc(pool, __stat)
 154#define recycle_stat_inc(pool, __stat)
 155#define recycle_stat_add(pool, __stat, val)
 156#endif
 157
 158static bool page_pool_producer_lock(struct page_pool *pool)
 159	__acquires(&pool->ring.producer_lock)
 160{
 161	bool in_softirq = in_softirq();
 162
 163	if (in_softirq)
 164		spin_lock(&pool->ring.producer_lock);
 165	else
 166		spin_lock_bh(&pool->ring.producer_lock);
 167
 168	return in_softirq;
 169}
 170
 171static void page_pool_producer_unlock(struct page_pool *pool,
 172				      bool in_softirq)
 173	__releases(&pool->ring.producer_lock)
 174{
 175	if (in_softirq)
 176		spin_unlock(&pool->ring.producer_lock);
 177	else
 178		spin_unlock_bh(&pool->ring.producer_lock);
 179}
 180
 181static void page_pool_struct_check(void)
 182{
 183	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
 184	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
 185	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
 186	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
 187				    PAGE_POOL_FRAG_GROUP_ALIGN);
 188}
 189
 190static int page_pool_init(struct page_pool *pool,
 191			  const struct page_pool_params *params,
 192			  int cpuid)
 193{
 194	unsigned int ring_qsize = 1024; /* Default */
 195	struct netdev_rx_queue *rxq;
 196	int err;
 197
 198	page_pool_struct_check();
 199
 200	memcpy(&pool->p, &params->fast, sizeof(pool->p));
 201	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
 202
 203	pool->cpuid = cpuid;
 204
 205	/* Validate only known flags were used */
 206	if (pool->slow.flags & ~PP_FLAG_ALL)
 207		return -EINVAL;
 208
 209	if (pool->p.pool_size)
 210		ring_qsize = pool->p.pool_size;
 211
 212	/* Sanity limit mem that can be pinned down */
 213	if (ring_qsize > 32768)
 214		return -E2BIG;
 215
 216	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
 217	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
 218	 * which is the XDP_TX use-case.
 219	 */
 220	if (pool->slow.flags & PP_FLAG_DMA_MAP) {
 221		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
 222		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
 223			return -EINVAL;
 224
 225		pool->dma_map = true;
 226	}
 227
 228	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
 229		/* In order to request DMA-sync-for-device the page
 230		 * needs to be mapped
 231		 */
 232		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
 233			return -EINVAL;
 234
 235		if (!pool->p.max_len)
 236			return -EINVAL;
 237
 238		pool->dma_sync = true;
 239
 240		/* pool->p.offset has to be set according to the address
 241		 * offset used by the DMA engine to start copying rx data
 242		 */
 243	}
 244
 245	pool->has_init_callback = !!pool->slow.init_callback;
 246
 247#ifdef CONFIG_PAGE_POOL_STATS
 248	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
 249		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
 250		if (!pool->recycle_stats)
 251			return -ENOMEM;
 252	} else {
 253		/* For system page pool instance we use a singular stats object
 254		 * instead of allocating a separate percpu variable for each
 255		 * (also percpu) page pool instance.
 256		 */
 257		pool->recycle_stats = &pp_system_recycle_stats;
 258		pool->system = true;
 259	}
 260#endif
 261
 262	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
 263#ifdef CONFIG_PAGE_POOL_STATS
 264		if (!pool->system)
 265			free_percpu(pool->recycle_stats);
 266#endif
 267		return -ENOMEM;
 268	}
 269
 270	atomic_set(&pool->pages_state_release_cnt, 0);
 271
 272	/* Driver calling page_pool_create() also call page_pool_destroy() */
 273	refcount_set(&pool->user_cnt, 1);
 274
 275	if (pool->dma_map)
 276		get_device(pool->p.dev);
 277
 278	if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
 279		/* We rely on rtnl_lock()ing to make sure netdev_rx_queue
 280		 * configuration doesn't change while we're initializing
 281		 * the page_pool.
 282		 */
 283		ASSERT_RTNL();
 284		rxq = __netif_get_rx_queue(pool->slow.netdev,
 285					   pool->slow.queue_idx);
 286		pool->mp_priv = rxq->mp_params.mp_priv;
 287	}
 288
 289	if (pool->mp_priv) {
 290		err = mp_dmabuf_devmem_init(pool);
 291		if (err) {
 292			pr_warn("%s() mem-provider init failed %d\n", __func__,
 293				err);
 294			goto free_ptr_ring;
 295		}
 296
 297		static_branch_inc(&page_pool_mem_providers);
 298	}
 299
 300	return 0;
 301
 302free_ptr_ring:
 303	ptr_ring_cleanup(&pool->ring, NULL);
 304#ifdef CONFIG_PAGE_POOL_STATS
 305	if (!pool->system)
 306		free_percpu(pool->recycle_stats);
 307#endif
 308	return err;
 309}
 310
 311static void page_pool_uninit(struct page_pool *pool)
 312{
 313	ptr_ring_cleanup(&pool->ring, NULL);
 314
 315	if (pool->dma_map)
 316		put_device(pool->p.dev);
 317
 318#ifdef CONFIG_PAGE_POOL_STATS
 319	if (!pool->system)
 320		free_percpu(pool->recycle_stats);
 321#endif
 322}
 323
 324/**
 325 * page_pool_create_percpu() - create a page pool for a given cpu.
 326 * @params: parameters, see struct page_pool_params
 327 * @cpuid: cpu identifier
 328 */
 329struct page_pool *
 330page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
 331{
 332	struct page_pool *pool;
 333	int err;
 334
 335	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
 336	if (!pool)
 337		return ERR_PTR(-ENOMEM);
 338
 339	err = page_pool_init(pool, params, cpuid);
 340	if (err < 0)
 341		goto err_free;
 342
 343	err = page_pool_list(pool);
 344	if (err)
 345		goto err_uninit;
 346
 347	return pool;
 348
 349err_uninit:
 350	page_pool_uninit(pool);
 351err_free:
 352	pr_warn("%s() gave up with errno %d\n", __func__, err);
 353	kfree(pool);
 354	return ERR_PTR(err);
 355}
 356EXPORT_SYMBOL(page_pool_create_percpu);
 357
 358/**
 359 * page_pool_create() - create a page pool
 360 * @params: parameters, see struct page_pool_params
 361 */
 362struct page_pool *page_pool_create(const struct page_pool_params *params)
 363{
 364	return page_pool_create_percpu(params, -1);
 365}
 366EXPORT_SYMBOL(page_pool_create);
 367
 368static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
 369
 370static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
 
 371{
 372	struct ptr_ring *r = &pool->ring;
 373	netmem_ref netmem;
 374	int pref_nid; /* preferred NUMA node */
 375
 376	/* Quicker fallback, avoid locks when ring is empty */
 377	if (__ptr_ring_empty(r)) {
 378		alloc_stat_inc(pool, empty);
 379		return 0;
 380	}
 381
 382	/* Softirq guarantee CPU and thus NUMA node is stable. This,
 383	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
 384	 */
 385#ifdef CONFIG_NUMA
 386	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
 387#else
 388	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
 389	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
 390#endif
 391
 392	/* Refill alloc array, but only if NUMA match */
 393	do {
 394		netmem = (__force netmem_ref)__ptr_ring_consume(r);
 395		if (unlikely(!netmem))
 396			break;
 397
 398		if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
 399			pool->alloc.cache[pool->alloc.count++] = netmem;
 400		} else {
 401			/* NUMA mismatch;
 402			 * (1) release 1 page to page-allocator and
 403			 * (2) break out to fallthrough to alloc_pages_node.
 404			 * This limit stress on page buddy alloactor.
 405			 */
 406			page_pool_return_page(pool, netmem);
 407			alloc_stat_inc(pool, waive);
 408			netmem = 0;
 409			break;
 410		}
 411	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
 412
 413	/* Return last page */
 414	if (likely(pool->alloc.count > 0)) {
 415		netmem = pool->alloc.cache[--pool->alloc.count];
 416		alloc_stat_inc(pool, refill);
 417	}
 418
 419	return netmem;
 420}
 421
 422/* fast path */
 423static netmem_ref __page_pool_get_cached(struct page_pool *pool)
 424{
 425	netmem_ref netmem;
 426
 427	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
 428	if (likely(pool->alloc.count)) {
 429		/* Fast-path */
 430		netmem = pool->alloc.cache[--pool->alloc.count];
 431		alloc_stat_inc(pool, fast);
 432	} else {
 433		netmem = page_pool_refill_alloc_cache(pool);
 434	}
 435
 436	return netmem;
 437}
 438
 439static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
 440					    netmem_ref netmem,
 441					    u32 dma_sync_size)
 442{
 443#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
 444	dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
 445
 446	dma_sync_size = min(dma_sync_size, pool->p.max_len);
 447	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
 448				     dma_sync_size, pool->p.dma_dir);
 449#endif
 450}
 451
 452static __always_inline void
 453page_pool_dma_sync_for_device(const struct page_pool *pool,
 454			      netmem_ref netmem,
 455			      u32 dma_sync_size)
 456{
 457	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
 458		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
 459}
 460
 461static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
 462{
 463	dma_addr_t dma;
 464
 465	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
 466	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
 467	 * into page private data (i.e 32bit cpu with 64bit DMA caps)
 468	 * This mapping is kept for lifetime of page, until leaving pool.
 469	 */
 470	dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
 471				 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
 472				 DMA_ATTR_SKIP_CPU_SYNC |
 473					 DMA_ATTR_WEAK_ORDERING);
 474	if (dma_mapping_error(pool->p.dev, dma))
 475		return false;
 476
 477	if (page_pool_set_dma_addr_netmem(netmem, dma))
 478		goto unmap_failed;
 479
 480	page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
 
 481
 482	return true;
 483
 484unmap_failed:
 485	WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
 486	dma_unmap_page_attrs(pool->p.dev, dma,
 487			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 488			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
 489	return false;
 490}
 491
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 492static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 493						 gfp_t gfp)
 494{
 495	struct page *page;
 496
 497	gfp |= __GFP_COMP;
 498	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
 499	if (unlikely(!page))
 500		return NULL;
 501
 502	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
 
 503		put_page(page);
 504		return NULL;
 505	}
 506
 507	alloc_stat_inc(pool, slow_high_order);
 508	page_pool_set_pp_info(pool, page_to_netmem(page));
 509
 510	/* Track how many pages are held 'in-flight' */
 511	pool->pages_state_hold_cnt++;
 512	trace_page_pool_state_hold(pool, page_to_netmem(page),
 513				   pool->pages_state_hold_cnt);
 514	return page;
 515}
 516
 517/* slow path */
 518static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
 519							gfp_t gfp)
 
 520{
 521	const int bulk = PP_ALLOC_CACHE_REFILL;
 
 522	unsigned int pp_order = pool->p.order;
 523	bool dma_map = pool->dma_map;
 524	netmem_ref netmem;
 525	int i, nr_pages;
 526
 527	/* Don't support bulk alloc for high-order pages */
 528	if (unlikely(pp_order))
 529		return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
 530
 531	/* Unnecessary as alloc cache is empty, but guarantees zero count */
 532	if (unlikely(pool->alloc.count > 0))
 533		return pool->alloc.cache[--pool->alloc.count];
 534
 535	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
 536	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
 537
 538	nr_pages = alloc_pages_bulk_array_node(gfp,
 539					       pool->p.nid, bulk,
 540					       (struct page **)pool->alloc.cache);
 541	if (unlikely(!nr_pages))
 542		return 0;
 543
 544	/* Pages have been filled into alloc.cache array, but count is zero and
 545	 * page element have not been (possibly) DMA mapped.
 546	 */
 547	for (i = 0; i < nr_pages; i++) {
 548		netmem = pool->alloc.cache[i];
 549		if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
 550			put_page(netmem_to_page(netmem));
 
 551			continue;
 552		}
 553
 554		page_pool_set_pp_info(pool, netmem);
 555		pool->alloc.cache[pool->alloc.count++] = netmem;
 556		/* Track how many pages are held 'in-flight' */
 557		pool->pages_state_hold_cnt++;
 558		trace_page_pool_state_hold(pool, netmem,
 559					   pool->pages_state_hold_cnt);
 560	}
 561
 562	/* Return last page */
 563	if (likely(pool->alloc.count > 0)) {
 564		netmem = pool->alloc.cache[--pool->alloc.count];
 565		alloc_stat_inc(pool, slow);
 566	} else {
 567		netmem = 0;
 568	}
 569
 570	/* When page just alloc'ed is should/must have refcnt 1. */
 571	return netmem;
 572}
 573
 574/* For using page_pool replace: alloc_pages() API calls, but provide
 575 * synchronization guarantee for allocation side.
 576 */
 577netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
 578{
 579	netmem_ref netmem;
 580
 581	/* Fast-path: Get a page from cache */
 582	netmem = __page_pool_get_cached(pool);
 583	if (netmem)
 584		return netmem;
 585
 586	/* Slow-path: cache empty, do real allocation */
 587	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
 588		netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
 589	else
 590		netmem = __page_pool_alloc_pages_slow(pool, gfp);
 591	return netmem;
 592}
 593EXPORT_SYMBOL(page_pool_alloc_netmem);
 594
 595struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 596{
 597	return netmem_to_page(page_pool_alloc_netmem(pool, gfp));
 598}
 599EXPORT_SYMBOL(page_pool_alloc_pages);
 600ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
 601
 602/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 603 *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 604 */
 605#define _distance(a, b)	(s32)((a) - (b))
 606
 607s32 page_pool_inflight(const struct page_pool *pool, bool strict)
 608{
 609	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
 610	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
 611	s32 inflight;
 612
 613	inflight = _distance(hold_cnt, release_cnt);
 614
 615	if (strict) {
 616		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
 617		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
 618		     inflight);
 619	} else {
 620		inflight = max(0, inflight);
 621	}
 622
 623	return inflight;
 624}
 625
 626void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
 627{
 628	netmem_set_pp(netmem, pool);
 629	netmem_or_pp_magic(netmem, PP_SIGNATURE);
 630
 631	/* Ensuring all pages have been split into one fragment initially:
 632	 * page_pool_set_pp_info() is only called once for every page when it
 633	 * is allocated from the page allocator and page_pool_fragment_page()
 634	 * is dirtying the same cache line as the page->pp_magic above, so
 635	 * the overhead is negligible.
 636	 */
 637	page_pool_fragment_netmem(netmem, 1);
 638	if (pool->has_init_callback)
 639		pool->slow.init_callback(netmem, pool->slow.init_arg);
 640}
 641
 642void page_pool_clear_pp_info(netmem_ref netmem)
 643{
 644	netmem_clear_pp_magic(netmem);
 645	netmem_set_pp(netmem, NULL);
 646}
 647
 648static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
 649							 netmem_ref netmem)
 650{
 651	dma_addr_t dma;
 652
 653	if (!pool->dma_map)
 654		/* Always account for inflight pages, even if we didn't
 655		 * map them
 656		 */
 657		return;
 658
 659	dma = page_pool_get_dma_addr_netmem(netmem);
 660
 661	/* When page is unmapped, it cannot be returned to our pool */
 662	dma_unmap_page_attrs(pool->p.dev, dma,
 663			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 664			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
 665	page_pool_set_dma_addr_netmem(netmem, 0);
 666}
 667
 668/* Disconnects a page (from a page_pool).  API users can have a need
 669 * to disconnect a page (from a page_pool), to allow it to be used as
 670 * a regular page (that will eventually be returned to the normal
 671 * page-allocator via put_page).
 672 */
 673void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
 674{
 675	int count;
 676	bool put;
 677
 678	put = true;
 679	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
 680		put = mp_dmabuf_devmem_release_page(pool, netmem);
 681	else
 682		__page_pool_release_page_dma(pool, netmem);
 683
 684	/* This may be the last page returned, releasing the pool, so
 685	 * it is not safe to reference pool afterwards.
 686	 */
 687	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
 688	trace_page_pool_state_release(pool, netmem, count);
 689
 690	if (put) {
 691		page_pool_clear_pp_info(netmem);
 692		put_page(netmem_to_page(netmem));
 693	}
 694	/* An optimization would be to call __free_pages(page, pool->p.order)
 695	 * knowing page is not part of page-cache (thus avoiding a
 696	 * __page_cache_release() call).
 697	 */
 698}
 699
 700static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
 701{
 702	int ret;
 703	/* BH protection not needed if current is softirq */
 704	if (in_softirq())
 705		ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
 706	else
 707		ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);
 708
 709	if (!ret) {
 710		recycle_stat_inc(pool, ring);
 711		return true;
 712	}
 713
 714	return false;
 715}
 716
 717/* Only allow direct recycling in special circumstances, into the
 718 * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
 719 *
 720 * Caller must provide appropriate safe context.
 721 */
 722static bool page_pool_recycle_in_cache(netmem_ref netmem,
 723				       struct page_pool *pool)
 724{
 725	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
 726		recycle_stat_inc(pool, cache_full);
 727		return false;
 728	}
 729
 730	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
 731	pool->alloc.cache[pool->alloc.count++] = netmem;
 732	recycle_stat_inc(pool, cached);
 733	return true;
 734}
 735
 736static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
 737{
 738	return netmem_is_net_iov(netmem) ||
 739	       (page_ref_count(netmem_to_page(netmem)) == 1 &&
 740		!page_is_pfmemalloc(netmem_to_page(netmem)));
 741}
 742
 743/* If the page refcnt == 1, this will try to recycle the page.
 744 * If pool->dma_sync is set, we'll try to sync the DMA area for
 745 * the configured size min(dma_sync_size, pool->max_len).
 746 * If the page refcnt != 1, then the page will be returned to memory
 747 * subsystem.
 748 */
 749static __always_inline netmem_ref
 750__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
 751		     unsigned int dma_sync_size, bool allow_direct)
 752{
 753	lockdep_assert_no_hardirq();
 754
 755	/* This allocator is optimized for the XDP mode that uses
 756	 * one-frame-per-page, but have fallbacks that act like the
 757	 * regular page allocator APIs.
 758	 *
 759	 * refcnt == 1 means page_pool owns page, and can recycle it.
 760	 *
 761	 * page is NOT reusable when allocated when system is under
 762	 * some pressure. (page_is_pfmemalloc)
 763	 */
 764	if (likely(__page_pool_page_can_be_recycled(netmem))) {
 765		/* Read barrier done in page_ref_count / READ_ONCE */
 766
 767		page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
 768
 769		if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
 770			return 0;
 
 
 
 771
 772		/* Page found as candidate for recycling */
 773		return netmem;
 774	}
 775
 776	/* Fallback/non-XDP mode: API user have elevated refcnt.
 777	 *
 778	 * Many drivers split up the page into fragments, and some
 779	 * want to keep doing this to save memory and do refcnt based
 780	 * recycling. Support this use case too, to ease drivers
 781	 * switching between XDP/non-XDP.
 782	 *
 783	 * In-case page_pool maintains the DMA mapping, API user must
 784	 * call page_pool_put_page once.  In this elevated refcnt
 785	 * case, the DMA is unmapped/released, as driver is likely
 786	 * doing refcnt based recycle tricks, meaning another process
 787	 * will be invoking put_page.
 788	 */
 789	recycle_stat_inc(pool, released_refcnt);
 790	page_pool_return_page(pool, netmem);
 791
 792	return 0;
 793}
 794
 795static bool page_pool_napi_local(const struct page_pool *pool)
 796{
 797	const struct napi_struct *napi;
 798	u32 cpuid;
 799
 800	if (unlikely(!in_softirq()))
 801		return false;
 802
 803	/* Allow direct recycle if we have reasons to believe that we are
 804	 * in the same context as the consumer would run, so there's
 805	 * no possible race.
 806	 * __page_pool_put_page() makes sure we're not in hardirq context
 807	 * and interrupts are enabled prior to accessing the cache.
 808	 */
 809	cpuid = smp_processor_id();
 810	if (READ_ONCE(pool->cpuid) == cpuid)
 811		return true;
 812
 813	napi = READ_ONCE(pool->p.napi);
 814
 815	return napi && READ_ONCE(napi->list_owner) == cpuid;
 816}
 817
 818void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
 819				  unsigned int dma_sync_size, bool allow_direct)
 820{
 821	if (!allow_direct)
 822		allow_direct = page_pool_napi_local(pool);
 823
 824	netmem =
 825		__page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
 826	if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
 827		/* Cache full, fallback to free pages */
 828		recycle_stat_inc(pool, ring_full);
 829		page_pool_return_page(pool, netmem);
 830	}
 831}
 832EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
 833
 834void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
 835				unsigned int dma_sync_size, bool allow_direct)
 836{
 837	page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
 838				     allow_direct);
 839}
 840EXPORT_SYMBOL(page_pool_put_unrefed_page);
 841
 842/**
 843 * page_pool_put_page_bulk() - release references on multiple pages
 844 * @pool:	pool from which pages were allocated
 845 * @data:	array holding page pointers
 846 * @count:	number of pages in @data
 847 *
 848 * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
 849 * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
 850 * will release leftover pages to the page allocator.
 851 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
 852 * completion loop for the XDP_REDIRECT use case.
 853 *
 854 * Please note the caller must not use data area after running
 855 * page_pool_put_page_bulk(), as this function overwrites it.
 856 */
 857void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 858			     int count)
 859{
 860	int i, bulk_len = 0;
 861	bool allow_direct;
 862	bool in_softirq;
 863
 864	allow_direct = page_pool_napi_local(pool);
 865
 866	for (i = 0; i < count; i++) {
 867		netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i]));
 868
 869		/* It is not the last user for the page frag case */
 870		if (!page_pool_is_last_ref(netmem))
 871			continue;
 872
 873		netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
 874		/* Approved for bulk recycling in ptr_ring cache */
 875		if (netmem)
 876			data[bulk_len++] = (__force void *)netmem;
 877	}
 878
 879	if (!bulk_len)
 880		return;
 881
 882	/* Bulk producer into ptr_ring page_pool cache */
 883	in_softirq = page_pool_producer_lock(pool);
 884	for (i = 0; i < bulk_len; i++) {
 885		if (__ptr_ring_produce(&pool->ring, data[i])) {
 886			/* ring full */
 887			recycle_stat_inc(pool, ring_full);
 888			break;
 889		}
 890	}
 891	recycle_stat_add(pool, ring, i);
 892	page_pool_producer_unlock(pool, in_softirq);
 893
 894	/* Hopefully all pages was return into ptr_ring */
 895	if (likely(i == bulk_len))
 896		return;
 897
 898	/* ptr_ring cache full, free remaining pages outside producer lock
 899	 * since put_page() with refcnt == 1 can be an expensive operation
 900	 */
 901	for (; i < bulk_len; i++)
 902		page_pool_return_page(pool, (__force netmem_ref)data[i]);
 903}
 904EXPORT_SYMBOL(page_pool_put_page_bulk);
 905
 906static netmem_ref page_pool_drain_frag(struct page_pool *pool,
 907				       netmem_ref netmem)
 908{
 909	long drain_count = BIAS_MAX - pool->frag_users;
 910
 911	/* Some user is still using the page frag */
 912	if (likely(page_pool_unref_netmem(netmem, drain_count)))
 913		return 0;
 
 
 
 
 914
 915	if (__page_pool_page_can_be_recycled(netmem)) {
 916		page_pool_dma_sync_for_device(pool, netmem, -1);
 917		return netmem;
 918	}
 919
 920	page_pool_return_page(pool, netmem);
 921	return 0;
 922}
 923
 924static void page_pool_free_frag(struct page_pool *pool)
 925{
 926	long drain_count = BIAS_MAX - pool->frag_users;
 927	netmem_ref netmem = pool->frag_page;
 928
 929	pool->frag_page = 0;
 930
 931	if (!netmem || page_pool_unref_netmem(netmem, drain_count))
 932		return;
 933
 934	page_pool_return_page(pool, netmem);
 935}
 936
 937netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
 938				       unsigned int *offset, unsigned int size,
 939				       gfp_t gfp)
 940{
 941	unsigned int max_size = PAGE_SIZE << pool->p.order;
 942	netmem_ref netmem = pool->frag_page;
 943
 944	if (WARN_ON(size > max_size))
 945		return 0;
 946
 947	size = ALIGN(size, dma_get_cache_alignment());
 948	*offset = pool->frag_offset;
 949
 950	if (netmem && *offset + size > max_size) {
 951		netmem = page_pool_drain_frag(pool, netmem);
 952		if (netmem) {
 953			recycle_stat_inc(pool, cached);
 954			alloc_stat_inc(pool, fast);
 955			goto frag_reset;
 956		}
 957	}
 958
 959	if (!netmem) {
 960		netmem = page_pool_alloc_netmem(pool, gfp);
 961		if (unlikely(!netmem)) {
 962			pool->frag_page = 0;
 963			return 0;
 964		}
 965
 966		pool->frag_page = netmem;
 967
 968frag_reset:
 969		pool->frag_users = 1;
 970		*offset = 0;
 971		pool->frag_offset = size;
 972		page_pool_fragment_netmem(netmem, BIAS_MAX);
 973		return netmem;
 974	}
 975
 976	pool->frag_users++;
 977	pool->frag_offset = *offset + size;
 978	return netmem;
 979}
 980EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
 981
 982struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
 983				  unsigned int size, gfp_t gfp)
 984{
 985	return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
 986							  gfp));
 987}
 988EXPORT_SYMBOL(page_pool_alloc_frag);
 989
 990static void page_pool_empty_ring(struct page_pool *pool)
 991{
 992	netmem_ref netmem;
 993
 994	/* Empty recycle ring */
 995	while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
 996		/* Verify the refcnt invariant of cached pages */
 997		if (!(netmem_ref_count(netmem) == 1))
 998			pr_crit("%s() page_pool refcnt %d violation\n",
 999				__func__, netmem_ref_count(netmem));
1000
1001		page_pool_return_page(pool, netmem);
1002	}
1003}
1004
1005static void __page_pool_destroy(struct page_pool *pool)
1006{
1007	if (pool->disconnect)
1008		pool->disconnect(pool);
1009
1010	page_pool_unlist(pool);
1011	page_pool_uninit(pool);
1012
1013	if (pool->mp_priv) {
1014		mp_dmabuf_devmem_destroy(pool);
1015		static_branch_dec(&page_pool_mem_providers);
1016	}
1017
1018	kfree(pool);
1019}
1020
1021static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
1022{
1023	netmem_ref netmem;
1024
1025	if (pool->destroy_cnt)
1026		return;
1027
1028	/* Empty alloc cache, assume caller made sure this is
1029	 * no-longer in use, and page_pool_alloc_pages() cannot be
1030	 * call concurrently.
1031	 */
1032	while (pool->alloc.count) {
1033		netmem = pool->alloc.cache[--pool->alloc.count];
1034		page_pool_return_page(pool, netmem);
1035	}
1036}
1037
1038static void page_pool_scrub(struct page_pool *pool)
1039{
1040	page_pool_empty_alloc_cache_once(pool);
1041	pool->destroy_cnt++;
1042
1043	/* No more consumers should exist, but producers could still
1044	 * be in-flight.
1045	 */
1046	page_pool_empty_ring(pool);
1047}
1048
1049static int page_pool_release(struct page_pool *pool)
1050{
1051	int inflight;
1052
1053	page_pool_scrub(pool);
1054	inflight = page_pool_inflight(pool, true);
1055	if (!inflight)
1056		__page_pool_destroy(pool);
1057
1058	return inflight;
1059}
1060
1061static void page_pool_release_retry(struct work_struct *wq)
1062{
1063	struct delayed_work *dwq = to_delayed_work(wq);
1064	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
1065	void *netdev;
1066	int inflight;
1067
1068	inflight = page_pool_release(pool);
1069	if (!inflight)
1070		return;
1071
1072	/* Periodic warning for page pools the user can't see */
1073	netdev = READ_ONCE(pool->slow.netdev);
1074	if (time_after_eq(jiffies, pool->defer_warn) &&
1075	    (!netdev || netdev == NET_PTR_POISON)) {
1076		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
1077
1078		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
1079			__func__, pool->user.id, inflight, sec);
1080		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
1081	}
1082
1083	/* Still not ready to be disconnected, retry later */
1084	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1085}
1086
1087void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
1088			   const struct xdp_mem_info *mem)
1089{
1090	refcount_inc(&pool->user_cnt);
1091	pool->disconnect = disconnect;
1092	pool->xdp_mem_id = mem->id;
1093}
1094
1095void page_pool_disable_direct_recycling(struct page_pool *pool)
1096{
1097	/* Disable direct recycling based on pool->cpuid.
1098	 * Paired with READ_ONCE() in page_pool_napi_local().
1099	 */
1100	WRITE_ONCE(pool->cpuid, -1);
1101
1102	if (!pool->p.napi)
1103		return;
1104
1105	/* To avoid races with recycling and additional barriers make sure
1106	 * pool and NAPI are unlinked when NAPI is disabled.
1107	 */
1108	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
1109	WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
1110
1111	mutex_lock(&page_pools_lock);
1112	WRITE_ONCE(pool->p.napi, NULL);
1113	mutex_unlock(&page_pools_lock);
1114}
1115EXPORT_SYMBOL(page_pool_disable_direct_recycling);
1116
1117void page_pool_destroy(struct page_pool *pool)
1118{
1119	if (!pool)
1120		return;
1121
1122	if (!page_pool_put(pool))
1123		return;
1124
1125	page_pool_disable_direct_recycling(pool);
1126	page_pool_free_frag(pool);
1127
1128	if (!page_pool_release(pool))
1129		return;
1130
1131	page_pool_detached(pool);
1132	pool->defer_start = jiffies;
1133	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
1134
1135	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
1136	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1137}
1138EXPORT_SYMBOL(page_pool_destroy);
1139
1140/* Caller must provide appropriate safe context, e.g. NAPI. */
1141void page_pool_update_nid(struct page_pool *pool, int new_nid)
1142{
1143	netmem_ref netmem;
1144
1145	trace_page_pool_update_nid(pool, new_nid);
1146	pool->p.nid = new_nid;
1147
1148	/* Flush pool alloc cache, as refill will check NUMA node */
1149	while (pool->alloc.count) {
1150		netmem = pool->alloc.cache[--pool->alloc.count];
1151		page_pool_return_page(pool, netmem);
1152	}
1153}
1154EXPORT_SYMBOL(page_pool_update_nid);

  1/* SPDX-License-Identifier: GPL-2.0
  2 *
  3 * page_pool.c
  4 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
  5 *	Copyright (C) 2016 Red Hat, Inc.
  6 */
  7
 
  8#include <linux/types.h>
  9#include <linux/kernel.h>
 10#include <linux/slab.h>
 11#include <linux/device.h>
 12
 
 13#include <net/page_pool/helpers.h>
 14#include <net/xdp.h>
 15
 16#include <linux/dma-direction.h>
 17#include <linux/dma-mapping.h>
 18#include <linux/page-flags.h>
 19#include <linux/mm.h> /* for put_page() */
 20#include <linux/poison.h>
 21#include <linux/ethtool.h>
 22#include <linux/netdevice.h>
 23
 24#include <trace/events/page_pool.h>
 25
 
 
 26#include "page_pool_priv.h"
 27
 
 
 28#define DEFER_TIME (msecs_to_jiffies(1000))
 29#define DEFER_WARN_INTERVAL (60 * HZ)
 30
 31#define BIAS_MAX	(LONG_MAX >> 1)
 32
 33#ifdef CONFIG_PAGE_POOL_STATS
 
 
 34/* alloc_stat_inc is intended to be used in softirq context */
 35#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
 36/* recycle_stat_inc is safe to use when preemption is possible. */
 37#define recycle_stat_inc(pool, __stat)							\
 38	do {										\
 39		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
 40		this_cpu_inc(s->__stat);						\
 41	} while (0)
 42
 43#define recycle_stat_add(pool, __stat, val)						\
 44	do {										\
 45		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
 46		this_cpu_add(s->__stat, val);						\
 47	} while (0)
 48
 49static const char pp_stats[][ETH_GSTRING_LEN] = {
 50	"rx_pp_alloc_fast",
 51	"rx_pp_alloc_slow",
 52	"rx_pp_alloc_slow_ho",
 53	"rx_pp_alloc_empty",
 54	"rx_pp_alloc_refill",
 55	"rx_pp_alloc_waive",
 56	"rx_pp_recycle_cached",
 57	"rx_pp_recycle_cache_full",
 58	"rx_pp_recycle_ring",
 59	"rx_pp_recycle_ring_full",
 60	"rx_pp_recycle_released_ref",
 61};
 62
 63/**
 64 * page_pool_get_stats() - fetch page pool stats
 65 * @pool:	pool from which page was allocated
 66 * @stats:	struct page_pool_stats to fill in
 67 *
 68 * Retrieve statistics about the page_pool. This API is only available
 69 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 70 * A pointer to a caller allocated struct page_pool_stats structure
 71 * is passed to this API which is filled in. The caller can then report
 72 * those stats to the user (perhaps via ethtool, debugfs, etc.).
 73 */
 74bool page_pool_get_stats(const struct page_pool *pool,
 75			 struct page_pool_stats *stats)
 76{
 77	int cpu = 0;
 78
 79	if (!stats)
 80		return false;
 81
 82	/* The caller is responsible to initialize stats. */
 83	stats->alloc_stats.fast += pool->alloc_stats.fast;
 84	stats->alloc_stats.slow += pool->alloc_stats.slow;
 85	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
 86	stats->alloc_stats.empty += pool->alloc_stats.empty;
 87	stats->alloc_stats.refill += pool->alloc_stats.refill;
 88	stats->alloc_stats.waive += pool->alloc_stats.waive;
 89
 90	for_each_possible_cpu(cpu) {
 91		const struct page_pool_recycle_stats *pcpu =
 92			per_cpu_ptr(pool->recycle_stats, cpu);
 93
 94		stats->recycle_stats.cached += pcpu->cached;
 95		stats->recycle_stats.cache_full += pcpu->cache_full;
 96		stats->recycle_stats.ring += pcpu->ring;
 97		stats->recycle_stats.ring_full += pcpu->ring_full;
 98		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
 99	}
100
101	return true;
102}
103EXPORT_SYMBOL(page_pool_get_stats);
104
105u8 *page_pool_ethtool_stats_get_strings(u8 *data)
106{
107	int i;
108
109	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
110		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
111		data += ETH_GSTRING_LEN;
112	}
113
114	return data;
115}
116EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
117
118int page_pool_ethtool_stats_get_count(void)
119{
120	return ARRAY_SIZE(pp_stats);
121}
122EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
123
124u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
125{
126	struct page_pool_stats *pool_stats = stats;
127
128	*data++ = pool_stats->alloc_stats.fast;
129	*data++ = pool_stats->alloc_stats.slow;
130	*data++ = pool_stats->alloc_stats.slow_high_order;
131	*data++ = pool_stats->alloc_stats.empty;
132	*data++ = pool_stats->alloc_stats.refill;
133	*data++ = pool_stats->alloc_stats.waive;
134	*data++ = pool_stats->recycle_stats.cached;
135	*data++ = pool_stats->recycle_stats.cache_full;
136	*data++ = pool_stats->recycle_stats.ring;
137	*data++ = pool_stats->recycle_stats.ring_full;
138	*data++ = pool_stats->recycle_stats.released_refcnt;
139
140	return data;
141}
142EXPORT_SYMBOL(page_pool_ethtool_stats_get);
143
144#else
/* Stats disabled: expand to an empty statement instead of nothing, so a
 * call site is always exactly one statement (no bare ';' left behind as
 * the body of an if/else).
 */
#define alloc_stat_inc(pool, __stat)		do { } while (0)
#define recycle_stat_inc(pool, __stat)		do { } while (0)
#define recycle_stat_add(pool, __stat, val)	do { } while (0)
148#endif
149
150static bool page_pool_producer_lock(struct page_pool *pool)
151	__acquires(&pool->ring.producer_lock)
152{
153	bool in_softirq = in_softirq();
154
155	if (in_softirq)
156		spin_lock(&pool->ring.producer_lock);
157	else
158		spin_lock_bh(&pool->ring.producer_lock);
159
160	return in_softirq;
161}
162
163static void page_pool_producer_unlock(struct page_pool *pool,
164				      bool in_softirq)
165	__releases(&pool->ring.producer_lock)
166{
167	if (in_softirq)
168		spin_unlock(&pool->ring.producer_lock);
169	else
170		spin_unlock_bh(&pool->ring.producer_lock);
171}
172
 
 
 
 
 
 
 
 
 
173static int page_pool_init(struct page_pool *pool,
174			  const struct page_pool_params *params)
 
175{
176	unsigned int ring_qsize = 1024; /* Default */
 
 
 
 
177
178	memcpy(&pool->p, &params->fast, sizeof(pool->p));
179	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
180
 
 
181	/* Validate only known flags were used */
182	if (pool->p.flags & ~(PP_FLAG_ALL))
183		return -EINVAL;
184
185	if (pool->p.pool_size)
186		ring_qsize = pool->p.pool_size;
187
188	/* Sanity limit mem that can be pinned down */
189	if (ring_qsize > 32768)
190		return -E2BIG;
191
192	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
193	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
194	 * which is the XDP_TX use-case.
195	 */
196	if (pool->p.flags & PP_FLAG_DMA_MAP) {
197		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
198		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
199			return -EINVAL;
 
 
200	}
201
202	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
203		/* In order to request DMA-sync-for-device the page
204		 * needs to be mapped
205		 */
206		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
207			return -EINVAL;
208
209		if (!pool->p.max_len)
210			return -EINVAL;
211
 
 
212		/* pool->p.offset has to be set according to the address
213		 * offset used by the DMA engine to start copying rx data
214		 */
215	}
216
217	pool->has_init_callback = !!pool->slow.init_callback;
218
219#ifdef CONFIG_PAGE_POOL_STATS
220	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
221	if (!pool->recycle_stats)
222		return -ENOMEM;
 
 
 
 
 
 
 
 
 
223#endif
224
225	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
226#ifdef CONFIG_PAGE_POOL_STATS
227		free_percpu(pool->recycle_stats);
 
228#endif
229		return -ENOMEM;
230	}
231
232	atomic_set(&pool->pages_state_release_cnt, 0);
233
234	/* Driver calling page_pool_create() also call page_pool_destroy() */
235	refcount_set(&pool->user_cnt, 1);
236
237	if (pool->p.flags & PP_FLAG_DMA_MAP)
238		get_device(pool->p.dev);
239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240	return 0;
 
 
 
 
 
 
 
 
241}
242
243static void page_pool_uninit(struct page_pool *pool)
244{
245	ptr_ring_cleanup(&pool->ring, NULL);
246
247	if (pool->p.flags & PP_FLAG_DMA_MAP)
248		put_device(pool->p.dev);
249
250#ifdef CONFIG_PAGE_POOL_STATS
251	free_percpu(pool->recycle_stats);
 
252#endif
253}
254
255/**
256 * page_pool_create() - create a page pool.
257 * @params: parameters, see struct page_pool_params
 
258 */
259struct page_pool *page_pool_create(const struct page_pool_params *params)
 
260{
261	struct page_pool *pool;
262	int err;
263
264	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
265	if (!pool)
266		return ERR_PTR(-ENOMEM);
267
268	err = page_pool_init(pool, params);
269	if (err < 0)
270		goto err_free;
271
272	err = page_pool_list(pool);
273	if (err)
274		goto err_uninit;
275
276	return pool;
277
278err_uninit:
279	page_pool_uninit(pool);
280err_free:
281	pr_warn("%s() gave up with errno %d\n", __func__, err);
282	kfree(pool);
283	return ERR_PTR(err);
284}
 
 
 
 
 
 
 
 
 
 
285EXPORT_SYMBOL(page_pool_create);
286
287static void page_pool_return_page(struct page_pool *pool, struct page *page);
288
289noinline
290static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
291{
292	struct ptr_ring *r = &pool->ring;
293	struct page *page;
294	int pref_nid; /* preferred NUMA node */
295
296	/* Quicker fallback, avoid locks when ring is empty */
297	if (__ptr_ring_empty(r)) {
298		alloc_stat_inc(pool, empty);
299		return NULL;
300	}
301
302	/* Softirq guarantee CPU and thus NUMA node is stable. This,
303	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
304	 */
305#ifdef CONFIG_NUMA
306	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
307#else
308	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
309	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
310#endif
311
312	/* Refill alloc array, but only if NUMA match */
313	do {
314		page = __ptr_ring_consume(r);
315		if (unlikely(!page))
316			break;
317
318		if (likely(page_to_nid(page) == pref_nid)) {
319			pool->alloc.cache[pool->alloc.count++] = page;
320		} else {
321			/* NUMA mismatch;
322			 * (1) release 1 page to page-allocator and
323			 * (2) break out to fallthrough to alloc_pages_node.
324			 * This limit stress on page buddy alloactor.
325			 */
326			page_pool_return_page(pool, page);
327			alloc_stat_inc(pool, waive);
328			page = NULL;
329			break;
330		}
331	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
332
333	/* Return last page */
334	if (likely(pool->alloc.count > 0)) {
335		page = pool->alloc.cache[--pool->alloc.count];
336		alloc_stat_inc(pool, refill);
337	}
338
339	return page;
340}
341
342/* fast path */
343static struct page *__page_pool_get_cached(struct page_pool *pool)
344{
345	struct page *page;
346
347	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
348	if (likely(pool->alloc.count)) {
349		/* Fast-path */
350		page = pool->alloc.cache[--pool->alloc.count];
351		alloc_stat_inc(pool, fast);
352	} else {
353		page = page_pool_refill_alloc_cache(pool);
354	}
355
356	return page;
357}
358
359static void page_pool_dma_sync_for_device(struct page_pool *pool,
360					  struct page *page,
361					  unsigned int dma_sync_size)
362{
363	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
 
364
365	dma_sync_size = min(dma_sync_size, pool->p.max_len);
366	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
367					 pool->p.offset, dma_sync_size,
368					 pool->p.dma_dir);
369}
370
371static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 
 
 
 
 
 
 
 
 
372{
373	dma_addr_t dma;
374
375	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
376	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
377	 * into page private data (i.e 32bit cpu with 64bit DMA caps)
378	 * This mapping is kept for lifetime of page, until leaving pool.
379	 */
380	dma = dma_map_page_attrs(pool->p.dev, page, 0,
381				 (PAGE_SIZE << pool->p.order),
382				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
383						  DMA_ATTR_WEAK_ORDERING);
384	if (dma_mapping_error(pool->p.dev, dma))
385		return false;
386
387	if (page_pool_set_dma_addr(page, dma))
388		goto unmap_failed;
389
390	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
391		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
392
393	return true;
394
395unmap_failed:
396	WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
397	dma_unmap_page_attrs(pool->p.dev, dma,
398			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
399			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
400	return false;
401}
402
403static void page_pool_set_pp_info(struct page_pool *pool,
404				  struct page *page)
405{
406	page->pp = pool;
407	page->pp_magic |= PP_SIGNATURE;
408
409	/* Ensuring all pages have been split into one fragment initially:
410	 * page_pool_set_pp_info() is only called once for every page when it
411	 * is allocated from the page allocator and page_pool_fragment_page()
412	 * is dirtying the same cache line as the page->pp_magic above, so
413	 * the overhead is negligible.
414	 */
415	page_pool_fragment_page(page, 1);
416	if (pool->has_init_callback)
417		pool->slow.init_callback(page, pool->slow.init_arg);
418}
419
420static void page_pool_clear_pp_info(struct page *page)
421{
422	page->pp_magic = 0;
423	page->pp = NULL;
424}
425
426static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
427						 gfp_t gfp)
428{
429	struct page *page;
430
431	gfp |= __GFP_COMP;
432	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
433	if (unlikely(!page))
434		return NULL;
435
436	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
437	    unlikely(!page_pool_dma_map(pool, page))) {
438		put_page(page);
439		return NULL;
440	}
441
442	alloc_stat_inc(pool, slow_high_order);
443	page_pool_set_pp_info(pool, page);
444
445	/* Track how many pages are held 'in-flight' */
446	pool->pages_state_hold_cnt++;
447	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
 
448	return page;
449}
450
451/* slow path */
452noinline
453static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
454						 gfp_t gfp)
455{
456	const int bulk = PP_ALLOC_CACHE_REFILL;
457	unsigned int pp_flags = pool->p.flags;
458	unsigned int pp_order = pool->p.order;
459	struct page *page;
 
460	int i, nr_pages;
461
462	/* Don't support bulk alloc for high-order pages */
463	if (unlikely(pp_order))
464		return __page_pool_alloc_page_order(pool, gfp);
465
466	/* Unnecessary as alloc cache is empty, but guarantees zero count */
467	if (unlikely(pool->alloc.count > 0))
468		return pool->alloc.cache[--pool->alloc.count];
469
470	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
471	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
472
473	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
474					       pool->alloc.cache);
 
475	if (unlikely(!nr_pages))
476		return NULL;
477
478	/* Pages have been filled into alloc.cache array, but count is zero and
479	 * page element have not been (possibly) DMA mapped.
480	 */
481	for (i = 0; i < nr_pages; i++) {
482		page = pool->alloc.cache[i];
483		if ((pp_flags & PP_FLAG_DMA_MAP) &&
484		    unlikely(!page_pool_dma_map(pool, page))) {
485			put_page(page);
486			continue;
487		}
488
489		page_pool_set_pp_info(pool, page);
490		pool->alloc.cache[pool->alloc.count++] = page;
491		/* Track how many pages are held 'in-flight' */
492		pool->pages_state_hold_cnt++;
493		trace_page_pool_state_hold(pool, page,
494					   pool->pages_state_hold_cnt);
495	}
496
497	/* Return last page */
498	if (likely(pool->alloc.count > 0)) {
499		page = pool->alloc.cache[--pool->alloc.count];
500		alloc_stat_inc(pool, slow);
501	} else {
502		page = NULL;
503	}
504
505	/* When page just alloc'ed is should/must have refcnt 1. */
506	return page;
507}
508
509/* For using page_pool replace: alloc_pages() API calls, but provide
510 * synchronization guarantee for allocation side.
511 */
512struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
513{
514	struct page *page;
515
516	/* Fast-path: Get a page from cache */
517	page = __page_pool_get_cached(pool);
518	if (page)
519		return page;
520
521	/* Slow-path: cache empty, do real allocation */
522	page = __page_pool_alloc_pages_slow(pool, gfp);
523	return page;
 
 
 
 
 
 
 
 
 
524}
525EXPORT_SYMBOL(page_pool_alloc_pages);
 
526
527/* Calculate distance between two u32 values, valid if distance is below 2^(31)
528 *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
529 */
530#define _distance(a, b)	(s32)((a) - (b))
531
532s32 page_pool_inflight(const struct page_pool *pool, bool strict)
533{
534	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
535	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
536	s32 inflight;
537
538	inflight = _distance(hold_cnt, release_cnt);
539
540	if (strict) {
541		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
542		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
543		     inflight);
544	} else {
545		inflight = max(0, inflight);
546	}
547
548	return inflight;
549}
550
551static __always_inline
552void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553{
554	dma_addr_t dma;
555
556	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
557		/* Always account for inflight pages, even if we didn't
558		 * map them
559		 */
560		return;
561
562	dma = page_pool_get_dma_addr(page);
563
564	/* When page is unmapped, it cannot be returned to our pool */
565	dma_unmap_page_attrs(pool->p.dev, dma,
566			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
567			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
568	page_pool_set_dma_addr(page, 0);
569}
570
571/* Disconnects a page (from a page_pool).  API users can have a need
572 * to disconnect a page (from a page_pool), to allow it to be used as
573 * a regular page (that will eventually be returned to the normal
574 * page-allocator via put_page).
575 */
576void page_pool_return_page(struct page_pool *pool, struct page *page)
577{
578	int count;
 
579
580	__page_pool_release_page_dma(pool, page);
581
582	page_pool_clear_pp_info(page);
 
 
583
584	/* This may be the last page returned, releasing the pool, so
585	 * it is not safe to reference pool afterwards.
586	 */
587	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
588	trace_page_pool_state_release(pool, page, count);
589
590	put_page(page);
 
 
 
591	/* An optimization would be to call __free_pages(page, pool->p.order)
592	 * knowing page is not part of page-cache (thus avoiding a
593	 * __page_cache_release() call).
594	 */
595}
596
597static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
598{
599	int ret;
600	/* BH protection not needed if current is softirq */
601	if (in_softirq())
602		ret = ptr_ring_produce(&pool->ring, page);
603	else
604		ret = ptr_ring_produce_bh(&pool->ring, page);
605
606	if (!ret) {
607		recycle_stat_inc(pool, ring);
608		return true;
609	}
610
611	return false;
612}
613
614/* Only allow direct recycling in special circumstances, into the
615 * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
616 *
617 * Caller must provide appropriate safe context.
618 */
619static bool page_pool_recycle_in_cache(struct page *page,
620				       struct page_pool *pool)
621{
622	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
623		recycle_stat_inc(pool, cache_full);
624		return false;
625	}
626
627	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
628	pool->alloc.cache[pool->alloc.count++] = page;
629	recycle_stat_inc(pool, cached);
630	return true;
631}
632
 
 
 
 
 
 
 
633/* If the page refcnt == 1, this will try to recycle the page.
634 * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
635 * the configured size min(dma_sync_size, pool->max_len).
636 * If the page refcnt != 1, then the page will be returned to memory
637 * subsystem.
638 */
639static __always_inline struct page *
640__page_pool_put_page(struct page_pool *pool, struct page *page,
641		     unsigned int dma_sync_size, bool allow_direct)
642{
643	lockdep_assert_no_hardirq();
644
645	/* This allocator is optimized for the XDP mode that uses
646	 * one-frame-per-page, but have fallbacks that act like the
647	 * regular page allocator APIs.
648	 *
649	 * refcnt == 1 means page_pool owns page, and can recycle it.
650	 *
651	 * page is NOT reusable when allocated when system is under
652	 * some pressure. (page_is_pfmemalloc)
653	 */
654	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
655		/* Read barrier done in page_ref_count / READ_ONCE */
656
657		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
658			page_pool_dma_sync_for_device(pool, page,
659						      dma_sync_size);
660
661		if (allow_direct && in_softirq() &&
662		    page_pool_recycle_in_cache(page, pool))
663			return NULL;
664
665		/* Page found as candidate for recycling */
666		return page;
667	}
 
668	/* Fallback/non-XDP mode: API user have elevated refcnt.
669	 *
670	 * Many drivers split up the page into fragments, and some
671	 * want to keep doing this to save memory and do refcnt based
672	 * recycling. Support this use case too, to ease drivers
673	 * switching between XDP/non-XDP.
674	 *
675	 * In-case page_pool maintains the DMA mapping, API user must
676	 * call page_pool_put_page once.  In this elevated refcnt
677	 * case, the DMA is unmapped/released, as driver is likely
678	 * doing refcnt based recycle tricks, meaning another process
679	 * will be invoking put_page.
680	 */
681	recycle_stat_inc(pool, released_refcnt);
682	page_pool_return_page(pool, page);
683
684	return NULL;
685}
686
687void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
688				unsigned int dma_sync_size, bool allow_direct)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689{
690	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
691	if (page && !page_pool_recycle_in_ring(pool, page)) {
 
 
 
 
692		/* Cache full, fallback to free pages */
693		recycle_stat_inc(pool, ring_full);
694		page_pool_return_page(pool, page);
695	}
696}
 
 
 
 
 
 
 
 
697EXPORT_SYMBOL(page_pool_put_unrefed_page);
698
699/**
700 * page_pool_put_page_bulk() - release references on multiple pages
701 * @pool:	pool from which pages were allocated
702 * @data:	array holding page pointers
703 * @count:	number of pages in @data
704 *
705 * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
706 * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
707 * will release leftover pages to the page allocator.
708 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
709 * completion loop for the XDP_REDIRECT use case.
710 *
711 * Please note the caller must not use data area after running
712 * page_pool_put_page_bulk(), as this function overwrites it.
713 */
714void page_pool_put_page_bulk(struct page_pool *pool, void **data,
715			     int count)
716{
717	int i, bulk_len = 0;
 
718	bool in_softirq;
719
 
 
720	for (i = 0; i < count; i++) {
721		struct page *page = virt_to_head_page(data[i]);
722
723		/* It is not the last user for the page frag case */
724		if (!page_pool_is_last_ref(page))
725			continue;
726
727		page = __page_pool_put_page(pool, page, -1, false);
728		/* Approved for bulk recycling in ptr_ring cache */
729		if (page)
730			data[bulk_len++] = page;
731	}
732
733	if (unlikely(!bulk_len))
734		return;
735
736	/* Bulk producer into ptr_ring page_pool cache */
737	in_softirq = page_pool_producer_lock(pool);
738	for (i = 0; i < bulk_len; i++) {
739		if (__ptr_ring_produce(&pool->ring, data[i])) {
740			/* ring full */
741			recycle_stat_inc(pool, ring_full);
742			break;
743		}
744	}
745	recycle_stat_add(pool, ring, i);
746	page_pool_producer_unlock(pool, in_softirq);
747
748	/* Hopefully all pages was return into ptr_ring */
749	if (likely(i == bulk_len))
750		return;
751
752	/* ptr_ring cache full, free remaining pages outside producer lock
753	 * since put_page() with refcnt == 1 can be an expensive operation
754	 */
755	for (; i < bulk_len; i++)
756		page_pool_return_page(pool, data[i]);
757}
758EXPORT_SYMBOL(page_pool_put_page_bulk);
759
760static struct page *page_pool_drain_frag(struct page_pool *pool,
761					 struct page *page)
762{
763	long drain_count = BIAS_MAX - pool->frag_users;
764
765	/* Some user is still using the page frag */
766	if (likely(page_pool_unref_page(page, drain_count)))
767		return NULL;
768
769	if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
770		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
771			page_pool_dma_sync_for_device(pool, page, -1);
772
773		return page;
 
 
774	}
775
776	page_pool_return_page(pool, page);
777	return NULL;
778}
779
780static void page_pool_free_frag(struct page_pool *pool)
781{
782	long drain_count = BIAS_MAX - pool->frag_users;
783	struct page *page = pool->frag_page;
784
785	pool->frag_page = NULL;
786
787	if (!page || page_pool_unref_page(page, drain_count))
788		return;
789
790	page_pool_return_page(pool, page);
791}
792
793struct page *page_pool_alloc_frag(struct page_pool *pool,
794				  unsigned int *offset,
795				  unsigned int size, gfp_t gfp)
796{
797	unsigned int max_size = PAGE_SIZE << pool->p.order;
798	struct page *page = pool->frag_page;
799
800	if (WARN_ON(size > max_size))
801		return NULL;
802
803	size = ALIGN(size, dma_get_cache_alignment());
804	*offset = pool->frag_offset;
805
806	if (page && *offset + size > max_size) {
807		page = page_pool_drain_frag(pool, page);
808		if (page) {
 
809			alloc_stat_inc(pool, fast);
810			goto frag_reset;
811		}
812	}
813
814	if (!page) {
815		page = page_pool_alloc_pages(pool, gfp);
816		if (unlikely(!page)) {
817			pool->frag_page = NULL;
818			return NULL;
819		}
820
821		pool->frag_page = page;
822
823frag_reset:
824		pool->frag_users = 1;
825		*offset = 0;
826		pool->frag_offset = size;
827		page_pool_fragment_page(page, BIAS_MAX);
828		return page;
829	}
830
831	pool->frag_users++;
832	pool->frag_offset = *offset + size;
833	alloc_stat_inc(pool, fast);
834	return page;
 
 
 
 
 
 
 
835}
836EXPORT_SYMBOL(page_pool_alloc_frag);
837
838static void page_pool_empty_ring(struct page_pool *pool)
839{
840	struct page *page;
841
842	/* Empty recycle ring */
843	while ((page = ptr_ring_consume_bh(&pool->ring))) {
844		/* Verify the refcnt invariant of cached pages */
845		if (!(page_ref_count(page) == 1))
846			pr_crit("%s() page_pool refcnt %d violation\n",
847				__func__, page_ref_count(page));
848
849		page_pool_return_page(pool, page);
850	}
851}
852
853static void __page_pool_destroy(struct page_pool *pool)
854{
855	if (pool->disconnect)
856		pool->disconnect(pool);
857
858	page_pool_unlist(pool);
859	page_pool_uninit(pool);
 
 
 
 
 
 
860	kfree(pool);
861}
862
863static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
864{
865	struct page *page;
866
867	if (pool->destroy_cnt)
868		return;
869
870	/* Empty alloc cache, assume caller made sure this is
871	 * no-longer in use, and page_pool_alloc_pages() cannot be
872	 * call concurrently.
873	 */
874	while (pool->alloc.count) {
875		page = pool->alloc.cache[--pool->alloc.count];
876		page_pool_return_page(pool, page);
877	}
878}
879
880static void page_pool_scrub(struct page_pool *pool)
881{
882	page_pool_empty_alloc_cache_once(pool);
883	pool->destroy_cnt++;
884
885	/* No more consumers should exist, but producers could still
886	 * be in-flight.
887	 */
888	page_pool_empty_ring(pool);
889}
890
891static int page_pool_release(struct page_pool *pool)
892{
893	int inflight;
894
895	page_pool_scrub(pool);
896	inflight = page_pool_inflight(pool, true);
897	if (!inflight)
898		__page_pool_destroy(pool);
899
900	return inflight;
901}
902
903static void page_pool_release_retry(struct work_struct *wq)
904{
905	struct delayed_work *dwq = to_delayed_work(wq);
906	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
907	void *netdev;
908	int inflight;
909
910	inflight = page_pool_release(pool);
911	if (!inflight)
912		return;
913
914	/* Periodic warning for page pools the user can't see */
915	netdev = READ_ONCE(pool->slow.netdev);
916	if (time_after_eq(jiffies, pool->defer_warn) &&
917	    (!netdev || netdev == NET_PTR_POISON)) {
918		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
919
920		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
921			__func__, pool->user.id, inflight, sec);
922		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
923	}
924
925	/* Still not ready to be disconnected, retry later */
926	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
927}
928
929void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
930			   struct xdp_mem_info *mem)
931{
932	refcount_inc(&pool->user_cnt);
933	pool->disconnect = disconnect;
934	pool->xdp_mem_id = mem->id;
935}
936
937void page_pool_unlink_napi(struct page_pool *pool)
938{
 
 
 
 
 
939	if (!pool->p.napi)
940		return;
941
942	/* To avoid races with recycling and additional barriers make sure
943	 * pool and NAPI are unlinked when NAPI is disabled.
944	 */
945	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
946		READ_ONCE(pool->p.napi->list_owner) != -1);
947
 
948	WRITE_ONCE(pool->p.napi, NULL);
 
949}
950EXPORT_SYMBOL(page_pool_unlink_napi);
951
952void page_pool_destroy(struct page_pool *pool)
953{
954	if (!pool)
955		return;
956
957	if (!page_pool_put(pool))
958		return;
959
960	page_pool_unlink_napi(pool);
961	page_pool_free_frag(pool);
962
963	if (!page_pool_release(pool))
964		return;
965
966	page_pool_detached(pool);
967	pool->defer_start = jiffies;
968	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
969
970	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
971	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
972}
973EXPORT_SYMBOL(page_pool_destroy);
974
975/* Caller must provide appropriate safe context, e.g. NAPI. */
976void page_pool_update_nid(struct page_pool *pool, int new_nid)
977{
978	struct page *page;
979
980	trace_page_pool_update_nid(pool, new_nid);
981	pool->p.nid = new_nid;
982
983	/* Flush pool alloc cache, as refill will check NUMA node */
984	while (pool->alloc.count) {
985		page = pool->alloc.cache[--pool->alloc.count];
986		page_pool_return_page(pool, page);
987	}
988}
989EXPORT_SYMBOL(page_pool_update_nid);