dev.c - net/core/dev.c - Linux diff v4.6 - Bootlin Elixir Cross Referencer

   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <net/busy_poll.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/stat.h>
 102#include <net/dst.h>
 103#include <net/dst_metadata.h>
 104#include <net/pkt_sched.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/module.h>
 110#include <linux/netpoll.h>
 111#include <linux/rcupdate.h>
 112#include <linux/delay.h>
 113#include <net/iw_handler.h>
 114#include <asm/current.h>
 115#include <linux/audit.h>
 116#include <linux/dmaengine.h>
 117#include <linux/err.h>
 118#include <linux/ctype.h>
 119#include <linux/if_arp.h>
 120#include <linux/if_vlan.h>
 121#include <linux/ip.h>
 122#include <net/ip.h>
 123#include <net/mpls.h>
 124#include <linux/ipv6.h>
 125#include <linux/in.h>
 126#include <linux/jhash.h>
 127#include <linux/random.h>
 128#include <trace/events/napi.h>
 129#include <trace/events/net.h>
 130#include <trace/events/skb.h>
 131#include <linux/pci.h>
 132#include <linux/inetdevice.h>
 133#include <linux/cpu_rmap.h>
 134#include <linux/static_key.h>
 135#include <linux/hashtable.h>
 136#include <linux/vmalloc.h>
 137#include <linux/if_macvlan.h>
 138#include <linux/errqueue.h>
 139#include <linux/hrtimer.h>
 140#include <linux/netfilter_ingress.h>
 141#include <linux/sctp.h>
 142
 143#include "net-sysfs.h"
 144
 145/* Instead of increasing this, you should create a hash table. */
 146#define MAX_GRO_SKBS 8
 147
 148/* This should be increased if a protocol with a bigger head is added. */
 149#define GRO_MAX_HEAD (MAX_HEADER + 128)
 150
 151static DEFINE_SPINLOCK(ptype_lock);
 152static DEFINE_SPINLOCK(offload_lock);
 153struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 154struct list_head ptype_all __read_mostly;	/* Taps */
 155static struct list_head offload_base __read_mostly;
 156
 157static int netif_rx_internal(struct sk_buff *skb);
 158static int call_netdevice_notifiers_info(unsigned long val,
 159					 struct net_device *dev,
 160					 struct netdev_notifier_info *info);
 161
 162/*
 163 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 164 * semaphore.
 165 *
 166 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 167 *
 168 * Writers must hold the rtnl semaphore while they loop through the
 169 * dev_base_head list, and hold dev_base_lock for writing when they do the
 170 * actual updates.  This allows pure readers to access the list even
 171 * while a writer is preparing to update it.
 172 *
 173 * To put it another way, dev_base_lock is held for writing only to
 174 * protect against pure readers; the rtnl semaphore provides the
 175 * protection against other writers.
 176 *
 177 * See, for example usages, register_netdevice() and
 178 * unregister_netdevice(), which must be called with the rtnl
 179 * semaphore held.
 180 */
 181DEFINE_RWLOCK(dev_base_lock);
 182EXPORT_SYMBOL(dev_base_lock);
 183
 184/* protects napi_hash addition/deletion and napi_gen_id */
 185static DEFINE_SPINLOCK(napi_hash_lock);
 186
 187static unsigned int napi_gen_id = NR_CPUS;
 188static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 189
 190static seqcount_t devnet_rename_seq;
 191
 192static inline void dev_base_seq_inc(struct net *net)
 193{
 194	while (++net->dev_base_seq == 0);
 195}
 196
 197static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 198{
 199	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 200
 201	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 202}
 203
 204static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 205{
 206	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 207}
 208
 209static inline void rps_lock(struct softnet_data *sd)
 210{
 211#ifdef CONFIG_RPS
 212	spin_lock(&sd->input_pkt_queue.lock);
 213#endif
 214}
 215
 216static inline void rps_unlock(struct softnet_data *sd)
 217{
 218#ifdef CONFIG_RPS
 219	spin_unlock(&sd->input_pkt_queue.lock);
 220#endif
 221}
 222
 223/* Device list insertion */
 224static void list_netdevice(struct net_device *dev)
 225{
 226	struct net *net = dev_net(dev);
 227
 228	ASSERT_RTNL();
 229
 230	write_lock_bh(&dev_base_lock);
 231	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 232	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 233	hlist_add_head_rcu(&dev->index_hlist,
 234			   dev_index_hash(net, dev->ifindex));
 235	write_unlock_bh(&dev_base_lock);
 236
 237	dev_base_seq_inc(net);
 238}
 239
 240/* Device list removal
 241 * caller must respect a RCU grace period before freeing/reusing dev
 242 */
 243static void unlist_netdevice(struct net_device *dev)
 244{
 245	ASSERT_RTNL();
 246
 247	/* Unlink dev from the device chain */
 248	write_lock_bh(&dev_base_lock);
 249	list_del_rcu(&dev->dev_list);
 250	hlist_del_rcu(&dev->name_hlist);
 251	hlist_del_rcu(&dev->index_hlist);
 252	write_unlock_bh(&dev_base_lock);
 253
 254	dev_base_seq_inc(dev_net(dev));
 255}
 256
 257/*
 258 *	Our notifier list
 259 */
 260
 261static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263/*
 264 *	Device drivers call our routines to queue packets here. We empty the
 265 *	queue in the local softnet handler.
 266 */
 267
 268DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271#ifdef CONFIG_LOCKDEP
 272/*
 273 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274 * according to dev->type
 275 */
 276static const unsigned short netdev_lock_type[] =
 277	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 278	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 279	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 280	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 281	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 282	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 283	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 284	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 285	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 286	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 287	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 288	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 289	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 290	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 291	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 292
 293static const char *const netdev_lock_name[] =
 294	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 295	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 296	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 297	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 298	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 299	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 300	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 301	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 302	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 303	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 304	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 305	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 306	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 307	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 308	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 309
 310static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 311static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312
 313static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 314{
 315	int i;
 316
 317	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 318		if (netdev_lock_type[i] == dev_type)
 319			return i;
 320	/* the last key is used by default */
 321	return ARRAY_SIZE(netdev_lock_type) - 1;
 322}
 323
 324static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 325						 unsigned short dev_type)
 326{
 327	int i;
 328
 329	i = netdev_lock_pos(dev_type);
 330	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 331				   netdev_lock_name[i]);
 332}
 333
 334static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335{
 336	int i;
 337
 338	i = netdev_lock_pos(dev->type);
 339	lockdep_set_class_and_name(&dev->addr_list_lock,
 340				   &netdev_addr_lock_key[i],
 341				   netdev_lock_name[i]);
 342}
 343#else
 344static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 345						 unsigned short dev_type)
 346{
 347}
 348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349{
 350}
 351#endif
 352
 353/*******************************************************************************
 354
 355		Protocol management and registration routines
 356
 357*******************************************************************************/
 358
 359/*
 360 *	Add a protocol ID to the list. Now that the input handler is
 361 *	smarter we can dispense with all the messy stuff that used to be
 362 *	here.
 363 *
 364 *	BEWARE!!! Protocol handlers, mangling input packets,
 365 *	MUST BE last in hash buckets and checking protocol handlers
 366 *	MUST start from promiscuous ptype_all chain in net_bh.
 367 *	It is true now, do not change it.
 368 *	Explanation follows: if protocol handler, mangling packet, will
 369 *	be the first on list, it is not able to sense, that packet
 370 *	is cloned and should be copied-on-write, so that it will
 371 *	change it and subsequent readers will get broken packet.
 372 *							--ANK (980803)
 373 */
 374
 375static inline struct list_head *ptype_head(const struct packet_type *pt)
 376{
 377	if (pt->type == htons(ETH_P_ALL))
 378		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 379	else
 380		return pt->dev ? &pt->dev->ptype_specific :
 381				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 382}
 383
 384/**
 385 *	dev_add_pack - add packet handler
 386 *	@pt: packet type declaration
 387 *
 388 *	Add a protocol handler to the networking stack. The passed &packet_type
 389 *	is linked into kernel lists and may not be freed until it has been
 390 *	removed from the kernel lists.
 391 *
 392 *	This call does not sleep therefore it can not
 393 *	guarantee all CPU's that are in middle of receiving packets
 394 *	will see the new packet type (until the next received packet).
 395 */
 396
 397void dev_add_pack(struct packet_type *pt)
 398{
 399	struct list_head *head = ptype_head(pt);
 400
 401	spin_lock(&ptype_lock);
 402	list_add_rcu(&pt->list, head);
 403	spin_unlock(&ptype_lock);
 404}
 405EXPORT_SYMBOL(dev_add_pack);
 406
 407/**
 408 *	__dev_remove_pack	 - remove packet handler
 409 *	@pt: packet type declaration
 410 *
 411 *	Remove a protocol handler that was previously added to the kernel
 412 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 413 *	from the kernel lists and can be freed or reused once this function
 414 *	returns.
 415 *
 416 *      The packet type might still be in use by receivers
 417 *	and must not be freed until after all the CPU's have gone
 418 *	through a quiescent state.
 419 */
 420void __dev_remove_pack(struct packet_type *pt)
 421{
 422	struct list_head *head = ptype_head(pt);
 423	struct packet_type *pt1;
 424
 425	spin_lock(&ptype_lock);
 426
 427	list_for_each_entry(pt1, head, list) {
 428		if (pt == pt1) {
 429			list_del_rcu(&pt->list);
 430			goto out;
 431		}
 432	}
 433
 434	pr_warn("dev_remove_pack: %p not found\n", pt);
 435out:
 436	spin_unlock(&ptype_lock);
 437}
 438EXPORT_SYMBOL(__dev_remove_pack);
 439
 440/**
 441 *	dev_remove_pack	 - remove packet handler
 442 *	@pt: packet type declaration
 443 *
 444 *	Remove a protocol handler that was previously added to the kernel
 445 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 446 *	from the kernel lists and can be freed or reused once this function
 447 *	returns.
 448 *
 449 *	This call sleeps to guarantee that no CPU is looking at the packet
 450 *	type after return.
 451 */
 452void dev_remove_pack(struct packet_type *pt)
 453{
 454	__dev_remove_pack(pt);
 455
 456	synchronize_net();
 457}
 458EXPORT_SYMBOL(dev_remove_pack);
 459
 460
 461/**
 462 *	dev_add_offload - register offload handlers
 463 *	@po: protocol offload declaration
 464 *
 465 *	Add protocol offload handlers to the networking stack. The passed
 466 *	&proto_offload is linked into kernel lists and may not be freed until
 467 *	it has been removed from the kernel lists.
 468 *
 469 *	This call does not sleep therefore it can not
 470 *	guarantee all CPU's that are in middle of receiving packets
 471 *	will see the new offload handlers (until the next received packet).
 472 */
 473void dev_add_offload(struct packet_offload *po)
 474{
 475	struct packet_offload *elem;
 476
 477	spin_lock(&offload_lock);
 478	list_for_each_entry(elem, &offload_base, list) {
 479		if (po->priority < elem->priority)
 480			break;
 481	}
 482	list_add_rcu(&po->list, elem->list.prev);
 483	spin_unlock(&offload_lock);
 484}
 485EXPORT_SYMBOL(dev_add_offload);
 486
 487/**
 488 *	__dev_remove_offload	 - remove offload handler
 489 *	@po: packet offload declaration
 490 *
 491 *	Remove a protocol offload handler that was previously added to the
 492 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 493 *	is removed from the kernel lists and can be freed or reused once this
 494 *	function returns.
 495 *
 496 *      The packet type might still be in use by receivers
 497 *	and must not be freed until after all the CPU's have gone
 498 *	through a quiescent state.
 499 */
 500static void __dev_remove_offload(struct packet_offload *po)
 501{
 502	struct list_head *head = &offload_base;
 503	struct packet_offload *po1;
 504
 505	spin_lock(&offload_lock);
 506
 507	list_for_each_entry(po1, head, list) {
 508		if (po == po1) {
 509			list_del_rcu(&po->list);
 510			goto out;
 511		}
 512	}
 513
 514	pr_warn("dev_remove_offload: %p not found\n", po);
 515out:
 516	spin_unlock(&offload_lock);
 517}
 518
 519/**
 520 *	dev_remove_offload	 - remove packet offload handler
 521 *	@po: packet offload declaration
 522 *
 523 *	Remove a packet offload handler that was previously added to the kernel
 524 *	offload handlers by dev_add_offload(). The passed &offload_type is
 525 *	removed from the kernel lists and can be freed or reused once this
 526 *	function returns.
 527 *
 528 *	This call sleeps to guarantee that no CPU is looking at the packet
 529 *	type after return.
 530 */
 531void dev_remove_offload(struct packet_offload *po)
 532{
 533	__dev_remove_offload(po);
 534
 535	synchronize_net();
 536}
 537EXPORT_SYMBOL(dev_remove_offload);
 538
 539/******************************************************************************
 540
 541		      Device Boot-time Settings Routines
 542
 543*******************************************************************************/
 544
 545/* Boot time configuration table */
 546static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 547
 548/**
 549 *	netdev_boot_setup_add	- add new setup entry
 550 *	@name: name of the device
 551 *	@map: configured settings for the device
 552 *
 553 *	Adds new setup entry to the dev_boot_setup list.  The function
 554 *	returns 0 on error and 1 on success.  This is a generic routine to
 555 *	all netdevices.
 556 */
 557static int netdev_boot_setup_add(char *name, struct ifmap *map)
 558{
 559	struct netdev_boot_setup *s;
 560	int i;
 561
 562	s = dev_boot_setup;
 563	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 564		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 565			memset(s[i].name, 0, sizeof(s[i].name));
 566			strlcpy(s[i].name, name, IFNAMSIZ);
 567			memcpy(&s[i].map, map, sizeof(s[i].map));
 568			break;
 569		}
 570	}
 571
 572	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 573}
 574
 575/**
 576 *	netdev_boot_setup_check	- check boot time settings
 577 *	@dev: the netdevice
 578 *
 579 * 	Check boot time settings for the device.
 580 *	The found settings are set for the device to be used
 581 *	later in the device probing.
 582 *	Returns 0 if no settings found, 1 if they are.
 583 */
 584int netdev_boot_setup_check(struct net_device *dev)
 585{
 586	struct netdev_boot_setup *s = dev_boot_setup;
 587	int i;
 588
 589	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 590		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 591		    !strcmp(dev->name, s[i].name)) {
 592			dev->irq 	= s[i].map.irq;
 593			dev->base_addr 	= s[i].map.base_addr;
 594			dev->mem_start 	= s[i].map.mem_start;
 595			dev->mem_end 	= s[i].map.mem_end;
 596			return 1;
 597		}
 598	}
 599	return 0;
 600}
 601EXPORT_SYMBOL(netdev_boot_setup_check);
 602
 603
 604/**
 605 *	netdev_boot_base	- get address from boot time settings
 606 *	@prefix: prefix for network device
 607 *	@unit: id for network device
 608 *
 609 * 	Check boot time settings for the base address of device.
 610 *	The found settings are set for the device to be used
 611 *	later in the device probing.
 612 *	Returns 0 if no settings found.
 613 */
 614unsigned long netdev_boot_base(const char *prefix, int unit)
 615{
 616	const struct netdev_boot_setup *s = dev_boot_setup;
 617	char name[IFNAMSIZ];
 618	int i;
 619
 620	sprintf(name, "%s%d", prefix, unit);
 621
 622	/*
 623	 * If device already registered then return base of 1
 624	 * to indicate not to probe for this interface
 625	 */
 626	if (__dev_get_by_name(&init_net, name))
 627		return 1;
 628
 629	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 630		if (!strcmp(name, s[i].name))
 631			return s[i].map.base_addr;
 632	return 0;
 633}
 634
 635/*
 636 * Saves at boot time configured settings for any netdevice.
 637 */
 638int __init netdev_boot_setup(char *str)
 639{
 640	int ints[5];
 641	struct ifmap map;
 642
 643	str = get_options(str, ARRAY_SIZE(ints), ints);
 644	if (!str || !*str)
 645		return 0;
 646
 647	/* Save settings */
 648	memset(&map, 0, sizeof(map));
 649	if (ints[0] > 0)
 650		map.irq = ints[1];
 651	if (ints[0] > 1)
 652		map.base_addr = ints[2];
 653	if (ints[0] > 2)
 654		map.mem_start = ints[3];
 655	if (ints[0] > 3)
 656		map.mem_end = ints[4];
 657
 658	/* Add new entry to the list */
 659	return netdev_boot_setup_add(str, &map);
 660}
 661
 662__setup("netdev=", netdev_boot_setup);
 663
 664/*******************************************************************************
 665
 666			    Device Interface Subroutines
 667
 668*******************************************************************************/
 669
 670/**
 671 *	dev_get_iflink	- get 'iflink' value of a interface
 672 *	@dev: targeted interface
 673 *
 674 *	Indicates the ifindex the interface is linked to.
 675 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 676 */
 677
 678int dev_get_iflink(const struct net_device *dev)
 679{
 680	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 681		return dev->netdev_ops->ndo_get_iflink(dev);
 682
 683	return dev->ifindex;
 684}
 685EXPORT_SYMBOL(dev_get_iflink);
 686
 687/**
 688 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 689 *	@dev: targeted interface
 690 *	@skb: The packet.
 691 *
 692 *	For better visibility of tunnel traffic OVS needs to retrieve
 693 *	egress tunnel information for a packet. Following API allows
 694 *	user to get this info.
 695 */
 696int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 697{
 698	struct ip_tunnel_info *info;
 699
 700	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 701		return -EINVAL;
 702
 703	info = skb_tunnel_info_unclone(skb);
 704	if (!info)
 705		return -ENOMEM;
 706	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 707		return -EINVAL;
 708
 709	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 710}
 711EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 712
 713/**
 714 *	__dev_get_by_name	- find a device by its name
 715 *	@net: the applicable net namespace
 716 *	@name: name to find
 717 *
 718 *	Find an interface by name. Must be called under RTNL semaphore
 719 *	or @dev_base_lock. If the name is found a pointer to the device
 720 *	is returned. If the name is not found then %NULL is returned. The
 721 *	reference counters are not incremented so the caller must be
 722 *	careful with locks.
 723 */
 724
 725struct net_device *__dev_get_by_name(struct net *net, const char *name)
 726{
 727	struct net_device *dev;
 728	struct hlist_head *head = dev_name_hash(net, name);
 729
 730	hlist_for_each_entry(dev, head, name_hlist)
 731		if (!strncmp(dev->name, name, IFNAMSIZ))
 732			return dev;
 733
 734	return NULL;
 735}
 736EXPORT_SYMBOL(__dev_get_by_name);
 737
 738/**
 739 *	dev_get_by_name_rcu	- find a device by its name
 740 *	@net: the applicable net namespace
 741 *	@name: name to find
 742 *
 743 *	Find an interface by name.
 744 *	If the name is found a pointer to the device is returned.
 745 * 	If the name is not found then %NULL is returned.
 746 *	The reference counters are not incremented so the caller must be
 747 *	careful with locks. The caller must hold RCU lock.
 748 */
 749
 750struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 751{
 752	struct net_device *dev;
 753	struct hlist_head *head = dev_name_hash(net, name);
 754
 755	hlist_for_each_entry_rcu(dev, head, name_hlist)
 756		if (!strncmp(dev->name, name, IFNAMSIZ))
 757			return dev;
 758
 759	return NULL;
 760}
 761EXPORT_SYMBOL(dev_get_by_name_rcu);
 762
 763/**
 764 *	dev_get_by_name		- find a device by its name
 765 *	@net: the applicable net namespace
 766 *	@name: name to find
 767 *
 768 *	Find an interface by name. This can be called from any
 769 *	context and does its own locking. The returned handle has
 770 *	the usage count incremented and the caller must use dev_put() to
 771 *	release it when it is no longer needed. %NULL is returned if no
 772 *	matching device is found.
 773 */
 774
 775struct net_device *dev_get_by_name(struct net *net, const char *name)
 776{
 777	struct net_device *dev;
 778
 779	rcu_read_lock();
 780	dev = dev_get_by_name_rcu(net, name);
 781	if (dev)
 782		dev_hold(dev);
 783	rcu_read_unlock();
 784	return dev;
 785}
 786EXPORT_SYMBOL(dev_get_by_name);
 787
 788/**
 789 *	__dev_get_by_index - find a device by its ifindex
 790 *	@net: the applicable net namespace
 791 *	@ifindex: index of device
 792 *
 793 *	Search for an interface by index. Returns %NULL if the device
 794 *	is not found or a pointer to the device. The device has not
 795 *	had its reference counter increased so the caller must be careful
 796 *	about locking. The caller must hold either the RTNL semaphore
 797 *	or @dev_base_lock.
 798 */
 799
 800struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 801{
 802	struct net_device *dev;
 803	struct hlist_head *head = dev_index_hash(net, ifindex);
 804
 805	hlist_for_each_entry(dev, head, index_hlist)
 806		if (dev->ifindex == ifindex)
 807			return dev;
 808
 809	return NULL;
 810}
 811EXPORT_SYMBOL(__dev_get_by_index);
 812
 813/**
 814 *	dev_get_by_index_rcu - find a device by its ifindex
 815 *	@net: the applicable net namespace
 816 *	@ifindex: index of device
 817 *
 818 *	Search for an interface by index. Returns %NULL if the device
 819 *	is not found or a pointer to the device. The device has not
 820 *	had its reference counter increased so the caller must be careful
 821 *	about locking. The caller must hold RCU lock.
 822 */
 823
 824struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 825{
 826	struct net_device *dev;
 827	struct hlist_head *head = dev_index_hash(net, ifindex);
 828
 829	hlist_for_each_entry_rcu(dev, head, index_hlist)
 830		if (dev->ifindex == ifindex)
 831			return dev;
 832
 833	return NULL;
 834}
 835EXPORT_SYMBOL(dev_get_by_index_rcu);
 836
 837
 838/**
 839 *	dev_get_by_index - find a device by its ifindex
 840 *	@net: the applicable net namespace
 841 *	@ifindex: index of device
 842 *
 843 *	Search for an interface by index. Returns NULL if the device
 844 *	is not found or a pointer to the device. The device returned has
 845 *	had a reference added and the pointer is safe until the user calls
 846 *	dev_put to indicate they have finished with it.
 847 */
 848
 849struct net_device *dev_get_by_index(struct net *net, int ifindex)
 850{
 851	struct net_device *dev;
 852
 853	rcu_read_lock();
 854	dev = dev_get_by_index_rcu(net, ifindex);
 855	if (dev)
 856		dev_hold(dev);
 857	rcu_read_unlock();
 858	return dev;
 859}
 860EXPORT_SYMBOL(dev_get_by_index);
 861
 862/**
 863 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 864 *	@net: network namespace
 865 *	@name: a pointer to the buffer where the name will be stored.
 866 *	@ifindex: the ifindex of the interface to get the name from.
 867 *
 868 *	The use of raw_seqcount_begin() and cond_resched() before
 869 *	retrying is required as we want to give the writers a chance
 870 *	to complete when CONFIG_PREEMPT is not set.
 871 */
 872int netdev_get_name(struct net *net, char *name, int ifindex)
 873{
 874	struct net_device *dev;
 875	unsigned int seq;
 876
 877retry:
 878	seq = raw_seqcount_begin(&devnet_rename_seq);
 879	rcu_read_lock();
 880	dev = dev_get_by_index_rcu(net, ifindex);
 881	if (!dev) {
 882		rcu_read_unlock();
 883		return -ENODEV;
 884	}
 885
 886	strcpy(name, dev->name);
 887	rcu_read_unlock();
 888	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 889		cond_resched();
 890		goto retry;
 891	}
 892
 893	return 0;
 894}
 895
 896/**
 897 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 898 *	@net: the applicable net namespace
 899 *	@type: media type of device
 900 *	@ha: hardware address
 901 *
 902 *	Search for an interface by MAC address. Returns NULL if the device
 903 *	is not found or a pointer to the device.
 904 *	The caller must hold RCU or RTNL.
 905 *	The returned device has not had its ref count increased
 906 *	and the caller must therefore be careful about locking
 907 *
 908 */
 909
 910struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 911				       const char *ha)
 912{
 913	struct net_device *dev;
 914
 915	for_each_netdev_rcu(net, dev)
 916		if (dev->type == type &&
 917		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 918			return dev;
 919
 920	return NULL;
 921}
 922EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 923
 924struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 925{
 926	struct net_device *dev;
 927
 928	ASSERT_RTNL();
 929	for_each_netdev(net, dev)
 930		if (dev->type == type)
 931			return dev;
 932
 933	return NULL;
 934}
 935EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 936
 937struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 938{
 939	struct net_device *dev, *ret = NULL;
 940
 941	rcu_read_lock();
 942	for_each_netdev_rcu(net, dev)
 943		if (dev->type == type) {
 944			dev_hold(dev);
 945			ret = dev;
 946			break;
 947		}
 948	rcu_read_unlock();
 949	return ret;
 950}
 951EXPORT_SYMBOL(dev_getfirstbyhwtype);
 952
 953/**
 954 *	__dev_get_by_flags - find any device with given flags
 955 *	@net: the applicable net namespace
 956 *	@if_flags: IFF_* values
 957 *	@mask: bitmask of bits in if_flags to check
 958 *
 959 *	Search for any interface with the given flags. Returns NULL if a device
 960 *	is not found or a pointer to the device. Must be called inside
 961 *	rtnl_lock(), and result refcount is unchanged.
 962 */
 963
 964struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 965				      unsigned short mask)
 966{
 967	struct net_device *dev, *ret;
 968
 969	ASSERT_RTNL();
 970
 971	ret = NULL;
 972	for_each_netdev(net, dev) {
 973		if (((dev->flags ^ if_flags) & mask) == 0) {
 974			ret = dev;
 975			break;
 976		}
 977	}
 978	return ret;
 979}
 980EXPORT_SYMBOL(__dev_get_by_flags);
 981
 982/**
 983 *	dev_valid_name - check if name is okay for network device
 984 *	@name: name string
 985 *
 986 *	Network device names need to be valid file names to
 987 *	to allow sysfs to work.  We also disallow any kind of
 988 *	whitespace.
 989 */
 990bool dev_valid_name(const char *name)
 991{
 992	if (*name == '\0')
 993		return false;
 994	if (strlen(name) >= IFNAMSIZ)
 995		return false;
 996	if (!strcmp(name, ".") || !strcmp(name, ".."))
 997		return false;
 998
 999	while (*name) {
1000		if (*name == '/' || *name == ':' || isspace(*name))
1001			return false;
1002		name++;
1003	}
1004	return true;
1005}
1006EXPORT_SYMBOL(dev_valid_name);
1007
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	If @name contains no '%' it is used as-is and unit 0 is reported.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code:
 *	-EINVAL for a malformed format, -ENOMEM if the bitmap page cannot
 *	be allocated, -ENFILE if the resulting name is already taken.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	/* Only look for a '%' within the part that can fit in a name. */
	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be exactly one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		/* Mark every unit number already used by a matching device. */
		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		/* Lowest free unit; equals max_netdevices when all are used. */
		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	/* Render the final name unless the caller formats in place. */
	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
1073
1074/**
1075 *	dev_alloc_name - allocate a name for a device
1076 *	@dev: device
1077 *	@name: name format string
1078 *
1079 *	Passed a format string - eg "lt%d" it will try and find a suitable
1080 *	id. It scans list of devices to build up a free map, then chooses
1081 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1082 *	while allocating the name and adding the device in order to avoid
1083 *	duplicates.
1084 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1085 *	Returns the number of the unit assigned or a negative errno code.
1086 */
1087
1088int dev_alloc_name(struct net_device *dev, const char *name)
1089{
1090	char buf[IFNAMSIZ];
1091	struct net *net;
1092	int ret;
1093
1094	BUG_ON(!dev_net(dev));
1095	net = dev_net(dev);
1096	ret = __dev_alloc_name(net, name, buf);
1097	if (ret >= 0)
1098		strlcpy(dev->name, buf, IFNAMSIZ);
1099	return ret;
1100}
1101EXPORT_SYMBOL(dev_alloc_name);
1102
1103static int dev_alloc_name_ns(struct net *net,
1104			     struct net_device *dev,
1105			     const char *name)
1106{
1107	char buf[IFNAMSIZ];
1108	int ret;
1109
1110	ret = __dev_alloc_name(net, name, buf);
1111	if (ret >= 0)
1112		strlcpy(dev->name, buf, IFNAMSIZ);
1113	return ret;
1114}
1115
1116static int dev_get_valid_name(struct net *net,
1117			      struct net_device *dev,
1118			      const char *name)
1119{
1120	BUG_ON(!net);
1121
1122	if (!dev_valid_name(name))
1123		return -EINVAL;
1124
1125	if (strchr(name, '%'))
1126		return dev_alloc_name_ns(net, dev, name);
1127	else if (__dev_get_by_name(net, name))
1128		return -EEXIST;
1129	else if (dev->name != name)
1130		strlcpy(dev->name, name, IFNAMSIZ);
1131
1132	return 0;
1133}
1134
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device; a format string such as "eth%d" may be
 *	passed for wildcarding.  RTNL must be held.
 *
 *	Returns 0 on success or when the name is unchanged, -EBUSY if the
 *	device is up, or a negative errno from name validation, from
 *	device_rename(), or from a NETDEV_CHANGENAME notifier.  When a
 *	notifier rejects the change, the old name is restored and the
 *	notifiers are run once more.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	/* Renaming is refused while the interface is up. */
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	/* Nothing to do if the name does not change. */
	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	/* Keep the current name so the change can be rolled back. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

	/*
	 * Entered once for the forward rename and, if a notifier vetoes it,
	 * a second time with dev->name/oldname swapped to undo it.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	/* Unhash under the old name ... */
	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	/* ... and rehash under the name now stored in dev->name. */
	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* First failure: remember it and undo the rename. */
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			/* The rollback itself was vetoed; only report it. */
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
1223
1224/**
1225 *	dev_set_alias - change ifalias of a device
1226 *	@dev: device
1227 *	@alias: name up to IFALIASZ
1228 *	@len: limit of bytes to copy from info
1229 *
1230 *	Set ifalias for a device,
1231 */
1232int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1233{
1234	char *new_ifalias;
1235
1236	ASSERT_RTNL();
1237
1238	if (len >= IFALIASZ)
1239		return -EINVAL;
1240
1241	if (!len) {
1242		kfree(dev->ifalias);
1243		dev->ifalias = NULL;
1244		return 0;
1245	}
1246
1247	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1248	if (!new_ifalias)
1249		return -ENOMEM;
1250	dev->ifalias = new_ifalias;
1251
1252	strlcpy(dev->ifalias, alias, len+1);
1253	return len;
1254}
1255
1256
1257/**
1258 *	netdev_features_change - device changes features
1259 *	@dev: device to cause notification
1260 *
1261 *	Called to indicate a device has changed features.
1262 */
1263void netdev_features_change(struct net_device *dev)
1264{
1265	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1266}
1267EXPORT_SYMBOL(netdev_features_change);
1268
1269/**
1270 *	netdev_state_change - device changes state
1271 *	@dev: device to cause notification
1272 *
1273 *	Called to indicate a device has changed state. This function calls
1274 *	the notifier chains for netdev_chain and sends a NEWLINK message
1275 *	to the routing socket.
1276 */
1277void netdev_state_change(struct net_device *dev)
1278{
1279	if (dev->flags & IFF_UP) {
1280		struct netdev_notifier_change_info change_info;
1281
1282		change_info.flags_changed = 0;
1283		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1284					      &change_info.info);
1285		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1286	}
1287}
1288EXPORT_SYMBOL(netdev_state_change);
1289
1290/**
1291 * 	netdev_notify_peers - notify network peers about existence of @dev
1292 * 	@dev: network device
1293 *
1294 * Generate traffic such that interested network peers are aware of
1295 * @dev, such as by generating a gratuitous ARP. This may be used when
1296 * a device wants to inform the rest of the network about some sort of
1297 * reconfiguration such as a failover event or virtual machine
1298 * migration.
1299 */
1300void netdev_notify_peers(struct net_device *dev)
1301{
1302	rtnl_lock();
1303	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1304	rtnl_unlock();
1305}
1306EXPORT_SYMBOL(netdev_notify_peers);
1307
1308static int __dev_open(struct net_device *dev)
1309{
1310	const struct net_device_ops *ops = dev->netdev_ops;
1311	int ret;
1312
1313	ASSERT_RTNL();
1314
1315	if (!netif_device_present(dev))
1316		return -ENODEV;
1317
1318	/* Block netpoll from trying to do any rx path servicing.
1319	 * If we don't do this there is a chance ndo_poll_controller
1320	 * or ndo_poll may be running while we open the device
1321	 */
1322	netpoll_poll_disable(dev);
1323
1324	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1325	ret = notifier_to_errno(ret);
1326	if (ret)
1327		return ret;
1328
1329	set_bit(__LINK_STATE_START, &dev->state);
1330
1331	if (ops->ndo_validate_addr)
1332		ret = ops->ndo_validate_addr(dev);
1333
1334	if (!ret && ops->ndo_open)
1335		ret = ops->ndo_open(dev);
1336
1337	netpoll_poll_enable(dev);
1338
1339	if (ret)
1340		clear_bit(__LINK_STATE_START, &dev->state);
1341	else {
1342		dev->flags |= IFF_UP;
1343		dev_set_rx_mode(dev);
1344		dev_activate(dev);
1345		add_device_randomness(dev->dev_addr, dev->addr_len);
1346	}
1347
1348	return ret;
1349}
1350
1351/**
1352 *	dev_open	- prepare an interface for use.
1353 *	@dev:	device to open
1354 *
1355 *	Takes a device from down to up state. The device's private open
1356 *	function is invoked and then the multicast lists are loaded. Finally
1357 *	the device is moved into the up state and a %NETDEV_UP message is
1358 *	sent to the netdev notifier chain.
1359 *
1360 *	Calling this function on an active interface is a nop. On a failure
1361 *	a negative errno code is returned.
1362 */
1363int dev_open(struct net_device *dev)
1364{
1365	int ret;
1366
1367	if (dev->flags & IFF_UP)
1368		return 0;
1369
1370	ret = __dev_open(dev);
1371	if (ret < 0)
1372		return ret;
1373
1374	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1375	call_netdevice_notifiers(NETDEV_UP, dev);
1376
1377	return ret;
1378}
1379EXPORT_SYMBOL(dev_open);
1380
/*
 * Take every device linked on @head (via close_list) down, without
 * sending the final NETDEV_DOWN / RTM_NEWLINK notifications.  RTNL must
 * be held and the caller must be able to sleep.  Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	/* Pass 1: announce the shutdown and clear the "running" state bit. */
	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of it's
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	/* Deactivate all listed devices in one batch. */
	dev_deactivate_many(head);

	/* Pass 2: driver stop hook, drop IFF_UP, re-enable netpoll. */
	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}
1426
1427static int __dev_close(struct net_device *dev)
1428{
1429	int retval;
1430	LIST_HEAD(single);
1431
1432	list_add(&dev->close_list, &single);
1433	retval = __dev_close_many(&single);
1434	list_del(&single);
1435
1436	return retval;
1437}
1438
1439int dev_close_many(struct list_head *head, bool unlink)
1440{
1441	struct net_device *dev, *tmp;
1442
1443	/* Remove the devices that don't need to be closed */
1444	list_for_each_entry_safe(dev, tmp, head, close_list)
1445		if (!(dev->flags & IFF_UP))
1446			list_del_init(&dev->close_list);
1447
1448	__dev_close_many(head);
1449
1450	list_for_each_entry_safe(dev, tmp, head, close_list) {
1451		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1452		call_netdevice_notifiers(NETDEV_DOWN, dev);
1453		if (unlink)
1454			list_del_init(&dev->close_list);
1455	}
1456
1457	return 0;
1458}
1459EXPORT_SYMBOL(dev_close_many);
1460
1461/**
1462 *	dev_close - shutdown an interface.
1463 *	@dev: device to shutdown
1464 *
1465 *	This function moves an active device into down state. A
1466 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1467 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1468 *	chain.
1469 */
1470int dev_close(struct net_device *dev)
1471{
1472	if (dev->flags & IFF_UP) {
1473		LIST_HEAD(single);
1474
1475		list_add(&dev->close_list, &single);
1476		dev_close_many(&single, true);
1477		list_del(&single);
1478	}
1479	return 0;
1480}
1481EXPORT_SYMBOL(dev_close);
1482
1483
1484/**
1485 *	dev_disable_lro - disable Large Receive Offload on a device
1486 *	@dev: device
1487 *
1488 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1489 *	called under RTNL.  This is needed if received packets may be
1490 *	forwarded to another interface.
1491 */
1492void dev_disable_lro(struct net_device *dev)
1493{
1494	struct net_device *lower_dev;
1495	struct list_head *iter;
1496
1497	dev->wanted_features &= ~NETIF_F_LRO;
1498	netdev_update_features(dev);
1499
1500	if (unlikely(dev->features & NETIF_F_LRO))
1501		netdev_WARN(dev, "failed to disable LRO!\n");
1502
1503	netdev_for_each_lower_dev(dev, lower_dev, iter)
1504		dev_disable_lro(lower_dev);
1505}
1506EXPORT_SYMBOL(dev_disable_lro);
1507
1508static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1509				   struct net_device *dev)
1510{
1511	struct netdev_notifier_info info;
1512
1513	netdev_notifier_info_init(&info, dev);
1514	return nb->notifier_call(nb, val, &info);
1515}
1516
/*
 * Non-zero initially.  While it is set, register_netdevice_notifier()
 * does not replay REGISTER/UP events to a newly registered notifier.
 */
static int dev_boot_phase = 1;
1518
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 * 	When registered all registration and up events are replayed
 *	to the new notifier to allow it to have a race free
 *	view of the network device list.
 *
 *	If the notifier rejects a replayed %NETDEV_REGISTER event, the
 *	events already delivered are undone and the notifier is removed
 *	from the chain again.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	/* No replay while dev_boot_phase is still set. */
	if (dev_boot_phase)
		goto unlock;
	/* Replay REGISTER (and UP, for running devices) for every device. */
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	/*
	 * Walk the devices again in the same order and send the inverse
	 * events for each one seen before the device that failed.
	 */
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1585
1586/**
1587 *	unregister_netdevice_notifier - unregister a network notifier block
1588 *	@nb: notifier
1589 *
1590 *	Unregister a notifier previously registered by
1591 *	register_netdevice_notifier(). The notifier is unlinked into the
1592 *	kernel structures and may then be reused. A negative errno code
1593 *	is returned on a failure.
1594 *
1595 * 	After unregistering unregister and down device events are synthesized
1596 *	for all devices on the device list to the removed notifier to remove
1597 *	the need for special case cleanup code.
1598 */
1599
1600int unregister_netdevice_notifier(struct notifier_block *nb)
1601{
1602	struct net_device *dev;
1603	struct net *net;
1604	int err;
1605
1606	rtnl_lock();
1607	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1608	if (err)
1609		goto unlock;
1610
1611	for_each_net(net) {
1612		for_each_netdev(net, dev) {
1613			if (dev->flags & IFF_UP) {
1614				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1615							dev);
1616				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1617			}
1618			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1619		}
1620	}
1621unlock:
1622	rtnl_unlock();
1623	return err;
1624}
1625EXPORT_SYMBOL(unregister_netdevice_notifier);
1626
1627/**
1628 *	call_netdevice_notifiers_info - call all network notifier blocks
1629 *	@val: value passed unmodified to notifier function
1630 *	@dev: net_device pointer passed unmodified to notifier function
1631 *	@info: notifier information data
1632 *
1633 *	Call all network notifier blocks.  Parameters and return value
1634 *	are as for raw_notifier_call_chain().
1635 */
1636
1637static int call_netdevice_notifiers_info(unsigned long val,
1638					 struct net_device *dev,
1639					 struct netdev_notifier_info *info)
1640{
1641	ASSERT_RTNL();
1642	netdev_notifier_info_init(info, dev);
1643	return raw_notifier_call_chain(&netdev_chain, val, info);
1644}
1645
1646/**
1647 *	call_netdevice_notifiers - call all network notifier blocks
1648 *      @val: value passed unmodified to notifier function
1649 *      @dev: net_device pointer passed unmodified to notifier function
1650 *
1651 *	Call all network notifier blocks.  Parameters and return value
1652 *	are as for raw_notifier_call_chain().
1653 */
1654
1655int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1656{
1657	struct netdev_notifier_info info;
1658
1659	return call_netdevice_notifiers_info(val, dev, &info);
1660}
1661EXPORT_SYMBOL(call_netdevice_notifiers);
1662
#ifdef CONFIG_NET_INGRESS
/* Static key adjusted by the two helpers below. */
static struct static_key ingress_needed __read_mostly;

/* Take one reference on the ingress_needed static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on the ingress_needed static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1678
#ifdef CONFIG_NET_EGRESS
/* Static key adjusted by the two helpers below. */
static struct static_key egress_needed __read_mostly;

/* Take one reference on the egress_needed static key. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on the egress_needed static key. */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1694
/* Static key that gates packet timestamping in the helpers below. */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 *
 * This counter holds the number of decrements still owed; they are
 * settled by the next net_enable_timestamp() call.
 */
static atomic_t netstamp_needed_deferred;
#endif
1703
/*
 * Take one reference on the netstamp_needed static key.
 *
 * With jump labels, decrements deferred by net_disable_timestamp() are
 * settled here first: the pending count is atomically claimed, this
 * enable cancels one of them, and the remaining (count - 1) decrements
 * are applied.  In that case no increment is performed.
 */
void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		/* Pre-decrement: one pending dec is offset by this enable. */
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
1718
/*
 * Drop one reference on the netstamp_needed static key.
 *
 * With jump labels, a call made in interrupt context only records the
 * decrement in netstamp_needed_deferred; it is carried out later by
 * net_enable_timestamp().
 */
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1730
1731static inline void net_timestamp_set(struct sk_buff *skb)
1732{
1733	skb->tstamp.tv64 = 0;
1734	if (static_key_false(&netstamp_needed))
1735		__net_timestamp(skb);
1736}
1737
/*
 * Timestamp SKB if timestamping is enabled, COND holds and SKB carries no
 * timestamp yet.  Wrapped in do { } while (0) so that the macro behaves as
 * a single statement, e.g. as the body of an if/else.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1743
1744bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1745{
1746	unsigned int len;
1747
1748	if (!(dev->flags & IFF_UP))
1749		return false;
1750
1751	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1752	if (skb->len <= len)
1753		return true;
1754
1755	/* if TSO is enabled, we don't care about the length as the packet
1756	 * could be forwarded without being segmented before
1757	 */
1758	if (skb_is_gso(skb))
1759		return true;
1760
1761	return false;
1762}
1763EXPORT_SYMBOL_GPL(is_skb_forwardable);
1764
1765int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1766{
1767	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1768	    unlikely(!is_skb_forwardable(dev, skb))) {
1769		atomic_long_inc(&dev->rx_dropped);
1770		kfree_skb(skb);
1771		return NET_RX_DROP;
1772	}
1773
1774	skb_scrub_packet(skb, true);
1775	skb->priority = 0;
1776	skb->protocol = eth_type_trans(skb, dev);
1777	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1778
1779	return 0;
1780}
1781EXPORT_SYMBOL_GPL(__dev_forward_skb);
1782
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int rc;

	rc = __dev_forward_skb(dev, skb);
	if (rc)
		return rc;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1806
1807static inline int deliver_skb(struct sk_buff *skb,
1808			      struct packet_type *pt_prev,
1809			      struct net_device *orig_dev)
1810{
1811	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1812		return -ENOMEM;
1813	atomic_inc(&skb->users);
1814	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1815}
1816
1817static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1818					  struct packet_type **pt,
1819					  struct net_device *orig_dev,
1820					  __be16 type,
1821					  struct list_head *ptype_list)
1822{
1823	struct packet_type *ptype, *pt_prev = *pt;
1824
1825	list_for_each_entry_rcu(ptype, ptype_list, list) {
1826		if (ptype->type != type)
1827			continue;
1828		if (pt_prev)
1829			deliver_skb(skb, pt_prev, orig_dev);
1830		pt_prev = ptype;
1831	}
1832	*pt = pt_prev;
1833}
1834
1835static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1836{
1837	if (!ptype->af_packet_priv || !skb->sk)
1838		return false;
1839
1840	if (ptype->id_match)
1841		return ptype->id_match(ptype, skb->sk);
1842	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1843		return true;
1844
1845	return false;
1846}
1847
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 *
 *	The global ptype_all list is walked first, then dev->ptype_all.
 *	The skb is cloned once, on the first interested tap; delivery is
 *	kept one tap behind so that the last tap receives the clone
 *	without an extra reference being taken.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		/* Clone already exists: deliver to the previous tap. */
		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	/* Second pass: the taps bound to this particular device. */
	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	/* The last tap gets the clone directly, without an extra reference. */
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
1910
1911/**
1912 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1913 * @dev: Network device
1914 * @txq: number of queues available
1915 *
1916 * If real_num_tx_queues is changed the tc mappings may no longer be
1917 * valid. To resolve this verify the tc mapping remains valid and if
1918 * not NULL the mapping. With no priorities mapping to this
1919 * offset/count pair it will no longer be used. In the worst case TC0
1920 * is invalid nothing can be done so disable priority mappings. If is
1921 * expected that drivers will fix this mapping if they can before
1922 * calling netif_set_real_num_tx_queues.
1923 */
1924static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1925{
1926	int i;
1927	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1928
1929	/* If TC0 is invalidated disable TC mapping */
1930	if (tc->offset + tc->count > txq) {
1931		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1932		dev->num_tc = 0;
1933		return;
1934	}
1935
1936	/* Invalidated prio to tc mappings set to TC0 */
1937	for (i = 1; i < TC_BITMASK + 1; i++) {
1938		int q = netdev_get_prio_tc_map(dev, i);
1939
1940		tc = &dev->tc_to_txq[q];
1941		if (tc->offset + tc->count > txq) {
1942			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1943				i, q);
1944			netdev_set_prio_tc_map(dev, i, 0);
1945		}
1946	}
1947}
1948
1949#ifdef CONFIG_XPS
/* Held by the XPS map update functions below while they modify the maps. */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an RCU-protected XPS pointer; xps_map_mutex must be held. */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1953
1954static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1955					int cpu, u16 index)
1956{
1957	struct xps_map *map = NULL;
1958	int pos;
1959
1960	if (dev_maps)
1961		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1962
1963	for (pos = 0; map && pos < map->len; pos++) {
1964		if (map->queues[pos] == index) {
1965			if (map->len > 1) {
1966				map->queues[pos] = map->queues[--map->len];
1967			} else {
1968				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1969				kfree_rcu(map, rcu);
1970				map = NULL;
1971			}
1972			break;
1973		}
1974	}
1975
1976	return map;
1977}
1978
/*
 * Drop TX queues @index .. num_tx_queues - 1 from the XPS maps of every
 * possible CPU and reset the NUMA node of those queues.  If no CPU is
 * left with a map afterwards, dev->xps_maps itself is released.
 */
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	struct xps_dev_maps *dev_maps;
	int cpu, i;
	bool active = false;

	mutex_lock(&xps_map_mutex);
	dev_maps = xmap_dereference(dev->xps_maps);

	if (!dev_maps)
		goto out_no_maps;

	for_each_possible_cpu(cpu) {
		/* Stop early once this CPU has no map (left). */
		for (i = index; i < dev->num_tx_queues; i++) {
			if (!remove_xps_queue(dev_maps, cpu, i))
				break;
		}
		/* Loop ran to completion: this CPU still owns a map. */
		if (i == dev->num_tx_queues)
			active = true;
	}

	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

	for (i = index; i < dev->num_tx_queues; i++)
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
					     NUMA_NO_NODE);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
}
2012
2013static struct xps_map *expand_xps_map(struct xps_map *map,
2014				      int cpu, u16 index)
2015{
2016	struct xps_map *new_map;
2017	int alloc_len = XPS_MIN_MAP_ALLOC;
2018	int i, pos;
2019
2020	for (pos = 0; map && pos < map->len; pos++) {
2021		if (map->queues[pos] != index)
2022			continue;
2023		return map;
2024	}
2025
2026	/* Need to add queue to this CPU's existing map */
2027	if (map) {
2028		if (pos < map->alloc_len)
2029			return map;
2030
2031		alloc_len = map->alloc_len * 2;
2032	}
2033
2034	/* Need to allocate new map to store queue on this CPU's map */
2035	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2036			       cpu_to_node(cpu));
2037	if (!new_map)
2038		return NULL;
2039
2040	for (i = 0; i < pos; i++)
2041		new_map->queues[i] = map->queues[i];
2042	new_map->alloc_len = alloc_len;
2043	new_map->len = pos;
2044
2045	return new_map;
2046}
2047
/*
 * Set the CPUs that transmit on TX queue @index of @dev to the online
 * CPUs in @mask.
 *
 * A new xps_dev_maps is built in which the queue is added to the map of
 * every selected CPU; maps of other CPUs are carried over from the old
 * device map.  The new device map is then published and the queue is
 * removed from the maps of CPUs that are not selected.  The queue's NUMA
 * node is set to the node shared by all selected CPUs, or NUMA_NO_NODE.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails, in which case
 * the maps allocated by this call are freed again.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	struct xps_map *map, *new_map;
	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	/* -2: no selected CPU seen yet; -1: selected CPUs span several nodes */
	int cpu, numa_node_id = -2;
	bool active = false;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, mask))
			continue;

		/* The device map is allocated lazily, on the first selected CPU. */
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;

		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
	}

	/* No online CPU selected: only clean up below. */
	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
		}

	}

	/* Publish the new device map. */
	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (dev_maps) {
		for_each_possible_cpu(cpu) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			/* Free only per-CPU maps that were replaced. */
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}

		kfree_rcu(dev_maps, rcu);
	}

	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
			continue;

		if (remove_xps_queue(dev_maps, cpu, index))
			active = true;
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;
		/* Maps shared with the old device map are still in use. */
		if (new_map && new_map != map)
			kfree(new_map);
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2172
2173#endif
2174/*
2175 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2176 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2177 */
2178int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2179{
2180	int rc;
2181
2182	if (txq < 1 || txq > dev->num_tx_queues)
2183		return -EINVAL;
2184
2185	if (dev->reg_state == NETREG_REGISTERED ||
2186	    dev->reg_state == NETREG_UNREGISTERING) {
2187		ASSERT_RTNL();
2188
2189		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2190						  txq);
2191		if (rc)
2192			return rc;
2193
2194		if (dev->num_tc)
2195			netif_setup_tc(dev, txq);
2196
2197		if (txq < dev->real_num_tx_queues) {
2198			qdisc_reset_all_tx_gt(dev, txq);
2199#ifdef CONFIG_XPS
2200			netif_reset_xps_queues_gt(dev, txq);
2201#endif
2202		}
2203	}
2204
2205	dev->real_num_tx_queues = txq;
2206	return 0;
2207}
2208EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2209
2210#ifdef CONFIG_SYSFS
2211/**
2212 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2213 *	@dev: Network device
2214 *	@rxq: Actual number of RX queues
2215 *
2216 *	This must be called either with the rtnl_lock held or before
2217 *	registration of the net device.  Returns 0 on success, or a
2218 *	negative error code.  If called before registration, it always
2219 *	succeeds.
2220 */
2221int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2222{
2223	int rc;
2224
2225	if (rxq < 1 || rxq > dev->num_rx_queues)
2226		return -EINVAL;
2227
2228	if (dev->reg_state == NETREG_REGISTERED) {
2229		ASSERT_RTNL();
2230
2231		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2232						  rxq);
2233		if (rc)
2234			return rc;
2235	}
2236
2237	dev->real_num_rx_queues = rxq;
2238	return 0;
2239}
2240EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2241#endif
2242
2243/**
2244 * netif_get_num_default_rss_queues - default number of RSS queues
2245 *
2246 * This routine should set an upper limit on the number of RSS queues
2247 * used by default by multiqueue devices.
2248 */
2249int netif_get_num_default_rss_queues(void)
2250{
2251	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2252}
2253EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2254
2255static inline void __netif_reschedule(struct Qdisc *q)
2256{
2257	struct softnet_data *sd;
2258	unsigned long flags;
2259
2260	local_irq_save(flags);
2261	sd = this_cpu_ptr(&softnet_data);
2262	q->next_sched = NULL;
2263	*sd->output_queue_tailp = q;
2264	sd->output_queue_tailp = &q->next_sched;
2265	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2266	local_irq_restore(flags);
2267}
2268
2269void __netif_schedule(struct Qdisc *q)
2270{
2271	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2272		__netif_reschedule(q);
2273}
2274EXPORT_SYMBOL(__netif_schedule);
2275
2276struct dev_kfree_skb_cb {
2277	enum skb_free_reason reason;
2278};
2279
2280static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2281{
2282	return (struct dev_kfree_skb_cb *)skb->cb;
2283}
2284
2285void netif_schedule_queue(struct netdev_queue *txq)
2286{
2287	rcu_read_lock();
2288	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2289		struct Qdisc *q = rcu_dereference(txq->qdisc);
2290
2291		__netif_schedule(q);
2292	}
2293	rcu_read_unlock();
2294}
2295EXPORT_SYMBOL(netif_schedule_queue);
2296
2297/**
2298 *	netif_wake_subqueue - allow sending packets on subqueue
2299 *	@dev: network device
2300 *	@queue_index: sub queue index
2301 *
2302 * Resume individual transmit queue of a device with multiple transmit queues.
2303 */
2304void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2305{
2306	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2307
2308	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2309		struct Qdisc *q;
2310
2311		rcu_read_lock();
2312		q = rcu_dereference(txq->qdisc);
2313		__netif_schedule(q);
2314		rcu_read_unlock();
2315	}
2316}
2317EXPORT_SYMBOL(netif_wake_subqueue);
2318
2319void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2320{
2321	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2322		struct Qdisc *q;
2323
2324		rcu_read_lock();
2325		q = rcu_dereference(dev_queue->qdisc);
2326		__netif_schedule(q);
2327		rcu_read_unlock();
2328	}
2329}
2330EXPORT_SYMBOL(netif_tx_wake_queue);
2331
2332void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2333{
2334	unsigned long flags;
2335
2336	if (likely(atomic_read(&skb->users) == 1)) {
2337		smp_rmb();
2338		atomic_set(&skb->users, 0);
2339	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2340		return;
2341	}
2342	get_kfree_skb_cb(skb)->reason = reason;
2343	local_irq_save(flags);
2344	skb->next = __this_cpu_read(softnet_data.completion_queue);
2345	__this_cpu_write(softnet_data.completion_queue, skb);
2346	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2347	local_irq_restore(flags);
2348}
2349EXPORT_SYMBOL(__dev_kfree_skb_irq);
2350
2351void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2352{
2353	if (in_irq() || irqs_disabled())
2354		__dev_kfree_skb_irq(skb, reason);
2355	else
2356		dev_kfree_skb(skb);
2357}
2358EXPORT_SYMBOL(__dev_kfree_skb_any);
2359
2360
2361/**
2362 * netif_device_detach - mark device as removed
2363 * @dev: network device
2364 *
2365 * Mark device as removed from system and therefore no longer available.
2366 */
2367void netif_device_detach(struct net_device *dev)
2368{
2369	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2370	    netif_running(dev)) {
2371		netif_tx_stop_all_queues(dev);
2372	}
2373}
2374EXPORT_SYMBOL(netif_device_detach);
2375
2376/**
2377 * netif_device_attach - mark device as attached
2378 * @dev: network device
2379 *
2380 * Mark device as attached from system and restart if needed.
2381 */
2382void netif_device_attach(struct net_device *dev)
2383{
2384	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2385	    netif_running(dev)) {
2386		netif_tx_wake_all_queues(dev);
2387		__netdev_watchdog_up(dev);
2388	}
2389}
2390EXPORT_SYMBOL(netif_device_attach);
2391
2392/*
2393 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2394 * to be used as a distribution range.
2395 */
2396u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2397		  unsigned int num_tx_queues)
2398{
2399	u32 hash;
2400	u16 qoffset = 0;
2401	u16 qcount = num_tx_queues;
2402
2403	if (skb_rx_queue_recorded(skb)) {
2404		hash = skb_get_rx_queue(skb);
2405		while (unlikely(hash >= num_tx_queues))
2406			hash -= num_tx_queues;
2407		return hash;
2408	}
2409
2410	if (dev->num_tc) {
2411		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2412		qoffset = dev->tc_to_txq[tc].offset;
2413		qcount = dev->tc_to_txq[tc].count;
2414	}
2415
2416	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2417}
2418EXPORT_SYMBOL(__skb_tx_hash);
2419
2420static void skb_warn_bad_offload(const struct sk_buff *skb)
2421{
2422	static const netdev_features_t null_features = 0;
2423	struct net_device *dev = skb->dev;
2424	const char *name = "";
2425
2426	if (!net_ratelimit())
2427		return;
2428
2429	if (dev) {
2430		if (dev->dev.parent)
2431			name = dev_driver_string(dev->dev.parent);
2432		else
2433			name = netdev_name(dev);
2434	}
2435	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2436	     "gso_type=%d ip_summed=%d\n",
2437	     name, dev ? &dev->features : &null_features,
2438	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2439	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2440	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2441}
2442
2443/*
2444 * Invalidate hardware checksum when packet is to be mangled, and
2445 * complete checksum manually on outgoing path.
2446 */
2447int skb_checksum_help(struct sk_buff *skb)
2448{
2449	__wsum csum;
2450	int ret = 0, offset;
2451
2452	if (skb->ip_summed == CHECKSUM_COMPLETE)
2453		goto out_set_summed;
2454
2455	if (unlikely(skb_shinfo(skb)->gso_size)) {
2456		skb_warn_bad_offload(skb);
2457		return -EINVAL;
2458	}
2459
2460	/* Before computing a checksum, we should make sure no frag could
2461	 * be modified by an external entity : checksum could be wrong.
2462	 */
2463	if (skb_has_shared_frag(skb)) {
2464		ret = __skb_linearize(skb);
2465		if (ret)
2466			goto out;
2467	}
2468
2469	offset = skb_checksum_start_offset(skb);
2470	BUG_ON(offset >= skb_headlen(skb));
2471	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2472
2473	offset += skb->csum_offset;
2474	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2475
2476	if (skb_cloned(skb) &&
2477	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2478		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2479		if (ret)
2480			goto out;
2481	}
2482
2483	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2484out_set_summed:
2485	skb->ip_summed = CHECKSUM_NONE;
2486out:
2487	return ret;
2488}
2489EXPORT_SYMBOL(skb_checksum_help);
2490
2491/* skb_csum_offload_check - Driver helper function to determine if a device
2492 * with limited checksum offload capabilities is able to offload the checksum
2493 * for a given packet.
2494 *
2495 * Arguments:
2496 *   skb - sk_buff for the packet in question
2497 *   spec - contains the description of what device can offload
2498 *   csum_encapped - returns true if the checksum being offloaded is
2499 *	      encpasulated. That is it is checksum for the transport header
2500 *	      in the inner headers.
2501 *   checksum_help - when set indicates that helper function should
2502 *	      call skb_checksum_help if offload checks fail
2503 *
2504 * Returns:
2505 *   true: Packet has passed the checksum checks and should be offloadable to
2506 *	   the device (a driver may still need to check for additional
2507 *	   restrictions of its device)
2508 *   false: Checksum is not offloadable. If checksum_help was set then
2509 *	   skb_checksum_help was called to resolve checksum for non-GSO
2510 *	   packets and when IP protocol is not SCTP
2511 */
2512bool __skb_csum_offload_chk(struct sk_buff *skb,
2513			    const struct skb_csum_offl_spec *spec,
2514			    bool *csum_encapped,
2515			    bool csum_help)
2516{
2517	struct iphdr *iph;
2518	struct ipv6hdr *ipv6;
2519	void *nhdr;
2520	int protocol;
2521	u8 ip_proto;
2522
2523	if (skb->protocol == htons(ETH_P_8021Q) ||
2524	    skb->protocol == htons(ETH_P_8021AD)) {
2525		if (!spec->vlan_okay)
2526			goto need_help;
2527	}
2528
2529	/* We check whether the checksum refers to a transport layer checksum in
2530	 * the outermost header or an encapsulated transport layer checksum that
2531	 * corresponds to the inner headers of the skb. If the checksum is for
2532	 * something else in the packet we need help.
2533	 */
2534	if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2535		/* Non-encapsulated checksum */
2536		protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2537		nhdr = skb_network_header(skb);
2538		*csum_encapped = false;
2539		if (spec->no_not_encapped)
2540			goto need_help;
2541	} else if (skb->encapsulation && spec->encap_okay &&
2542		   skb_checksum_start_offset(skb) ==
2543		   skb_inner_transport_offset(skb)) {
2544		/* Encapsulated checksum */
2545		*csum_encapped = true;
2546		switch (skb->inner_protocol_type) {
2547		case ENCAP_TYPE_ETHER:
2548			protocol = eproto_to_ipproto(skb->inner_protocol);
2549			break;
2550		case ENCAP_TYPE_IPPROTO:
2551			protocol = skb->inner_protocol;
2552			break;
2553		}
2554		nhdr = skb_inner_network_header(skb);
2555	} else {
2556		goto need_help;
2557	}
2558
2559	switch (protocol) {
2560	case IPPROTO_IP:
2561		if (!spec->ipv4_okay)
2562			goto need_help;
2563		iph = nhdr;
2564		ip_proto = iph->protocol;
2565		if (iph->ihl != 5 && !spec->ip_options_okay)
2566			goto need_help;
2567		break;
2568	case IPPROTO_IPV6:
2569		if (!spec->ipv6_okay)
2570			goto need_help;
2571		if (spec->no_encapped_ipv6 && *csum_encapped)
2572			goto need_help;
2573		ipv6 = nhdr;
2574		nhdr += sizeof(*ipv6);
2575		ip_proto = ipv6->nexthdr;
2576		break;
2577	default:
2578		goto need_help;
2579	}
2580
2581ip_proto_again:
2582	switch (ip_proto) {
2583	case IPPROTO_TCP:
2584		if (!spec->tcp_okay ||
2585		    skb->csum_offset != offsetof(struct tcphdr, check))
2586			goto need_help;
2587		break;
2588	case IPPROTO_UDP:
2589		if (!spec->udp_okay ||
2590		    skb->csum_offset != offsetof(struct udphdr, check))
2591			goto need_help;
2592		break;
2593	case IPPROTO_SCTP:
2594		if (!spec->sctp_okay ||
2595		    skb->csum_offset != offsetof(struct sctphdr, checksum))
2596			goto cant_help;
2597		break;
2598	case NEXTHDR_HOP:
2599	case NEXTHDR_ROUTING:
2600	case NEXTHDR_DEST: {
2601		u8 *opthdr = nhdr;
2602
2603		if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2604			goto need_help;
2605
2606		ip_proto = opthdr[0];
2607		nhdr += (opthdr[1] + 1) << 3;
2608
2609		goto ip_proto_again;
2610	}
2611	default:
2612		goto need_help;
2613	}
2614
2615	/* Passed the tests for offloading checksum */
2616	return true;
2617
2618need_help:
2619	if (csum_help && !skb_shinfo(skb)->gso_size)
2620		skb_checksum_help(skb);
2621cant_help:
2622	return false;
2623}
2624EXPORT_SYMBOL(__skb_csum_offload_chk);
2625
2626__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2627{
2628	__be16 type = skb->protocol;
2629
2630	/* Tunnel gso handlers can set protocol to ethernet. */
2631	if (type == htons(ETH_P_TEB)) {
2632		struct ethhdr *eth;
2633
2634		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2635			return 0;
2636
2637		eth = (struct ethhdr *)skb_mac_header(skb);
2638		type = eth->h_proto;
2639	}
2640
2641	return __vlan_get_protocol(skb, type, depth);
2642}
2643
2644/**
2645 *	skb_mac_gso_segment - mac layer segmentation handler.
2646 *	@skb: buffer to segment
2647 *	@features: features for the output path (see dev->features)
2648 */
2649struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2650				    netdev_features_t features)
2651{
2652	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2653	struct packet_offload *ptype;
2654	int vlan_depth = skb->mac_len;
2655	__be16 type = skb_network_protocol(skb, &vlan_depth);
2656
2657	if (unlikely(!type))
2658		return ERR_PTR(-EINVAL);
2659
2660	__skb_pull(skb, vlan_depth);
2661
2662	rcu_read_lock();
2663	list_for_each_entry_rcu(ptype, &offload_base, list) {
2664		if (ptype->type == type && ptype->callbacks.gso_segment) {
2665			segs = ptype->callbacks.gso_segment(skb, features);
2666			break;
2667		}
2668	}
2669	rcu_read_unlock();
2670
2671	__skb_push(skb, skb->data - skb_mac_header(skb));
2672
2673	return segs;
2674}
2675EXPORT_SYMBOL(skb_mac_gso_segment);
2676
2677
2678/* openvswitch calls this on rx path, so we need a different check.
2679 */
2680static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2681{
2682	if (tx_path)
2683		return skb->ip_summed != CHECKSUM_PARTIAL;
2684	else
2685		return skb->ip_summed == CHECKSUM_NONE;
2686}
2687
2688/**
2689 *	__skb_gso_segment - Perform segmentation on skb.
2690 *	@skb: buffer to segment
2691 *	@features: features for the output path (see dev->features)
2692 *	@tx_path: whether it is called in TX path
2693 *
2694 *	This function segments the given skb and returns a list of segments.
2695 *
2696 *	It may return NULL if the skb requires no segmentation.  This is
2697 *	only possible when GSO is used for verifying header integrity.
2698 *
2699 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2700 */
2701struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2702				  netdev_features_t features, bool tx_path)
2703{
2704	if (unlikely(skb_needs_check(skb, tx_path))) {
2705		int err;
2706
2707		skb_warn_bad_offload(skb);
2708
2709		err = skb_cow_head(skb, 0);
2710		if (err < 0)
2711			return ERR_PTR(err);
2712	}
2713
2714	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2715		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2716
2717	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2718	SKB_GSO_CB(skb)->encap_level = 0;
2719
2720	skb_reset_mac_header(skb);
2721	skb_reset_mac_len(skb);
2722
2723	return skb_mac_gso_segment(skb, features);
2724}
2725EXPORT_SYMBOL(__skb_gso_segment);
2726
/* Report a hardware receive checksum error: rate-limited log line naming
 * the device (or "<unknown>") followed by a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2738
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if @skb has a page fragment @dev cannot DMA from (a highmem page
 * without NETIF_F_HIGHDMA, or a page beyond the parent device's DMA mask),
 * 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < shinfo->nr_frags; i++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[i])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		pdev = dev->dev.parent;
		if (!pdev)
			return 0;

		for (i = 0; i < shinfo->nr_frags; i++) {
			skb_frag_t *frag = &shinfo->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!pdev->dma_mask)
				return 1;
			if (addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2771
2772/* If MPLS offload request, verify we are testing hardware MPLS features
2773 * instead of standard features for the netdev.
2774 */
2775#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2776static netdev_features_t net_mpls_features(struct sk_buff *skb,
2777					   netdev_features_t features,
2778					   __be16 type)
2779{
2780	if (eth_p_mpls(type))
2781		features &= skb->dev->mpls_features;
2782
2783	return features;
2784}
2785#else
2786static netdev_features_t net_mpls_features(struct sk_buff *skb,
2787					   netdev_features_t features,
2788					   __be16 type)
2789{
2790	return features;
2791}
2792#endif
2793
2794static netdev_features_t harmonize_features(struct sk_buff *skb,
2795	netdev_features_t features)
2796{
2797	int tmp;
2798	__be16 type;
2799
2800	type = skb_network_protocol(skb, &tmp);
2801	features = net_mpls_features(skb, features, type);
2802
2803	if (skb->ip_summed != CHECKSUM_NONE &&
2804	    !can_checksum_protocol(features, type)) {
2805		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2806	} else if (illegal_highdma(skb->dev, skb)) {
2807		features &= ~NETIF_F_SG;
2808	}
2809
2810	return features;
2811}
2812
2813netdev_features_t passthru_features_check(struct sk_buff *skb,
2814					  struct net_device *dev,
2815					  netdev_features_t features)
2816{
2817	return features;
2818}
2819EXPORT_SYMBOL(passthru_features_check);
2820
2821static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2822					     struct net_device *dev,
2823					     netdev_features_t features)
2824{
2825	return vlan_features_check(skb, features);
2826}
2827
2828netdev_features_t netif_skb_features(struct sk_buff *skb)
2829{
2830	struct net_device *dev = skb->dev;
2831	netdev_features_t features = dev->features;
2832	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2833
2834	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2835		features &= ~NETIF_F_GSO_MASK;
2836
2837	/* If encapsulation offload request, verify we are testing
2838	 * hardware encapsulation features instead of standard
2839	 * features for the netdev
2840	 */
2841	if (skb->encapsulation)
2842		features &= dev->hw_enc_features;
2843
2844	if (skb_vlan_tagged(skb))
2845		features = netdev_intersect_features(features,
2846						     dev->vlan_features |
2847						     NETIF_F_HW_VLAN_CTAG_TX |
2848						     NETIF_F_HW_VLAN_STAG_TX);
2849
2850	if (dev->netdev_ops->ndo_features_check)
2851		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2852								features);
2853	else
2854		features &= dflt_features_check(skb, dev, features);
2855
2856	return harmonize_features(skb, features);
2857}
2858EXPORT_SYMBOL(netif_skb_features);
2859
2860static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2861		    struct netdev_queue *txq, bool more)
2862{
2863	unsigned int len;
2864	int rc;
2865
2866	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2867		dev_queue_xmit_nit(skb, dev);
2868
2869	len = skb->len;
2870	trace_net_dev_start_xmit(skb, dev);
2871	rc = netdev_start_xmit(skb, dev, txq, more);
2872	trace_net_dev_xmit(skb, rc, dev, len);
2873
2874	return rc;
2875}
2876
2877struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2878				    struct netdev_queue *txq, int *ret)
2879{
2880	struct sk_buff *skb = first;
2881	int rc = NETDEV_TX_OK;
2882
2883	while (skb) {
2884		struct sk_buff *next = skb->next;
2885
2886		skb->next = NULL;
2887		rc = xmit_one(skb, dev, txq, next != NULL);
2888		if (unlikely(!dev_xmit_complete(rc))) {
2889			skb->next = next;
2890			goto out;
2891		}
2892
2893		skb = next;
2894		if (netif_xmit_stopped(txq) && skb) {
2895			rc = NETDEV_TX_BUSY;
2896			break;
2897		}
2898	}
2899
2900out:
2901	*ret = rc;
2902	return skb;
2903}
2904
2905static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2906					  netdev_features_t features)
2907{
2908	if (skb_vlan_tag_present(skb) &&
2909	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2910		skb = __vlan_hwaccel_push_inside(skb);
2911	return skb;
2912}
2913
2914static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2915{
2916	netdev_features_t features;
2917
2918	if (skb->next)
2919		return skb;
2920
2921	features = netif_skb_features(skb);
2922	skb = validate_xmit_vlan(skb, features);
2923	if (unlikely(!skb))
2924		goto out_null;
2925
2926	if (netif_needs_gso(skb, features)) {
2927		struct sk_buff *segs;
2928
2929		segs = skb_gso_segment(skb, features);
2930		if (IS_ERR(segs)) {
2931			goto out_kfree_skb;
2932		} else if (segs) {
2933			consume_skb(skb);
2934			skb = segs;
2935		}
2936	} else {
2937		if (skb_needs_linearize(skb, features) &&
2938		    __skb_linearize(skb))
2939			goto out_kfree_skb;
2940
2941		/* If packet is not checksummed and device does not
2942		 * support checksumming for this protocol, complete
2943		 * checksumming here.
2944		 */
2945		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2946			if (skb->encapsulation)
2947				skb_set_inner_transport_header(skb,
2948							       skb_checksum_start_offset(skb));
2949			else
2950				skb_set_transport_header(skb,
2951							 skb_checksum_start_offset(skb));
2952			if (!(features & NETIF_F_CSUM_MASK) &&
2953			    skb_checksum_help(skb))
2954				goto out_kfree_skb;
2955		}
2956	}
2957
2958	return skb;
2959
2960out_kfree_skb:
2961	kfree_skb(skb);
2962out_null:
2963	return NULL;
2964}
2965
2966struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2967{
2968	struct sk_buff *next, *head = NULL, *tail;
2969
2970	for (; skb != NULL; skb = next) {
2971		next = skb->next;
2972		skb->next = NULL;
2973
2974		/* in case skb wont be segmented, point to itself */
2975		skb->prev = skb;
2976
2977		skb = validate_xmit_skb(skb, dev);
2978		if (!skb)
2979			continue;
2980
2981		if (!head)
2982			head = skb;
2983		else
2984			tail->next = skb;
2985		/* If skb was segmented, skb->prev points to
2986		 * the last segment. If not, it still contains skb.
2987		 */
2988		tail = skb->prev;
2989	}
2990	return head;
2991}
2992
2993static void qdisc_pkt_len_init(struct sk_buff *skb)
2994{
2995	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2996
2997	qdisc_skb_cb(skb)->pkt_len = skb->len;
2998
2999	/* To get more precise estimation of bytes sent on wire,
3000	 * we add to pkt_len the headers size of all segments
3001	 */
3002	if (shinfo->gso_size)  {
3003		unsigned int hdr_len;
3004		u16 gso_segs = shinfo->gso_segs;
3005
3006		/* mac layer + network layer */
3007		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3008
3009		/* + transport layer */
3010		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3011			hdr_len += tcp_hdrlen(skb);
3012		else
3013			hdr_len += sizeof(struct udphdr);
3014
3015		if (shinfo->gso_type & SKB_GSO_DODGY)
3016			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3017						shinfo->gso_size);
3018
3019		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3020	}
3021}
3022
3023static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3024				 struct net_device *dev,
3025				 struct netdev_queue *txq)
3026{
3027	spinlock_t *root_lock = qdisc_lock(q);
3028	bool contended;
3029	int rc;
3030
3031	qdisc_calculate_pkt_len(skb, q);
3032	/*
3033	 * Heuristic to force contended enqueues to serialize on a
3034	 * separate lock before trying to get qdisc main lock.
3035	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
3036	 * often and dequeue packets faster.
3037	 */
3038	contended = qdisc_is_running(q);
3039	if (unlikely(contended))
3040		spin_lock(&q->busylock);
3041
3042	spin_lock(root_lock);
3043	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3044		kfree_skb(skb);
3045		rc = NET_XMIT_DROP;
3046	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3047		   qdisc_run_begin(q)) {
3048		/*
3049		 * This is a work-conserving queue; there are no old skbs
3050		 * waiting to be sent out; and the qdisc is not running -
3051		 * xmit the skb directly.
3052		 */
3053
3054		qdisc_bstats_update(q, skb);
3055
3056		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3057			if (unlikely(contended)) {
3058				spin_unlock(&q->busylock);
3059				contended = false;
3060			}
3061			__qdisc_run(q);
3062		} else
3063			qdisc_run_end(q);
3064
3065		rc = NET_XMIT_SUCCESS;
3066	} else {
3067		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
3068		if (qdisc_run_begin(q)) {
3069			if (unlikely(contended)) {
3070				spin_unlock(&q->busylock);
3071				contended = false;
3072			}
3073			__qdisc_run(q);
3074		}
3075	}
3076	spin_unlock(root_lock);
3077	if (unlikely(contended))
3078		spin_unlock(&q->busylock);
3079	return rc;
3080}
3081
3082#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3083static void skb_update_prio(struct sk_buff *skb)
3084{
3085	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3086
3087	if (!skb->priority && skb->sk && map) {
3088		unsigned int prioidx =
3089			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3090
3091		if (prioidx < map->priomap_len)
3092			skb->priority = map->priomap[prioidx];
3093	}
3094}
3095#else
3096#define skb_update_prio(skb)
3097#endif
3098
/* Per-CPU integer counter, exported to modules.
 * NOTE(review): presumably tracks nested transmit depth on each CPU; its
 * users are outside this part of the file -- confirm.
 */
3099DEFINE_PER_CPU(int, xmit_recursion);
3100EXPORT_SYMBOL(xmit_recursion);
3101
/* NOTE(review): presumably the upper bound applied to xmit_recursion;
 * confirm against the code that compares them.
 */
3102#define RECURSION_LIMIT 10
3103
3104/**
3105 *	dev_loopback_xmit - loop back @skb
3106 *	@net: network namespace this loopback is happening in
3107 *	@sk:  sk needed to be a netfilter okfn
3108 *	@skb: buffer to transmit
3109 */
3110int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3111{
3112	skb_reset_mac_header(skb);
3113	__skb_pull(skb, skb_network_offset(skb));
3114	skb->pkt_type = PACKET_LOOPBACK;
3115	skb->ip_summed = CHECKSUM_UNNECESSARY;
3116	WARN_ON(!skb_dst(skb));
3117	skb_dst_force(skb);
3118	netif_rx_ni(skb);
3119	return 0;
3120}
3121EXPORT_SYMBOL(dev_loopback_xmit);
3122
#ifdef CONFIG_NET_EGRESS
/*
 * Run the device's egress classifier list, if any, on @skb.
 *
 * Returns @skb when transmission should continue, or NULL when the skb was
 * dropped, taken over or redirected; in the NULL case *@ret holds the
 * NET_XMIT_* status to report.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *filters = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result res;

	if (!filters)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
	 * earlier by the caller.
	 */
	qdisc_bstats_cpu_update(filters->q, skb);

	switch (tc_classify(skb, filters, &res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(filters->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		*ret = NET_XMIT_SUCCESS;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
3165
/*
 * Look up a Tx queue for @skb in the device's XPS maps, indexed by
 * skb->sender_cpu - 1.  Returns the queue index, or -1 if there is no map
 * for that CPU, the chosen queue is not below real_num_tx_queues, or
 * CONFIG_XPS is disabled.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_queues;
	int txq = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_queues = rcu_dereference(maps->cpu_map[skb->sender_cpu - 1]);
	if (!cpu_queues)
		goto unlock;

	/* One candidate: take it.  Several: spread by the skb hash. */
	if (cpu_queues->len == 1)
		txq = cpu_queues->queues[0];
	else
		txq = cpu_queues->queues[reciprocal_scale(skb_get_hash(skb),
							  cpu_queues->len)];

	if (unlikely(txq >= dev->real_num_tx_queues))
		txq = -1;

unlock:
	rcu_read_unlock();

	return txq;
#else
	return -1;
#endif
}
3195
3196static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3197{
3198	struct sock *sk = skb->sk;
3199	int queue_index = sk_tx_queue_get(sk);
3200
3201	if (queue_index < 0 || skb->ooo_okay ||
3202	    queue_index >= dev->real_num_tx_queues) {
3203		int new_index = get_xps_queue(dev, skb);
3204		if (new_index < 0)
3205			new_index = skb_tx_hash(dev, skb);
3206
3207		if (queue_index != new_index && sk &&
3208		    sk_fullsock(sk) &&
3209		    rcu_access_pointer(sk->sk_dst_cache))
3210			sk_tx_queue_set(sk, new_index);
3211
3212		queue_index = new_index;
3213	}
3214
3215	return queue_index;
3216}
3217
3218struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3219				    struct sk_buff *skb,
3220				    void *accel_priv)
3221{
3222	int queue_index = 0;
3223
3224#ifdef CONFIG_XPS
3225	u32 sender_cpu = skb->sender_cpu - 1;
3226
3227	if (sender_cpu >= (u32)NR_CPUS)
3228		skb->sender_cpu = raw_smp_processor_id() + 1;
3229#endif
3230
3231	if (dev->real_num_tx_queues != 1) {
3232		const struct net_device_ops *ops = dev->netdev_ops;
3233		if (ops->ndo_select_queue)
3234			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3235							    __netdev_pick_tx);
3236		else
3237			queue_index = __netdev_pick_tx(dev, skb);
3238
3239		if (!accel_priv)
3240			queue_index = netdev_cap_txqueue(dev, queue_index);
3241	}
3242
3243	skb_set_queue_mapping(skb, queue_index);
3244	return netdev_get_tx_queue(dev, queue_index);
3245}
3246
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      This function can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can
 *      also be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is
 *      difficult to retry a send through this function.  (You can bump the
 *      ref count before sending to hold a reference for a retry, if you are
 *      careful.)
 *
 *      When calling this function, interrupts MUST be enabled.  This is
 *      because the BH enable code must have IRQs enabled so that it will
 *      not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
# ifdef CONFIG_NET_EGRESS
	if (static_key_false(&egress_needed)) {
		/* A NULL return means the egress hook took the skb; rc was
		 * handed to the hook so it can report the outcome.
		 */
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
	}
# endif
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * it's hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

#ifdef CONFIG_NET_SWITCHDEV
	/* Don't forward if offload device already forwarded */
	if (skb->offload_fwd_mark &&
	    skb->offload_fwd_mark == dev->offload_fwd_mark) {
		consume_skb(skb);
		rc = NET_XMIT_SUCCESS;
		goto out;
	}
#endif

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method owns the packet from here on. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. This is the common case for software
	 * devices: loopback, all sorts of tunnels...
	 *
	 * It is unlikely that netif_tx_lock protection is really necessary
	 * here (e.g. loopback and IP tunnels are clean, ignoring statistics
	 * counters).  However, such devices may rely on the protection
	 * provided here, so take the lock anyway; the xmit_lock_owner check
	 * below avoids self-deadlock when the same CPU re-enters.
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			/* Bound the per-CPU transmit nesting depth. */
			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			/* NULL here means there is nothing left to send;
			 * rc keeps whatever value it had (initially -ENOMEM).
			 */
			skb = validate_xmit_skb(skb, dev);
			if (!skb)
				goto drop;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	rc = -ENETDOWN;
drop:
	rcu_read_unlock_bh();

	/* Account the drop and free whatever remains of the skb list. */
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
3389
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Calls __dev_queue_xmit() with a NULL @accel_priv and returns its
 *	result.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
3395
/**
 *	dev_queue_xmit_accel - transmit a buffer with offload private data
 *	@skb: buffer to transmit
 *	@accel_priv: private data passed through to __dev_queue_xmit()
 *
 *	Calls __dev_queue_xmit() with the caller's @accel_priv and returns
 *	its result.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3401
3402
3403/*=======================================================================
3404			Receiver routines
3405  =======================================================================*/
3406
/* Cap on the per-CPU input_pkt_queue length: enqueue_to_backlog() drops
 * packets once the queue is longer than this, and skb_flow_limit() starts
 * working at half of it.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Passed to net_timestamp_check() on the receive entry points (and negated
 * in __netif_receive_skb_core()) to choose where that check applies.
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this section; presumably the NET_RX
 * softirq poll budget - confirm against its users.
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
3413
/*
 * Append @napi to the poll list of @sd and raise NET_RX_SOFTIRQ.
 * Called with irq disabled.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
3421
3422#ifdef CONFIG_RPS
3423
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask that get_rps_cpu() uses to split a sock flow table entry into its
 * CPU part (entry & mask) and the hash bits compared against the skb hash.
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* Static key tested on the receive paths before any RPS work is done. */
struct static_key rps_needed __read_mostly;
3431
3432static struct rps_dev_flow *
3433set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3434	    struct rps_dev_flow *rflow, u16 next_cpu)
3435{
3436	if (next_cpu < nr_cpu_ids) {
3437#ifdef CONFIG_RFS_ACCEL
3438		struct netdev_rx_queue *rxqueue;
3439		struct rps_dev_flow_table *flow_table;
3440		struct rps_dev_flow *old_rflow;
3441		u32 flow_id;
3442		u16 rxq_index;
3443		int rc;
3444
3445		/* Should we steer this flow to a different hardware queue? */
3446		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3447		    !(dev->features & NETIF_F_NTUPLE))
3448			goto out;
3449		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3450		if (rxq_index == skb_get_rx_queue(skb))
3451			goto out;
3452
3453		rxqueue = dev->_rx + rxq_index;
3454		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3455		if (!flow_table)
3456			goto out;
3457		flow_id = skb_get_hash(skb) & flow_table->mask;
3458		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3459							rxq_index, flow_id);
3460		if (rc < 0)
3461			goto out;
3462		old_rflow = rflow;
3463		rflow = &flow_table->flows[flow_id];
3464		rflow->filter = rc;
3465		if (old_rflow->filter == rflow->filter)
3466			old_rflow->filter = RPS_NO_FILTER;
3467	out:
3468#endif
3469		rflow->last_qtail =
3470			per_cpu(softnet_data, next_cpu).input_queue_head;
3471	}
3472
3473	rflow->cpu = next_cpu;
3474	return rflow;
3475}
3476
3477/*
3478 * get_rps_cpu is called from netif_receive_skb and returns the target
3479 * CPU from the RPS map of the receiving queue for a given skb.
3480 * rcu_read_lock must be held on entry.
3481 */
3482static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3483		       struct rps_dev_flow **rflowp)
3484{
3485	const struct rps_sock_flow_table *sock_flow_table;
3486	struct netdev_rx_queue *rxqueue = dev->_rx;
3487	struct rps_dev_flow_table *flow_table;
3488	struct rps_map *map;
3489	int cpu = -1;
3490	u32 tcpu;
3491	u32 hash;
3492
3493	if (skb_rx_queue_recorded(skb)) {
3494		u16 index = skb_get_rx_queue(skb);
3495
3496		if (unlikely(index >= dev->real_num_rx_queues)) {
3497			WARN_ONCE(dev->real_num_rx_queues > 1,
3498				  "%s received packet on queue %u, but number "
3499				  "of RX queues is %u\n",
3500				  dev->name, index, dev->real_num_rx_queues);
3501			goto done;
3502		}
3503		rxqueue += index;
3504	}
3505
3506	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3507
3508	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3509	map = rcu_dereference(rxqueue->rps_map);
3510	if (!flow_table && !map)
3511		goto done;
3512
3513	skb_reset_network_header(skb);
3514	hash = skb_get_hash(skb);
3515	if (!hash)
3516		goto done;
3517
3518	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3519	if (flow_table && sock_flow_table) {
3520		struct rps_dev_flow *rflow;
3521		u32 next_cpu;
3522		u32 ident;
3523
3524		/* First check into global flow table if there is a match */
3525		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3526		if ((ident ^ hash) & ~rps_cpu_mask)
3527			goto try_rps;
3528
3529		next_cpu = ident & rps_cpu_mask;
3530
3531		/* OK, now we know there is a match,
3532		 * we can look at the local (per receive queue) flow table
3533		 */
3534		rflow = &flow_table->flows[hash & flow_table->mask];
3535		tcpu = rflow->cpu;
3536
3537		/*
3538		 * If the desired CPU (where last recvmsg was done) is
3539		 * different from current CPU (one in the rx-queue flow
3540		 * table entry), switch if one of the following holds:
3541		 *   - Current CPU is unset (>= nr_cpu_ids).
3542		 *   - Current CPU is offline.
3543		 *   - The current CPU's queue tail has advanced beyond the
3544		 *     last packet that was enqueued using this table entry.
3545		 *     This guarantees that all previous packets for the flow
3546		 *     have been dequeued, thus preserving in order delivery.
3547		 */
3548		if (unlikely(tcpu != next_cpu) &&
3549		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3550		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3551		      rflow->last_qtail)) >= 0)) {
3552			tcpu = next_cpu;
3553			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3554		}
3555
3556		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3557			*rflowp = rflow;
3558			cpu = tcpu;
3559			goto done;
3560		}
3561	}
3562
3563try_rps:
3564
3565	if (map) {
3566		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3567		if (cpu_online(tcpu)) {
3568			cpu = tcpu;
3569			goto done;
3570		}
3571	}
3572
3573done:
3574	return cpu;
3575}
3576
3577#ifdef CONFIG_RFS_ACCEL
3578
3579/**
3580 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3581 * @dev: Device on which the filter was set
3582 * @rxq_index: RX queue index
3583 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3584 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3585 *
3586 * Drivers that implement ndo_rx_flow_steer() should periodically call
3587 * this function for each installed filter and remove the filters for
3588 * which it returns %true.
3589 */
3590bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3591			 u32 flow_id, u16 filter_id)
3592{
3593	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3594	struct rps_dev_flow_table *flow_table;
3595	struct rps_dev_flow *rflow;
3596	bool expire = true;
3597	unsigned int cpu;
3598
3599	rcu_read_lock();
3600	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3601	if (flow_table && flow_id <= flow_table->mask) {
3602		rflow = &flow_table->flows[flow_id];
3603		cpu = ACCESS_ONCE(rflow->cpu);
3604		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3605		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3606			   rflow->last_qtail) <
3607		     (int)(10 * flow_table->mask)))
3608			expire = false;
3609	}
3610	rcu_read_unlock();
3611	return expire;
3612}
3613EXPORT_SYMBOL(rps_may_expire_flow);
3614
3615#endif /* CONFIG_RFS_ACCEL */
3616
/*
 * Schedule the backlog NAPI of the softnet_data passed in @data and count
 * the event in its received_rps counter.
 * Called from hardirq (IPI) context.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
3625
3626#endif /* CONFIG_RPS */
3627
/*
 * Check whether @sd belongs to another CPU.
 * If it does, push it onto this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	/* Link the remote softnet_data at the head of our pending list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3648
#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12).
 * NOTE(review): not referenced in this section; presumably the number of
 * buckets allocated for each sd_flow_limit table - confirm.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3652
3653static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3654{
3655#ifdef CONFIG_NET_FLOW_LIMIT
3656	struct sd_flow_limit *fl;
3657	struct softnet_data *sd;
3658	unsigned int old_flow, new_flow;
3659
3660	if (qlen < (netdev_max_backlog >> 1))
3661		return false;
3662
3663	sd = this_cpu_ptr(&softnet_data);
3664
3665	rcu_read_lock();
3666	fl = rcu_dereference(sd->flow_limit);
3667	if (fl) {
3668		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3669		old_flow = fl->history[fl->history_head];
3670		fl->history[fl->history_head] = new_flow;
3671
3672		fl->history_head++;
3673		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3674
3675		if (likely(fl->buckets[old_flow]))
3676			fl->buckets[old_flow]--;
3677
3678		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3679			fl->count++;
3680			rcu_read_unlock();
3681			return true;
3682		}
3683	}
3684	rcu_read_unlock();
3685#endif
3686	return false;
3687}
3688
3689/*
3690 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3691 * queue (may be a remote CPU queue).
3692 */
3693static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3694			      unsigned int *qtail)
3695{
3696	struct softnet_data *sd;
3697	unsigned long flags;
3698	unsigned int qlen;
3699
3700	sd = &per_cpu(softnet_data, cpu);
3701
3702	local_irq_save(flags);
3703
3704	rps_lock(sd);
3705	if (!netif_running(skb->dev))
3706		goto drop;
3707	qlen = skb_queue_len(&sd->input_pkt_queue);
3708	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3709		if (qlen) {
3710enqueue:
3711			__skb_queue_tail(&sd->input_pkt_queue, skb);
3712			input_queue_tail_incr_save(sd, qtail);
3713			rps_unlock(sd);
3714			local_irq_restore(flags);
3715			return NET_RX_SUCCESS;
3716		}
3717
3718		/* Schedule NAPI for backlog device
3719		 * We can use non atomic operation since we own the queue lock
3720		 */
3721		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3722			if (!rps_ipi_queued(sd))
3723				____napi_schedule(sd, &sd->backlog);
3724		}
3725		goto enqueue;
3726	}
3727
3728drop:
3729	sd->dropped++;
3730	rps_unlock(sd);
3731
3732	local_irq_restore(flags);
3733
3734	atomic_long_inc(&skb->dev->rx_dropped);
3735	kfree_skb(skb);
3736	return NET_RX_DROP;
3737}
3738
3739static int netif_rx_internal(struct sk_buff *skb)
3740{
3741	int ret;
3742
3743	net_timestamp_check(netdev_tstamp_prequeue, skb);
3744
3745	trace_netif_rx(skb);
3746#ifdef CONFIG_RPS
3747	if (static_key_false(&rps_needed)) {
3748		struct rps_dev_flow voidflow, *rflow = &voidflow;
3749		int cpu;
3750
3751		preempt_disable();
3752		rcu_read_lock();
3753
3754		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3755		if (cpu < 0)
3756			cpu = smp_processor_id();
3757
3758		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3759
3760		rcu_read_unlock();
3761		preempt_enable();
3762	} else
3763#endif
3764	{
3765		unsigned int qtail;
3766		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3767		put_cpu();
3768	}
3769	return ret;
3770}
3771
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	Emits the netif_rx_entry trace event, then hands the buffer to
 *	netif_rx_internal().
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	trace_netif_rx_entry(skb);

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL(netif_rx);
3794
/**
 *	netif_rx_ni	-	post buffer and run pending softirqs
 *	@skb: buffer to post
 *
 *	Queues the buffer through netif_rx_internal() with preemption
 *	disabled, then runs do_softirq() if a softirq is pending on this CPU.
 *	Returns the netif_rx_internal() result.
 *	NOTE(review): the "_ni" suffix presumably means "non-interrupt
 *	context" - confirm against callers.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	err = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}
EXPORT_SYMBOL(netif_rx_ni);
3810
3811static void net_tx_action(struct softirq_action *h)
3812{
3813	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3814
3815	if (sd->completion_queue) {
3816		struct sk_buff *clist;
3817
3818		local_irq_disable();
3819		clist = sd->completion_queue;
3820		sd->completion_queue = NULL;
3821		local_irq_enable();
3822
3823		while (clist) {
3824			struct sk_buff *skb = clist;
3825			clist = clist->next;
3826
3827			WARN_ON(atomic_read(&skb->users));
3828			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3829				trace_consume_skb(skb);
3830			else
3831				trace_kfree_skb(skb, net_tx_action);
3832
3833			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3834				__kfree_skb(skb);
3835			else
3836				__kfree_skb_defer(skb);
3837		}
3838
3839		__kfree_skb_flush();
3840	}
3841
3842	if (sd->output_queue) {
3843		struct Qdisc *head;
3844
3845		local_irq_disable();
3846		head = sd->output_queue;
3847		sd->output_queue = NULL;
3848		sd->output_queue_tailp = &sd->output_queue;
3849		local_irq_enable();
3850
3851		while (head) {
3852			struct Qdisc *q = head;
3853			spinlock_t *root_lock;
3854
3855			head = head->next_sched;
3856
3857			root_lock = qdisc_lock(q);
3858			if (spin_trylock(root_lock)) {
3859				smp_mb__before_atomic();
3860				clear_bit(__QDISC_STATE_SCHED,
3861					  &q->state);
3862				qdisc_run(q);
3863				spin_unlock(root_lock);
3864			} else {
3865				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3866					      &q->state)) {
3867					__netif_reschedule(q);
3868				} else {
3869					smp_mb__before_atomic();
3870					clear_bit(__QDISC_STATE_SCHED,
3871						  &q->state);
3872				}
3873			}
3874		}
3875	}
3876}
3877
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE.  It is only built when both the
 * bridge and ATM LANE are configured (built-in or modular), and takes a
 * device and an address.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3885
/*
 * sch_handle_ingress - run the ingress classifier chain on @skb
 * @skb: received packet
 * @pt_prev: pending packet_type delivery; flushed before classification
 * @ret: receives the result of that flushed delivery
 * @orig_dev: device the packet originally arrived on
 *
 * Returns the skb when processing should continue, or NULL when the
 * classifier verdict consumed it (dropped, stolen, queued or redirected).
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result cl_res;
	int verdict;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!cl)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(cl->q, skb);

	verdict = tc_classify(skb, cl, &cl_res, false);
	switch (verdict) {
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
		/* fall through - a shot packet is freed like the others */
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		kfree_skb(skb);
		return NULL;
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
3935
/**
 *	netdev_rx_handler_register - register receive handler
 *	@dev: device to register a handler for
 *	@rx_handler: receive handler to register
 *	@rx_handler_data: data pointer that is used by rx handler
 *
 *	Register a receive handler for a device. This handler will then be
 *	called from __netif_receive_skb. A negative errno code is returned
 *	on a failure: -EBUSY when the device already has a handler.
 *	Returns 0 on success.
 *
 *	The caller must hold the rtnl_mutex.
 *
 *	For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	/* Only one handler per device. */
	if (dev->rx_handler)
		return -EBUSY;

	/* Note: rx_handler_data must be set before rx_handler */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3966
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device.  The handler pointer is
 *	cleared first and the handler data only after synchronize_net()
 *	returns.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3988
3989/*
3990 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3991 * the special handling of PFMEMALLOC skbs.
3992 */
3993static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3994{
3995	switch (skb->protocol) {
3996	case htons(ETH_P_ARP):
3997	case htons(ETH_P_IP):
3998	case htons(ETH_P_IPV6):
3999	case htons(ETH_P_8021Q):
4000	case htons(ETH_P_8021AD):
4001		return true;
4002	default:
4003		return false;
4004	}
4005}
4006
/*
 * nf_ingress - run the netfilter ingress hook, if one is active for @skb
 * @skb: received packet
 * @pt_prev: pending packet_type delivery; flushed before the hook runs
 * @ret: receives the result of that flushed delivery
 * @orig_dev: device the packet originally arrived on
 *
 * Returns the nf_hook_ingress() result, or 0 when no hook is active or
 * CONFIG_NETFILTER_INGRESS is disabled.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	if (!nf_hook_ingress_active(skb))
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return nf_hook_ingress(skb);
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4022
/*
 * __netif_receive_skb_core - deliver one received skb to its consumers
 * @skb: packet to process
 * @pfmemalloc: true when the caller is handling a PFMEMALLOC skb; taps are
 *		skipped and only skb_pfmemalloc_protocol() protocols pass
 *
 * Delivery is deferred by one step: each matching packet_type is remembered
 * in pt_prev and only handed the skb (through deliver_skb()) once the next
 * consumer is found, so the last one can be called directly with
 * pt_prev->func() at the end.
 *
 * Processing restarts at another_round when vlan_do_receive() or the
 * device's rx_handler asks for it.  Returns the last delivery result, or
 * NET_RX_DROP when nobody took the packet.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	/* The VLAN tag is still in the packet data: untag it first. */
	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip the taps and the ingress hooks. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	/* Global taps first, then the taps bound to this device. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_key_false(&ingress_needed)) {
		/* NULL means the ingress classifier consumed the skb. */
		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
		if (!skb)
			goto out;

		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
ncls:
#endif
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		/* vlan_do_receive() may replace or consume the skb. */
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(skb_vlan_tag_present(skb))) {
		if (skb_vlan_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	/* The last pending consumer is called directly with the skb. */
	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		/* Count as rx_nohandler when only exact matches were allowed. */
		if (!deliver_exact)
			atomic_long_inc(&skb->dev->rx_dropped);
		else
			atomic_long_inc(&skb->dev->rx_nohandler);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	return ret;
}
4177
4178static int __netif_receive_skb(struct sk_buff *skb)
4179{
4180	int ret;
4181
4182	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4183		unsigned long pflags = current->flags;
4184
4185		/*
4186		 * PFMEMALLOC skbs are special, they should
4187		 * - be delivered to SOCK_MEMALLOC sockets only
4188		 * - stay away from userspace
4189		 * - have bounded memory usage
4190		 *
4191		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4192		 * context down to all allocation sites.
4193		 */
4194		current->flags |= PF_MEMALLOC;
4195		ret = __netif_receive_skb_core(skb, true);
4196		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4197	} else
4198		ret = __netif_receive_skb_core(skb, false);
4199
4200	return ret;
4201}
4202
4203static int netif_receive_skb_internal(struct sk_buff *skb)
4204{
4205	int ret;
4206
4207	net_timestamp_check(netdev_tstamp_prequeue, skb);
4208
4209	if (skb_defer_rx_timestamp(skb))
4210		return NET_RX_SUCCESS;
4211
4212	rcu_read_lock();
4213
4214#ifdef CONFIG_RPS
4215	if (static_key_false(&rps_needed)) {
4216		struct rps_dev_flow voidflow, *rflow = &voidflow;
4217		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4218
4219		if (cpu >= 0) {
4220			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4221			rcu_read_unlock();
4222			return ret;
4223		}
4224	}
4225#endif
4226	ret = __netif_receive_skb(skb);
4227	rcu_read_unlock();
4228	return ret;
4229}
4230
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Emits the netif_receive_skb_entry trace event, then hands the buffer
 *	to netif_receive_skb_internal().
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	trace_netif_receive_skb_entry(skb);

	return netif_receive_skb_internal(skb);
}
EXPORT_SYMBOL(netif_receive_skb);
4253
4254/* Network device is going away, flush any packets still pending
4255 * Called with irqs disabled.
4256 */
4257static void flush_backlog(void *arg)
4258{
4259	struct net_device *dev = arg;
4260	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4261	struct sk_buff *skb, *tmp;
4262
4263	rps_lock(sd);
4264	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4265		if (skb->dev == dev) {
4266			__skb_unlink(skb, &sd->input_pkt_queue);
4267			kfree_skb(skb);
4268			input_queue_head_incr(sd);
4269		}
4270	}
4271	rps_unlock(sd);
4272
4273	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4274		if (skb->dev == dev) {
4275			__skb_unlink(skb, &sd->process_queue);
4276			kfree_skb(skb);
4277			input_queue_head_incr(sd);
4278		}
4279	}
4280}
4281
4282static int napi_gro_complete(struct sk_buff *skb)
4283{
4284	struct packet_offload *ptype;
4285	__be16 type = skb->protocol;
4286	struct list_head *head = &offload_base;
4287	int err = -ENOENT;
4288
4289	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4290
4291	if (NAPI_GRO_CB(skb)->count == 1) {
4292		skb_shinfo(skb)->gso_size = 0;
4293		goto out;
4294	}
4295
4296	rcu_read_lock();
4297	list_for_each_entry_rcu(ptype, head, list) {
4298		if (ptype->type != type || !ptype->callbacks.gro_complete)
4299			continue;
4300
4301		err = ptype->callbacks.gro_complete(skb, 0);
4302		break;
4303	}
4304	rcu_read_unlock();
4305
4306	if (err) {
4307		WARN_ON(&ptype->list == head);
4308		kfree_skb(skb);
4309		return NET_RX_SUCCESS;
4310	}
4311
4312out:
4313	return netif_receive_skb_internal(skb);
4314}
4315
4316/* napi->gro_list contains packets ordered by age.
4317 * youngest packets at the head of it.
4318 * Complete skbs in reverse order to reduce latencies.
4319 */
4320void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4321{
4322	struct sk_buff *skb, *prev = NULL;
4323
4324	/* scan list and build reverse chain */
4325	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4326		skb->prev = prev;
4327		prev = skb;
4328	}
4329
4330	for (skb = prev; skb; skb = prev) {
4331		skb->next = NULL;
4332
4333		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4334			return;
4335
4336		prev = skb->prev;
4337		napi_gro_complete(skb);
4338		napi->gro_count--;
4339	}
4340
4341	napi->gro_list = NULL;
4342}
4343EXPORT_SYMBOL(napi_gro_flush);
4344
4345static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4346{
4347	struct sk_buff *p;
4348	unsigned int maclen = skb->dev->hard_header_len;
4349	u32 hash = skb_get_hash_raw(skb);
4350
4351	for (p = napi->gro_list; p; p = p->next) {
4352		unsigned long diffs;
4353
4354		NAPI_GRO_CB(p)->flush = 0;
4355
4356		if (hash != skb_get_hash_raw(p)) {
4357			NAPI_GRO_CB(p)->same_flow = 0;
4358			continue;
4359		}
4360
4361		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4362		diffs |= p->vlan_tci ^ skb->vlan_tci;
4363		diffs |= skb_metadata_dst_cmp(p, skb);
4364		if (maclen == ETH_HLEN)
4365			diffs |= compare_ether_header(skb_mac_header(p),
4366						      skb_mac_header(skb));
4367		else if (!diffs)
4368			diffs = memcmp(skb_mac_header(p),
4369				       skb_mac_header(skb),
4370				       maclen);
4371		NAPI_GRO_CB(p)->same_flow = !diffs;
4372	}
4373}
4374
4375static void skb_gro_reset_offset(struct sk_buff *skb)
4376{
4377	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4378	const skb_frag_t *frag0 = &pinfo->frags[0];
4379
4380	NAPI_GRO_CB(skb)->data_offset = 0;
4381	NAPI_GRO_CB(skb)->frag0 = NULL;
4382	NAPI_GRO_CB(skb)->frag0_len = 0;
4383
4384	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4385	    pinfo->nr_frags &&
4386	    !PageHighMem(skb_frag_page(frag0))) {
4387		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4388		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4389	}
4390}
4391
/*
 * Move the first @grow bytes of page fragment 0 into the linear area of
 * @skb, appending them at the tail pointer.
 *
 * The caller must ensure @grow bytes fit between skb->tail and skb->end;
 * the function BUG()s otherwise.  If fragment 0 is left empty it is
 * unreferenced and the remaining fragments are shifted down by one slot.
 */
static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
	struct skb_shared_info *pinfo = skb_shinfo(skb);

	BUG_ON(skb->end - skb->tail < grow);

	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

	/* the bytes are now linear: shrink paged length, extend the tail */
	skb->data_len -= grow;
	skb->tail += grow;

	/* fragment 0 now starts @grow bytes later and is that much shorter */
	pinfo->frags[0].page_offset += grow;
	skb_frag_size_sub(&pinfo->frags[0], grow);

	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
		skb_frag_unref(skb, 0);
		/* nr_frags is decremented first, so only the survivors move */
		memmove(pinfo->frags, pinfo->frags + 1,
			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
	}
}
4412
/*
 * Core GRO receive step: try to merge @skb into a packet already held on
 * napi->gro_list, or start holding it.
 *
 * Returns:
 *   GRO_NORMAL      - GRO not applicable; caller delivers @skb normally
 *   GRO_MERGED      - @skb was merged into a held packet (same_flow set)
 *   GRO_MERGED_FREE - as GRO_MERGED, and NAPI_GRO_CB(skb)->free is set
 *   GRO_HELD        - @skb was put at the head of napi->gro_list
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;
	int grow;

	/* GRO disabled on the receiving device */
	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	/* already GSO, carries a frag list, or flagged with a bad checksum */
	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
		goto normal;

	/* mark which held packets share L2 identity with this skb */
	gro_list_prepare(napi, skb);

	rcu_read_lock();
	/* only the first offload matching the protocol is consulted */
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->encap_mark = 0;
		NAPI_GRO_CB(skb)->is_fou = 0;
		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;

		/* Setup for GRO checksum validation */
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			NAPI_GRO_CB(skb)->csum = skb->csum;
			NAPI_GRO_CB(skb)->csum_valid = 1;
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			break;
		case CHECKSUM_UNNECESSARY:
			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
			NAPI_GRO_CB(skb)->csum_valid = 0;
			break;
		default:
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			NAPI_GRO_CB(skb)->csum_valid = 0;
		}

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* the walk reached the list head: no offload handles this protocol */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* the callback returned a list slot: unlink and complete that packet */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	/* skb was merged into a held packet; nothing left to pull */
	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		/* one out, one in: gro_count stays unchanged */
	} else {
		napi->gro_count++;
	}
	/* start holding skb as a new entry at the head of the list */
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/* if the GRO offset lies beyond the linear head, pull the
	 * difference out of fragment 0
	 */
	grow = skb_gro_offset(skb) - skb_headlen(skb);
	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
4520
4521struct packet_offload *gro_find_receive_by_type(__be16 type)
4522{
4523	struct list_head *offload_head = &offload_base;
4524	struct packet_offload *ptype;
4525
4526	list_for_each_entry_rcu(ptype, offload_head, list) {
4527		if (ptype->type != type || !ptype->callbacks.gro_receive)
4528			continue;
4529		return ptype;
4530	}
4531	return NULL;
4532}
4533EXPORT_SYMBOL(gro_find_receive_by_type);
4534
4535struct packet_offload *gro_find_complete_by_type(__be16 type)
4536{
4537	struct list_head *offload_head = &offload_base;
4538	struct packet_offload *ptype;
4539
4540	list_for_each_entry_rcu(ptype, offload_head, list) {
4541		if (ptype->type != type || !ptype->callbacks.gro_complete)
4542			continue;
4543		return ptype;
4544	}
4545	return NULL;
4546}
4547EXPORT_SYMBOL(gro_find_complete_by_type);
4548
4549static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4550{
4551	switch (ret) {
4552	case GRO_NORMAL:
4553		if (netif_receive_skb_internal(skb))
4554			ret = GRO_DROP;
4555		break;
4556
4557	case GRO_DROP:
4558		kfree_skb(skb);
4559		break;
4560
4561	case GRO_MERGED_FREE:
4562		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4563			skb_dst_drop(skb);
4564			kmem_cache_free(skbuff_head_cache, skb);
4565		} else {
4566			__kfree_skb(skb);
4567		}
4568		break;
4569
4570	case GRO_HELD:
4571	case GRO_MERGED:
4572		break;
4573	}
4574
4575	return ret;
4576}
4577
4578gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4579{
4580	skb_mark_napi_id(skb, napi);
4581	trace_napi_gro_receive_entry(skb);
4582
4583	skb_gro_reset_offset(skb);
4584
4585	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4586}
4587EXPORT_SYMBOL(napi_gro_receive);
4588
4589static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4590{
4591	if (unlikely(skb->pfmemalloc)) {
4592		consume_skb(skb);
4593		return;
4594	}
4595	__skb_pull(skb, skb_headlen(skb));
4596	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4597	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4598	skb->vlan_tci = 0;
4599	skb->dev = napi->dev;
4600	skb->skb_iif = 0;
4601	skb->encapsulation = 0;
4602	skb_shinfo(skb)->gso_type = 0;
4603	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4604
4605	napi->skb = skb;
4606}
4607
4608struct sk_buff *napi_get_frags(struct napi_struct *napi)
4609{
4610	struct sk_buff *skb = napi->skb;
4611
4612	if (!skb) {
4613		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4614		if (skb) {
4615			napi->skb = skb;
4616			skb_mark_napi_id(skb, napi);
4617		}
4618	}
4619	return skb;
4620}
4621EXPORT_SYMBOL(napi_get_frags);
4622
4623static gro_result_t napi_frags_finish(struct napi_struct *napi,
4624				      struct sk_buff *skb,
4625				      gro_result_t ret)
4626{
4627	switch (ret) {
4628	case GRO_NORMAL:
4629	case GRO_HELD:
4630		__skb_push(skb, ETH_HLEN);
4631		skb->protocol = eth_type_trans(skb, skb->dev);
4632		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4633			ret = GRO_DROP;
4634		break;
4635
4636	case GRO_DROP:
4637	case GRO_MERGED_FREE:
4638		napi_reuse_skb(napi, skb);
4639		break;
4640
4641	case GRO_MERGED:
4642		break;
4643	}
4644
4645	return ret;
4646}
4647
4648/* Upper GRO stack assumes network header starts at gro_offset=0
4649 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4650 * We copy ethernet header into skb->data to have a common layout.
4651 */
4652static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4653{
4654	struct sk_buff *skb = napi->skb;
4655	const struct ethhdr *eth;
4656	unsigned int hlen = sizeof(*eth);
4657
4658	napi->skb = NULL;
4659
4660	skb_reset_mac_header(skb);
4661	skb_gro_reset_offset(skb);
4662
4663	eth = skb_gro_header_fast(skb, 0);
4664	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4665		eth = skb_gro_header_slow(skb, hlen, 0);
4666		if (unlikely(!eth)) {
4667			napi_reuse_skb(napi, skb);
4668			return NULL;
4669		}
4670	} else {
4671		gro_pull_from_frag0(skb, hlen);
4672		NAPI_GRO_CB(skb)->frag0 += hlen;
4673		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4674	}
4675	__skb_pull(skb, hlen);
4676
4677	/*
4678	 * This works because the only protocols we care about don't require
4679	 * special handling.
4680	 * We'll fix it up properly in napi_frags_finish()
4681	 */
4682	skb->protocol = eth->h_proto;
4683
4684	return skb;
4685}
4686
4687gro_result_t napi_gro_frags(struct napi_struct *napi)
4688{
4689	struct sk_buff *skb = napi_frags_skb(napi);
4690
4691	if (!skb)
4692		return GRO_DROP;
4693
4694	trace_napi_gro_frags_entry(skb);
4695
4696	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4697}
4698EXPORT_SYMBOL(napi_gro_frags);
4699
4700/* Compute the checksum from gro_offset and return the folded value
4701 * after adding in any pseudo checksum.
4702 */
4703__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4704{
4705	__wsum wsum;
4706	__sum16 sum;
4707
4708	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4709
4710	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4711	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4712	if (likely(!sum)) {
4713		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4714		    !skb->csum_complete_sw)
4715			netdev_rx_csum_fault(skb->dev);
4716	}
4717
4718	NAPI_GRO_CB(skb)->csum = wsum;
4719	NAPI_GRO_CB(skb)->csum_valid = 1;
4720
4721	return sum;
4722}
4723EXPORT_SYMBOL(__skb_gro_checksum_complete);
4724
/*
 * net_rps_action_and_irq_enable - send the IPIs queued for RPS, if any.
 *
 * Must be entered with local irqs disabled; always returns with local
 * irqs enabled.  Without CONFIG_RPS this only re-enables irqs.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending) {
		/* detach the whole list while irqs are still off */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		do {
			struct softnet_data *following = pending->rps_ipi_next;

			if (cpu_online(pending->cpu))
				smp_call_function_single_async(pending->cpu,
							       &pending->csd);
			pending = following;
		} while (pending);

		return;
	}
#endif
	local_irq_enable();
}
4752
4753static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4754{
4755#ifdef CONFIG_RPS
4756	return sd->rps_ipi_list != NULL;
4757#else
4758	return false;
4759#endif
4760}
4761
/*
 * NAPI poll routine of the per-cpu backlog device embedded in
 * softnet_data.
 *
 * Delivers packets from sd->process_queue via __netif_receive_skb() and
 * refills that queue from sd->input_pkt_queue whenever it runs dry.
 * Returns the number of packets processed: @quota if the quota was
 * reached, less if both queues were drained, in which case the backlog
 * NAPI state is cleared.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = weight_p;
	local_irq_disable();
	while (1) {
		struct sk_buff *skb;

		/* irqs are off while dequeueing, on while delivering */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			rcu_read_lock();
			local_irq_enable();
			__netif_receive_skb(skb);
			rcu_read_unlock();
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			rps_unlock(sd);

			break;
		}

		/* move everything queued so far over to the process queue */
		skb_queue_splice_tail_init(&sd->input_pkt_queue,
					   &sd->process_queue);
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
4817
4818/**
4819 * __napi_schedule - schedule for receive
4820 * @n: entry to schedule
4821 *
4822 * The entry's receive function will be scheduled to run.
4823 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4824 */
4825void __napi_schedule(struct napi_struct *n)
4826{
4827	unsigned long flags;
4828
4829	local_irq_save(flags);
4830	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4831	local_irq_restore(flags);
4832}
4833EXPORT_SYMBOL(__napi_schedule);
4834
4835/**
4836 * __napi_schedule_irqoff - schedule for receive
4837 * @n: entry to schedule
4838 *
4839 * Variant of __napi_schedule() assuming hard irqs are masked
4840 */
4841void __napi_schedule_irqoff(struct napi_struct *n)
4842{
4843	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4844}
4845EXPORT_SYMBOL(__napi_schedule_irqoff);
4846
/*
 * Mark @n as no longer scheduled: take it off whatever poll list it is on
 * and clear NAPI_STATE_SCHED.  BUG()s if @n is not currently scheduled.
 * No irq masking is done here.
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));

	list_del_init(&n->poll_list);
	/* order the list removal before the SCHED bit is seen cleared */
	smp_mb__before_atomic();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
4856
4857void napi_complete_done(struct napi_struct *n, int work_done)
4858{
4859	unsigned long flags;
4860
4861	/*
4862	 * don't let napi dequeue from the cpu poll list
4863	 * just in case its running on a different cpu
4864	 */
4865	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4866		return;
4867
4868	if (n->gro_list) {
4869		unsigned long timeout = 0;
4870
4871		if (work_done)
4872			timeout = n->dev->gro_flush_timeout;
4873
4874		if (timeout)
4875			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4876				      HRTIMER_MODE_REL_PINNED);
4877		else
4878			napi_gro_flush(n, false);
4879	}
4880	if (likely(list_empty(&n->poll_list))) {
4881		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4882	} else {
4883		/* If n->poll_list is not empty, we need to mask irqs */
4884		local_irq_save(flags);
4885		__napi_complete(n);
4886		local_irq_restore(flags);
4887	}
4888}
4889EXPORT_SYMBOL(napi_complete_done);
4890
4891/* must be called under rcu_read_lock(), as we dont take a reference */
4892static struct napi_struct *napi_by_id(unsigned int napi_id)
4893{
4894	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4895	struct napi_struct *napi;
4896
4897	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4898		if (napi->napi_id == napi_id)
4899			return napi;
4900
4901	return NULL;
4902}
4903
#if defined(CONFIG_NET_RX_BUSY_POLL)
/* budget handed to napi->poll() on each busy-poll iteration */
#define BUSY_POLL_BUDGET 8
/*
 * Busy-poll the NAPI instance recorded in sk->sk_napi_id on behalf of
 * socket @sk.
 *
 * With @nonblock set, a single poll pass is made.  Otherwise polling
 * repeats until the socket receive queue is non-empty, a reschedule is
 * needed, the busy-loop end time is reached, or the poll reports
 * LL_FLUSH_FAILED.
 *
 * Returns true if sk->sk_receive_queue is non-empty on exit, false if it
 * is empty or no NAPI instance with that id exists.
 */
bool sk_busy_loop(struct sock *sk, int nonblock)
{
	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
	int (*busy_poll)(struct napi_struct *dev);
	struct napi_struct *napi;
	int rc = false;

	/* napi_by_id() takes no reference; RCU keeps napi valid in the loop */
	rcu_read_lock();

	napi = napi_by_id(sk->sk_napi_id);
	if (!napi)
		goto out;

	/* Note: ndo_busy_poll method is optional in linux-4.5 */
	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;

	do {
		rc = 0;
		local_bh_disable();
		if (busy_poll) {
			rc = busy_poll(napi);
		} else if (napi_schedule_prep(napi)) {
			void *have = netpoll_poll_lock(napi);

			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
				rc = napi->poll(napi, BUSY_POLL_BUDGET);
				trace_napi_poll(napi);
				/* budget fully used: complete, then requeue */
				if (rc == BUSY_POLL_BUDGET) {
					napi_complete_done(napi, rc);
					napi_schedule(napi);
				}
			}
			netpoll_poll_unlock(have);
		}
		if (rc > 0)
			NET_ADD_STATS_BH(sock_net(sk),
					 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
		local_bh_enable();

		if (rc == LL_FLUSH_FAILED)
			break; /* permanent failure */

		cpu_relax();
	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
		 !need_resched() && !busy_loop_timeout(end_time));

	rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL(sk_busy_loop);

#endif /* CONFIG_NET_RX_BUSY_POLL */
4960
4961void napi_hash_add(struct napi_struct *napi)
4962{
4963	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
4964	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
4965		return;
4966
4967	spin_lock(&napi_hash_lock);
4968
4969	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
4970	do {
4971		if (unlikely(++napi_gen_id < NR_CPUS + 1))
4972			napi_gen_id = NR_CPUS + 1;
4973	} while (napi_by_id(napi_gen_id));
4974	napi->napi_id = napi_gen_id;
4975
4976	hlist_add_head_rcu(&napi->napi_hash_node,
4977			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4978
4979	spin_unlock(&napi_hash_lock);
4980}
4981EXPORT_SYMBOL_GPL(napi_hash_add);
4982
4983/* Warning : caller is responsible to make sure rcu grace period
4984 * is respected before freeing memory containing @napi
4985 */
4986bool napi_hash_del(struct napi_struct *napi)
4987{
4988	bool rcu_sync_needed = false;
4989
4990	spin_lock(&napi_hash_lock);
4991
4992	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
4993		rcu_sync_needed = true;
4994		hlist_del_rcu(&napi->napi_hash_node);
4995	}
4996	spin_unlock(&napi_hash_lock);
4997	return rcu_sync_needed;
4998}
4999EXPORT_SYMBOL_GPL(napi_hash_del);
5000
5001static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5002{
5003	struct napi_struct *napi;
5004
5005	napi = container_of(timer, struct napi_struct, timer);
5006	if (napi->gro_list)
5007		napi_schedule(napi);
5008
5009	return HRTIMER_NORESTART;
5010}
5011
/*
 * Initialise @napi and attach it to @dev.
 * @dev:    device the NAPI instance belongs to
 * @napi:   instance to initialise
 * @poll:   poll callback stored in napi->poll
 * @weight: poll weight; values above NAPI_POLL_WEIGHT are accepted but
 *          logged once as an error
 *
 * The instance is left with NAPI_STATE_SCHED set.
 * NOTE(review): presumably a later napi_enable() clears that bit; that
 * code is not visible here.
 */
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
		    int (*poll)(struct napi_struct *, int), int weight)
{
	INIT_LIST_HEAD(&napi->poll_list);
	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	napi->timer.function = napi_watchdog;
	napi->gro_count = 0;
	napi->gro_list = NULL;
	napi->skb = NULL;
	napi->poll = poll;
	if (weight > NAPI_POLL_WEIGHT)
		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
			    weight, dev->name);
	napi->weight = weight;
	list_add(&napi->dev_list, &dev->napi_list);
	napi->dev = dev;
#ifdef CONFIG_NETPOLL
	spin_lock_init(&napi->poll_lock);
	/* -1: no owner */
	napi->poll_owner = -1;
#endif
	set_bit(NAPI_STATE_SCHED, &napi->state);
	napi_hash_add(napi);
}
EXPORT_SYMBOL(netif_napi_add);
5036
/*
 * Disable NAPI instance @n.  May sleep.
 *
 * NAPI_STATE_DISABLE is raised for the duration of the call.  The
 * function then sleeps in 1 ms steps until it has itself set
 * NAPI_STATE_SCHED and NAPI_STATE_NPSVC, and cancels the NAPI hrtimer.
 * Both SCHED and NPSVC are left set on return.
 */
void napi_disable(struct napi_struct *n)
{
	might_sleep();
	set_bit(NAPI_STATE_DISABLE, &n->state);

	/* wait until we are the ones who own the SCHED bit */
	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
		msleep(1);
	/* likewise for the NPSVC bit */
	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
		msleep(1);

	hrtimer_cancel(&n->timer);

	clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
5052
5053/* Must be called in process context */
5054void netif_napi_del(struct napi_struct *napi)
5055{
5056	might_sleep();
5057	if (napi_hash_del(napi))
5058		synchronize_net();
5059	list_del_init(&napi->dev_list);
5060	napi_free_frags(napi);
5061
5062	kfree_skb_list(napi->gro_list);
5063	napi->gro_list = NULL;
5064	napi->gro_count = 0;
5065}
5066EXPORT_SYMBOL(netif_napi_del);
5067
/*
 * Run one poll cycle of NAPI instance @n.
 *
 * @n is taken off its current poll list and, if still scheduled, its
 * ->poll() is called with its weight as budget.  When the whole weight
 * was consumed, @n is appended to @repoll so that it gets polled again,
 * unless a disable is pending or the driver already rescheduled it.
 *
 * Returns the amount of work ->poll() reported (0 if it was not called).
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n);
	}

	WARN_ON_ONCE(work > weight);

	/* less than the full weight used: nothing more to do here */
	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

	if (n->gro_list) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}
5129
/*
 * Receive softirq handler: poll the NAPI instances queued on this CPU's
 * softnet_data.
 *
 * Polling stops when the private list is empty, when the netdev_budget
 * worth of work has been used up, or when jiffies reaches the start value
 * plus 2.  Instances that still need service are put back on
 * sd->poll_list and the softirq is raised again.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	/* grab everything queued so far; polling runs with irqs enabled */
	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			/* nothing to requeue and no IPIs to send: all done */
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				return;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	__kfree_skb_flush();
	local_irq_disable();

	/* resulting sd->poll_list order: instances not reached in this run,
	 * then those scheduled while polling, then those to be repolled
	 */
	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	/* re-enables irqs on every path */
	net_rps_action_and_irq_enable(sd);
}
5176
/* One entry of a device's upper or lower adjacency list. */
struct netdev_adjacent {
	/* the adjacent device this entry refers to */
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the owning adjacency list */
	struct list_head list;
	/* NOTE(review): presumably used for RCU-deferred freeing - confirm */
	struct rcu_head rcu;
};
5192
5193static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5194						 struct list_head *adj_list)
5195{
5196	struct netdev_adjacent *adj;
5197
5198	list_for_each_entry(adj, adj_list, list) {
5199		if (adj->dev == adj_dev)
5200			return adj;
5201	}
5202	return NULL;
5203}
5204
5205/**
5206 * netdev_has_upper_dev - Check if device is linked to an upper device
5207 * @dev: device
5208 * @upper_dev: upper device to check
5209 *
5210 * Find out if a device is linked to specified upper device and return true
5211 * in case it is. Note that this checks only immediate upper device,
5212 * not through a complete stack of devices. The caller must hold the RTNL lock.
5213 */
5214bool netdev_has_upper_dev(struct net_device *dev,
5215			  struct net_device *upper_dev)
5216{
5217	ASSERT_RTNL();
5218
5219	return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5220}
5221EXPORT_SYMBOL(netdev_has_upper_dev);
5222
5223/**
5224 * netdev_has_any_upper_dev - Check if device is linked to some device
5225 * @dev: device
5226 *
5227 * Find out if a device is linked to an upper device and return true in case
5228 * it is. The caller must hold the RTNL lock.
5229 */
5230static bool netdev_has_any_upper_dev(struct net_device *dev)
5231{
5232	ASSERT_RTNL();
5233
5234	return !list_empty(&dev->all_adj_list.upper);
5235}
5236
5237/**
5238 * netdev_master_upper_dev_get - Get master upper device
5239 * @dev: device
5240 *
5241 * Find a master upper device and return pointer to it or NULL in case
5242 * it's not there. The caller must hold the RTNL lock.
5243 */
5244struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5245{
5246	struct netdev_adjacent *upper;
5247
5248	ASSERT_RTNL();
5249
5250	if (list_empty(&dev->adj_list.upper))
5251		return NULL;
5252
5253	upper = list_first_entry(&dev->adj_list.upper,
5254				 struct netdev_adjacent, list);
5255	if (likely(upper->master))
5256		return upper->dev;
5257	return NULL;
5258}
5259EXPORT_SYMBOL(netdev_master_upper_dev_get);
5260
5261void *netdev_adjacent_get_private(struct list_head *adj_list)
5262{
5263	struct netdev_adjacent *adj;
5264
5265	adj = list_entry(adj_list, struct netdev_adjacent, list);
5266
5267	return adj->private;
5268}
5269EXPORT_SYMBOL(netdev_adjacent_get_private);
5270
5271/**
5272 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5273 * @dev: device
5274 * @iter: list_head ** of the current position
5275 *
5276 * Gets the next device from the dev's upper list, starting from iter
5277 * position. The caller must hold RCU read lock.
5278 */
5279struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5280						 struct list_head **iter)
5281{
5282	struct netdev_adjacent *upper;
5283
5284	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5285
5286	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5287
5288	if (&upper->list == &dev->adj_list.upper)
5289		return NULL;
5290
5291	*iter = &upper->list;
5292
5293	return upper->dev;
5294}
5295EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5296
5297/**
5298 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5299 * @dev: device
5300 * @iter: list_head ** of the current position
5301 *
5302 * Gets the next device from the dev's upper list, starting from iter
5303 * position. The caller must hold RCU read lock.
5304 */
5305struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5306						     struct list_head **iter)
5307{
5308	struct netdev_adjacent *upper;
5309
5310	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5311
5312	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5313
5314	if (&upper->list == &dev->all_adj_list.upper)
5315		return NULL;
5316
5317	*iter = &upper->list;
5318
5319	return upper->dev;
5320}
5321EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5322
5323/**
5324 * netdev_lower_get_next_private - Get the next ->private from the
5325 *				   lower neighbour list
5326 * @dev: device
5327 * @iter: list_head ** of the current position
5328 *
5329 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5330 * list, starting from iter position. The caller must hold either hold the
5331 * RTNL lock or its own locking that guarantees that the neighbour lower
5332 * list will remain unchanged.
5333 */
5334void *netdev_lower_get_next_private(struct net_device *dev,
5335				    struct list_head **iter)
5336{
5337	struct netdev_adjacent *lower;
5338
5339	lower = list_entry(*iter, struct netdev_adjacent, list);
5340
5341	if (&lower->list == &dev->adj_list.lower)
5342		return NULL;
5343
5344	*iter = lower->list.next;
5345
5346	return lower->private;
5347}
5348EXPORT_SYMBOL(netdev_lower_get_next_private);
5349
5350/**
5351 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5352 *				       lower neighbour list, RCU
5353 *				       variant
5354 * @dev: device
5355 * @iter: list_head ** of the current position
5356 *
5357 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5358 * list, starting from iter position. The caller must hold RCU read lock.
5359 */
5360void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5361					struct list_head **iter)
5362{
5363	struct netdev_adjacent *lower;
5364
5365	WARN_ON_ONCE(!rcu_read_lock_held());
5366
5367	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5368
5369	if (&lower->list == &dev->adj_list.lower)
5370		return NULL;
5371
5372	*iter = &lower->list;
5373
5374	return lower->private;
5375}
5376EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5377
5378/**
5379 * netdev_lower_get_next - Get the next device from the lower neighbour
5380 *                         list
5381 * @dev: device
5382 * @iter: list_head ** of the current position
5383 *
5384 * Gets the next netdev_adjacent from the dev's lower neighbour
5385 * list, starting from iter position. The caller must hold RTNL lock or
5386 * its own locking that guarantees that the neighbour lower
5387 * list will remain unchanged.
5388 */
5389void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5390{
5391	struct netdev_adjacent *lower;
5392
5393	lower = list_entry(*iter, struct netdev_adjacent, list);
5394
5395	if (&lower->list == &dev->adj_list.lower)
5396		return NULL;
5397
5398	*iter = lower->list.next;
5399
5400	return lower->dev;
5401}
5402EXPORT_SYMBOL(netdev_lower_get_next);
5403
5404/**
5405 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5406 *				       lower neighbour list, RCU
5407 *				       variant
5408 * @dev: device
5409 *
5410 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5411 * list. The caller must hold RCU read lock.
5412 */
5413void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5414{
5415	struct netdev_adjacent *lower;
5416
5417	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5418			struct netdev_adjacent, list);
5419	if (lower)
5420		return lower->private;
5421	return NULL;
5422}
5423EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5424
5425/**
5426 * netdev_master_upper_dev_get_rcu - Get master upper device
5427 * @dev: device
5428 *
5429 * Find a master upper device and return pointer to it or NULL in case
5430 * it's not there. The caller must hold the RCU read lock.
5431 */
5432struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5433{
5434	struct netdev_adjacent *upper;
5435
5436	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5437				       struct netdev_adjacent, list);
5438	if (upper && likely(upper->master))
5439		return upper->dev;
5440	return NULL;
5441}
5442EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5443
5444static int netdev_adjacent_sysfs_add(struct net_device *dev,
5445			      struct net_device *adj_dev,
5446			      struct list_head *dev_list)
5447{
5448	char linkname[IFNAMSIZ+7];
5449	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5450		"upper_%s" : "lower_%s", adj_dev->name);
5451	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5452				 linkname);
5453}
5454static void netdev_adjacent_sysfs_del(struct net_device *dev,
5455			       char *name,
5456			       struct list_head *dev_list)
5457{
5458	char linkname[IFNAMSIZ+7];
5459	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5460		"upper_%s" : "lower_%s", name);
5461	sysfs_remove_link(&(dev->dev.kobj), linkname);
5462}
5463
5464static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5465						 struct net_device *adj_dev,
5466						 struct list_head *dev_list)
5467{
5468	return (dev_list == &dev->adj_list.upper ||
5469		dev_list == &dev->adj_list.lower) &&
5470		net_eq(dev_net(dev), dev_net(adj_dev));
5471}
5472
/*
 * Record @adj_dev as adjacent to @dev on @dev_list.
 * @private: value stored in the new entry's ->private
 * @master:  true if @adj_dev is @dev's master; a "master" sysfs link is
 *           created and the entry is put at the head of the list
 *
 * If @adj_dev is already on the list only its reference counter is
 * incremented.  Otherwise a new entry is allocated, a reference on
 * @adj_dev is taken and, for a neighbour list, the sysfs link is created.
 *
 * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or the
 * error returned while creating a sysfs link; on error nothing is left
 * behind.
 */
static int __netdev_adjacent_dev_insert(struct net_device *dev,
					struct net_device *adj_dev,
					struct list_head *dev_list,
					void *private, bool master)
{
	struct netdev_adjacent *adj;
	int ret;

	adj = __netdev_find_adj(adj_dev, dev_list);

	/* already linked: just count the additional user */
	if (adj) {
		adj->ref_nr++;
		return 0;
	}

	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
	if (!adj)
		return -ENOMEM;

	adj->dev = adj_dev;
	adj->master = master;
	adj->ref_nr = 1;
	adj->private = private;
	dev_hold(adj_dev);

	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
		 adj_dev->name, dev->name, adj_dev->name);

	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
		if (ret)
			goto free_adj;
	}

	/* Ensure that master link is always the first item in list. */
	if (master) {
		ret = sysfs_create_link(&(dev->dev.kobj),
					&(adj_dev->dev.kobj), "master");
		if (ret)
			goto remove_symlinks;

		list_add_rcu(&adj->list, dev_list);
	} else {
		list_add_tail_rcu(&adj->list, dev_list);
	}

	return 0;

	/* unwind in reverse order of setup */
remove_symlinks:
	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
	kfree(adj);
	dev_put(adj_dev);

	return ret;
}
5530
5531static void __netdev_adjacent_dev_remove(struct net_device *dev,
5532					 struct net_device *adj_dev,
5533					 struct list_head *dev_list)
5534{
5535	struct netdev_adjacent *adj;
5536
5537	adj = __netdev_find_adj(adj_dev, dev_list);
5538
5539	if (!adj) {
5540		pr_err("tried to remove device %s from %s\n",
5541		       dev->name, adj_dev->name);
5542		BUG();
5543	}
5544
5545	if (adj->ref_nr > 1) {
5546		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5547			 adj->ref_nr-1);
5548		adj->ref_nr--;
5549		return;
5550	}
5551
5552	if (adj->master)
5553		sysfs_remove_link(&(dev->dev.kobj), "master");
5554
5555	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5556		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5557
5558	list_del_rcu(&adj->list);
5559	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5560		 adj_dev->name, dev->name, adj_dev->name);
5561	dev_put(adj_dev);
5562	kfree_rcu(adj, rcu);
5563}
5564
5565static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5566					    struct net_device *upper_dev,
5567					    struct list_head *up_list,
5568					    struct list_head *down_list,
5569					    void *private, bool master)
5570{
5571	int ret;
5572
5573	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5574					   master);
5575	if (ret)
5576		return ret;
5577
5578	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5579					   false);
5580	if (ret) {
5581		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5582		return ret;
5583	}
5584
5585	return 0;
5586}
5587
5588static int __netdev_adjacent_dev_link(struct net_device *dev,
5589				      struct net_device *upper_dev)
5590{
5591	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5592						&dev->all_adj_list.upper,
5593						&upper_dev->all_adj_list.lower,
5594						NULL, false);
5595}
5596
/* Undo a bidirectional link: drop @upper_dev from @up_list first, then
 * @dev from @down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upward direction */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	/* downward direction */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5605
5606static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5607					 struct net_device *upper_dev)
5608{
5609	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5610					   &dev->all_adj_list.upper,
5611					   &upper_dev->all_adj_list.lower);
5612}
5613
5614static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5615						struct net_device *upper_dev,
5616						void *private, bool master)
5617{
5618	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5619
5620	if (ret)
5621		return ret;
5622
5623	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5624					       &dev->adj_list.upper,
5625					       &upper_dev->adj_list.lower,
5626					       private, master);
5627	if (ret) {
5628		__netdev_adjacent_dev_unlink(dev, upper_dev);
5629		return ret;
5630	}
5631
5632	return 0;
5633}
5634
5635static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5636						   struct net_device *upper_dev)
5637{
5638	__netdev_adjacent_dev_unlink(dev, upper_dev);
5639	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5640					   &dev->adj_list.upper,
5641					   &upper_dev->adj_list.lower);
5642}
5643
/* Make @upper_dev an upper device of @dev.
 *
 * @master:     whether the new link is a master link
 * @upper_priv: private pointer stored with the neighbour link
 * @upper_info: opaque pointer handed to notifiers in the changeupper info
 *
 * Sends NETDEV_PRECHANGEUPPER, creates the neighbour link, then links
 * every device in @dev's all_adj_list.lower (and @dev itself) with every
 * device in @upper_dev's all_adj_list.upper (and @upper_dev itself), and
 * finally sends NETDEV_CHANGEUPPER. Any failure after the neighbour link
 * was created unwinds the links made so far through the rollback labels.
 *
 * Returns 0 on success, -EBUSY / -EEXIST for rejected requests, or the
 * error reported by a notifier or by link creation.
 */
static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *upper_priv, void *upper_info)
{
	struct netdev_notifier_changeupper_info changeupper_info;
	struct netdev_adjacent *i, *j, *to_i, *to_j;
	int ret = 0;

	ASSERT_RTNL();

	/* A device cannot be its own upper device. */
	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
		return -EBUSY;

	/* Already a direct upper neighbour. */
	if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
		return -EEXIST;

	/* Only one master upper device is allowed. */
	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	changeupper_info.upper_dev = upper_dev;
	changeupper_info.master = master;
	changeupper_info.linking = true;
	changeupper_info.upper_info = upper_info;

	/* Give notifiers a chance to veto before anything is changed. */
	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
					    &changeupper_info.info);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
						   master);
	if (ret)
		return ret;

	/* Now that we linked these devs, make all the upper_dev's
	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
	 * vice versa, and don't forget the devices themselves. All of these
	 * links are non-neighbours.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			pr_debug("Interlinking %s with %s, non-neighbour\n",
				 i->dev->name, j->dev->name);
			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
			if (ret)
				goto rollback_mesh;
		}
	}

	/* add dev to every upper_dev's upper device */
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		pr_debug("linking %s's upper device %s with %s\n",
			 upper_dev->name, i->dev->name, dev->name);
		ret = __netdev_adjacent_dev_link(dev, i->dev);
		if (ret)
			goto rollback_upper_mesh;
	}

	/* add upper_dev to every dev's lower device */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		pr_debug("linking %s's lower device %s with %s\n", dev->name,
			 i->dev->name, upper_dev->name);
		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
		if (ret)
			goto rollback_lower_mesh;
	}

	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
					    &changeupper_info.info);
	ret = notifier_to_errno(ret);
	if (ret)
		goto rollback_lower_mesh;

	return 0;

	/* Each rollback stage replays the matching loop above and stops at
	 * the cursor (to_i / to_j) where the forward pass failed, so only
	 * links that were actually created get removed. The stages fall
	 * through into each other; the cursors are reset to NULL in between
	 * so that the earlier, fully completed loops are undone entirely.
	 */
rollback_lower_mesh:
	/* NOTE(review): when reached from the notifier failure the forward
	 * loop had run to completion, so 'i' is the loop's end cursor and
	 * presumably never equals a real entry - every link is undone.
	 */
	to_i = i;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
	}

	i = NULL;

rollback_upper_mesh:
	to_i = i;
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(dev, i->dev);
	}

	i = j = NULL;

rollback_mesh:
	to_i = i;
	to_j = j;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			/* stop at the exact (i, j) pair that failed */
			if (i == to_i && j == to_j)
				break;
			__netdev_adjacent_dev_unlink(i->dev, j->dev);
		}
		if (i == to_i)
			break;
	}

	/* Finally drop the neighbour link created at the start. */
	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}
5761
5762/**
5763 * netdev_upper_dev_link - Add a link to the upper device
5764 * @dev: device
5765 * @upper_dev: new upper device
5766 *
5767 * Adds a link to device which is upper to this one. The caller must hold
5768 * the RTNL lock. On a failure a negative errno code is returned.
5769 * On success the reference counts are adjusted and the function
5770 * returns zero.
5771 */
5772int netdev_upper_dev_link(struct net_device *dev,
5773			  struct net_device *upper_dev)
5774{
5775	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5776}
5777EXPORT_SYMBOL(netdev_upper_dev_link);
5778
5779/**
5780 * netdev_master_upper_dev_link - Add a master link to the upper device
5781 * @dev: device
5782 * @upper_dev: new upper device
5783 * @upper_priv: upper device private
5784 * @upper_info: upper info to be passed down via notifier
5785 *
5786 * Adds a link to device which is upper to this one. In this case, only
5787 * one master upper device can be linked, although other non-master devices
5788 * might be linked as well. The caller must hold the RTNL lock.
5789 * On a failure a negative errno code is returned. On success the reference
5790 * counts are adjusted and the function returns zero.
5791 */
5792int netdev_master_upper_dev_link(struct net_device *dev,
5793				 struct net_device *upper_dev,
5794				 void *upper_priv, void *upper_info)
5795{
5796	return __netdev_upper_dev_link(dev, upper_dev, true,
5797				       upper_priv, upper_info);
5798}
5799EXPORT_SYMBOL(netdev_master_upper_dev_link);
5800
5801/**
5802 * netdev_upper_dev_unlink - Removes a link to upper device
5803 * @dev: device
5804 * @upper_dev: new upper device
5805 *
5806 * Removes a link to device which is upper to this one. The caller must hold
5807 * the RTNL lock.
5808 */
5809void netdev_upper_dev_unlink(struct net_device *dev,
5810			     struct net_device *upper_dev)
5811{
5812	struct netdev_notifier_changeupper_info changeupper_info;
5813	struct netdev_adjacent *i, *j;
5814	ASSERT_RTNL();
5815
5816	changeupper_info.upper_dev = upper_dev;
5817	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5818	changeupper_info.linking = false;
5819
5820	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5821				      &changeupper_info.info);
5822
5823	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5824
5825	/* Here is the tricky part. We must remove all dev's lower
5826	 * devices from all upper_dev's upper devices and vice
5827	 * versa, to maintain the graph relationship.
5828	 */
5829	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5830		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5831			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5832
5833	/* remove also the devices itself from lower/upper device
5834	 * list
5835	 */
5836	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5837		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5838
5839	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5840		__netdev_adjacent_dev_unlink(dev, i->dev);
5841
5842	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5843				      &changeupper_info.info);
5844}
5845EXPORT_SYMBOL(netdev_upper_dev_unlink);
5846
5847/**
5848 * netdev_bonding_info_change - Dispatch event about slave change
5849 * @dev: device
5850 * @bonding_info: info to dispatch
5851 *
5852 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5853 * The caller must hold the RTNL lock.
5854 */
5855void netdev_bonding_info_change(struct net_device *dev,
5856				struct netdev_bonding_info *bonding_info)
5857{
5858	struct netdev_notifier_bonding_info	info;
5859
5860	memcpy(&info.bonding_info, bonding_info,
5861	       sizeof(struct netdev_bonding_info));
5862	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5863				      &info.info);
5864}
5865EXPORT_SYMBOL(netdev_bonding_info_change);
5866
5867static void netdev_adjacent_add_links(struct net_device *dev)
5868{
5869	struct netdev_adjacent *iter;
5870
5871	struct net *net = dev_net(dev);
5872
5873	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5874		if (!net_eq(net,dev_net(iter->dev)))
5875			continue;
5876		netdev_adjacent_sysfs_add(iter->dev, dev,
5877					  &iter->dev->adj_list.lower);
5878		netdev_adjacent_sysfs_add(dev, iter->dev,
5879					  &dev->adj_list.upper);
5880	}
5881
5882	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5883		if (!net_eq(net,dev_net(iter->dev)))
5884			continue;
5885		netdev_adjacent_sysfs_add(iter->dev, dev,
5886					  &iter->dev->adj_list.upper);
5887		netdev_adjacent_sysfs_add(dev, iter->dev,
5888					  &dev->adj_list.lower);
5889	}
5890}
5891
5892static void netdev_adjacent_del_links(struct net_device *dev)
5893{
5894	struct netdev_adjacent *iter;
5895
5896	struct net *net = dev_net(dev);
5897
5898	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5899		if (!net_eq(net,dev_net(iter->dev)))
5900			continue;
5901		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5902					  &iter->dev->adj_list.lower);
5903		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5904					  &dev->adj_list.upper);
5905	}
5906
5907	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5908		if (!net_eq(net,dev_net(iter->dev)))
5909			continue;
5910		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5911					  &iter->dev->adj_list.upper);
5912		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5913					  &dev->adj_list.lower);
5914	}
5915}
5916
5917void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5918{
5919	struct netdev_adjacent *iter;
5920
5921	struct net *net = dev_net(dev);
5922
5923	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5924		if (!net_eq(net,dev_net(iter->dev)))
5925			continue;
5926		netdev_adjacent_sysfs_del(iter->dev, oldname,
5927					  &iter->dev->adj_list.lower);
5928		netdev_adjacent_sysfs_add(iter->dev, dev,
5929					  &iter->dev->adj_list.lower);
5930	}
5931
5932	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5933		if (!net_eq(net,dev_net(iter->dev)))
5934			continue;
5935		netdev_adjacent_sysfs_del(iter->dev, oldname,
5936					  &iter->dev->adj_list.upper);
5937		netdev_adjacent_sysfs_add(iter->dev, dev,
5938					  &iter->dev->adj_list.upper);
5939	}
5940}
5941
5942void *netdev_lower_dev_get_private(struct net_device *dev,
5943				   struct net_device *lower_dev)
5944{
5945	struct netdev_adjacent *lower;
5946
5947	if (!lower_dev)
5948		return NULL;
5949	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5950	if (!lower)
5951		return NULL;
5952
5953	return lower->private;
5954}
5955EXPORT_SYMBOL(netdev_lower_dev_get_private);
5956
5957
5958int dev_get_nest_level(struct net_device *dev,
5959		       bool (*type_check)(const struct net_device *dev))
5960{
5961	struct net_device *lower = NULL;
5962	struct list_head *iter;
5963	int max_nest = -1;
5964	int nest;
5965
5966	ASSERT_RTNL();
5967
5968	netdev_for_each_lower_dev(dev, lower, iter) {
5969		nest = dev_get_nest_level(lower, type_check);
5970		if (max_nest < nest)
5971			max_nest = nest;
5972	}
5973
5974	if (type_check(dev))
5975		max_nest++;
5976
5977	return max_nest;
5978}
5979EXPORT_SYMBOL(dev_get_nest_level);
5980
5981/**
5982 * netdev_lower_change - Dispatch event about lower device state change
5983 * @lower_dev: device
5984 * @lower_state_info: state to dispatch
5985 *
5986 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
5987 * The caller must hold the RTNL lock.
5988 */
5989void netdev_lower_state_changed(struct net_device *lower_dev,
5990				void *lower_state_info)
5991{
5992	struct netdev_notifier_changelowerstate_info changelowerstate_info;
5993
5994	ASSERT_RTNL();
5995	changelowerstate_info.lower_state_info = lower_state_info;
5996	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
5997				      &changelowerstate_info.info);
5998}
5999EXPORT_SYMBOL(netdev_lower_state_changed);
6000
6001static void dev_change_rx_flags(struct net_device *dev, int flags)
6002{
6003	const struct net_device_ops *ops = dev->netdev_ops;
6004
6005	if (ops->ndo_change_rx_flags)
6006		ops->ndo_change_rx_flags(dev, flags);
6007}
6008
6009static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6010{
6011	unsigned int old_flags = dev->flags;
6012	kuid_t uid;
6013	kgid_t gid;
6014
6015	ASSERT_RTNL();
6016
6017	dev->flags |= IFF_PROMISC;
6018	dev->promiscuity += inc;
6019	if (dev->promiscuity == 0) {
6020		/*
6021		 * Avoid overflow.
6022		 * If inc causes overflow, untouch promisc and return error.
6023		 */
6024		if (inc < 0)
6025			dev->flags &= ~IFF_PROMISC;
6026		else {
6027			dev->promiscuity -= inc;
6028			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6029				dev->name);
6030			return -EOVERFLOW;
6031		}
6032	}
6033	if (dev->flags != old_flags) {
6034		pr_info("device %s %s promiscuous mode\n",
6035			dev->name,
6036			dev->flags & IFF_PROMISC ? "entered" : "left");
6037		if (audit_enabled) {
6038			current_uid_gid(&uid, &gid);
6039			audit_log(current->audit_context, GFP_ATOMIC,
6040				AUDIT_ANOM_PROMISCUOUS,
6041				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6042				dev->name, (dev->flags & IFF_PROMISC),
6043				(old_flags & IFF_PROMISC),
6044				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6045				from_kuid(&init_user_ns, uid),
6046				from_kgid(&init_user_ns, gid),
6047				audit_get_sessionid(current));
6048		}
6049
6050		dev_change_rx_flags(dev, IFF_PROMISC);
6051	}
6052	if (notify)
6053		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6054	return 0;
6055}
6056
6057/**
6058 *	dev_set_promiscuity	- update promiscuity count on a device
6059 *	@dev: device
6060 *	@inc: modifier
6061 *
6062 *	Add or remove promiscuity from a device. While the count in the device
6063 *	remains above zero the interface remains promiscuous. Once it hits zero
6064 *	the device reverts back to normal filtering operation. A negative inc
6065 *	value is used to drop promiscuity on the device.
6066 *	Return 0 if successful or a negative errno code on error.
6067 */
6068int dev_set_promiscuity(struct net_device *dev, int inc)
6069{
6070	unsigned int old_flags = dev->flags;
6071	int err;
6072
6073	err = __dev_set_promiscuity(dev, inc, true);
6074	if (err < 0)
6075		return err;
6076	if (dev->flags != old_flags)
6077		dev_set_rx_mode(dev);
6078	return err;
6079}
6080EXPORT_SYMBOL(dev_set_promiscuity);
6081
6082static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6083{
6084	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6085
6086	ASSERT_RTNL();
6087
6088	dev->flags |= IFF_ALLMULTI;
6089	dev->allmulti += inc;
6090	if (dev->allmulti == 0) {
6091		/*
6092		 * Avoid overflow.
6093		 * If inc causes overflow, untouch allmulti and return error.
6094		 */
6095		if (inc < 0)
6096			dev->flags &= ~IFF_ALLMULTI;
6097		else {
6098			dev->allmulti -= inc;
6099			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6100				dev->name);
6101			return -EOVERFLOW;
6102		}
6103	}
6104	if (dev->flags ^ old_flags) {
6105		dev_change_rx_flags(dev, IFF_ALLMULTI);
6106		dev_set_rx_mode(dev);
6107		if (notify)
6108			__dev_notify_flags(dev, old_flags,
6109					   dev->gflags ^ old_gflags);
6110	}
6111	return 0;
6112}
6113
6114/**
6115 *	dev_set_allmulti	- update allmulti count on a device
6116 *	@dev: device
6117 *	@inc: modifier
6118 *
6119 *	Add or remove reception of all multicast frames to a device. While the
6120 *	count in the device remains above zero the interface remains listening
6121 *	to all interfaces. Once it hits zero the device reverts back to normal
6122 *	filtering operation. A negative @inc value is used to drop the counter
6123 *	when releasing a resource needing all multicasts.
6124 *	Return 0 if successful or a negative errno code on error.
6125 */
6126
6127int dev_set_allmulti(struct net_device *dev, int inc)
6128{
6129	return __dev_set_allmulti(dev, inc, true);
6130}
6131EXPORT_SYMBOL(dev_set_allmulti);
6132
6133/*
6134 *	Upload unicast and multicast address lists to device and
6135 *	configure RX filtering. When the device doesn't support unicast
6136 *	filtering it is put in promiscuous mode while unicast addresses
6137 *	are present.
6138 */
6139void __dev_set_rx_mode(struct net_device *dev)
6140{
6141	const struct net_device_ops *ops = dev->netdev_ops;
6142
6143	/* dev_open will call this function so the list will stay sane. */
6144	if (!(dev->flags&IFF_UP))
6145		return;
6146
6147	if (!netif_device_present(dev))
6148		return;
6149
6150	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6151		/* Unicast addresses changes may only happen under the rtnl,
6152		 * therefore calling __dev_set_promiscuity here is safe.
6153		 */
6154		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6155			__dev_set_promiscuity(dev, 1, false);
6156			dev->uc_promisc = true;
6157		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6158			__dev_set_promiscuity(dev, -1, false);
6159			dev->uc_promisc = false;
6160		}
6161	}
6162
6163	if (ops->ndo_set_rx_mode)
6164		ops->ndo_set_rx_mode(dev);
6165}
6166
/* Run __dev_set_rx_mode() with the device's address lock taken
 * (bottom-half variant) around the call.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
6173
6174/**
6175 *	dev_get_flags - get flags reported to userspace
6176 *	@dev: device
6177 *
6178 *	Get the combination of flag bits exported through APIs to userspace.
6179 */
6180unsigned int dev_get_flags(const struct net_device *dev)
6181{
6182	unsigned int flags;
6183
6184	flags = (dev->flags & ~(IFF_PROMISC |
6185				IFF_ALLMULTI |
6186				IFF_RUNNING |
6187				IFF_LOWER_UP |
6188				IFF_DORMANT)) |
6189		(dev->gflags & (IFF_PROMISC |
6190				IFF_ALLMULTI));
6191
6192	if (netif_running(dev)) {
6193		if (netif_oper_up(dev))
6194			flags |= IFF_RUNNING;
6195		if (netif_carrier_ok(dev))
6196			flags |= IFF_LOWER_UP;
6197		if (netif_dormant(dev))
6198			flags |= IFF_DORMANT;
6199	}
6200
6201	return flags;
6202}
6203EXPORT_SYMBOL(dev_get_flags);
6204
6205int __dev_change_flags(struct net_device *dev, unsigned int flags)
6206{
6207	unsigned int old_flags = dev->flags;
6208	int ret;
6209
6210	ASSERT_RTNL();
6211
6212	/*
6213	 *	Set the flags on our device.
6214	 */
6215
6216	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6217			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6218			       IFF_AUTOMEDIA)) |
6219		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6220				    IFF_ALLMULTI));
6221
6222	/*
6223	 *	Load in the correct multicast list now the flags have changed.
6224	 */
6225
6226	if ((old_flags ^ flags) & IFF_MULTICAST)
6227		dev_change_rx_flags(dev, IFF_MULTICAST);
6228
6229	dev_set_rx_mode(dev);
6230
6231	/*
6232	 *	Have we downed the interface. We handle IFF_UP ourselves
6233	 *	according to user attempts to set it, rather than blindly
6234	 *	setting it.
6235	 */
6236
6237	ret = 0;
6238	if ((old_flags ^ flags) & IFF_UP)
6239		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6240
6241	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6242		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6243		unsigned int old_flags = dev->flags;
6244
6245		dev->gflags ^= IFF_PROMISC;
6246
6247		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6248			if (dev->flags != old_flags)
6249				dev_set_rx_mode(dev);
6250	}
6251
6252	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6253	   is important. Some (broken) drivers set IFF_PROMISC, when
6254	   IFF_ALLMULTI is requested not asking us and not reporting.
6255	 */
6256	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6257		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6258
6259		dev->gflags ^= IFF_ALLMULTI;
6260		__dev_set_allmulti(dev, inc, false);
6261	}
6262
6263	return ret;
6264}
6265
6266void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6267			unsigned int gchanges)
6268{
6269	unsigned int changes = dev->flags ^ old_flags;
6270
6271	if (gchanges)
6272		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6273
6274	if (changes & IFF_UP) {
6275		if (dev->flags & IFF_UP)
6276			call_netdevice_notifiers(NETDEV_UP, dev);
6277		else
6278			call_netdevice_notifiers(NETDEV_DOWN, dev);
6279	}
6280
6281	if (dev->flags & IFF_UP &&
6282	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6283		struct netdev_notifier_change_info change_info;
6284
6285		change_info.flags_changed = changes;
6286		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6287					      &change_info.info);
6288	}
6289}
6290
6291/**
6292 *	dev_change_flags - change device settings
6293 *	@dev: device
6294 *	@flags: device state flags
6295 *
6296 *	Change settings on device based state flags. The flags are
6297 *	in the userspace exported format.
6298 */
6299int dev_change_flags(struct net_device *dev, unsigned int flags)
6300{
6301	int ret;
6302	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6303
6304	ret = __dev_change_flags(dev, flags);
6305	if (ret < 0)
6306		return ret;
6307
6308	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6309	__dev_notify_flags(dev, old_flags, changes);
6310	return ret;
6311}
6312EXPORT_SYMBOL(dev_change_flags);
6313
6314static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6315{
6316	const struct net_device_ops *ops = dev->netdev_ops;
6317
6318	if (ops->ndo_change_mtu)
6319		return ops->ndo_change_mtu(dev, new_mtu);
6320
6321	dev->mtu = new_mtu;
6322	return 0;
6323}
6324
6325/**
6326 *	dev_set_mtu - Change maximum transfer unit
6327 *	@dev: device
6328 *	@new_mtu: new transfer unit
6329 *
6330 *	Change the maximum transfer size of the network device.
6331 */
6332int dev_set_mtu(struct net_device *dev, int new_mtu)
6333{
6334	int err, orig_mtu;
6335
6336	if (new_mtu == dev->mtu)
6337		return 0;
6338
6339	/*	MTU must be positive.	 */
6340	if (new_mtu < 0)
6341		return -EINVAL;
6342
6343	if (!netif_device_present(dev))
6344		return -ENODEV;
6345
6346	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6347	err = notifier_to_errno(err);
6348	if (err)
6349		return err;
6350
6351	orig_mtu = dev->mtu;
6352	err = __dev_set_mtu(dev, new_mtu);
6353
6354	if (!err) {
6355		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6356		err = notifier_to_errno(err);
6357		if (err) {
6358			/* setting mtu back and notifying everyone again,
6359			 * so that they have a chance to revert changes.
6360			 */
6361			__dev_set_mtu(dev, orig_mtu);
6362			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6363		}
6364	}
6365	return err;
6366}
6367EXPORT_SYMBOL(dev_set_mtu);
6368
6369/**
6370 *	dev_set_group - Change group this device belongs to
6371 *	@dev: device
6372 *	@new_group: group this device should belong to
6373 */
6374void dev_set_group(struct net_device *dev, int new_group)
6375{
6376	dev->group = new_group;
6377}
6378EXPORT_SYMBOL(dev_set_group);
6379
6380/**
6381 *	dev_set_mac_address - Change Media Access Control Address
6382 *	@dev: device
6383 *	@sa: new address
6384 *
6385 *	Change the hardware (MAC) address of the device
6386 */
6387int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6388{
6389	const struct net_device_ops *ops = dev->netdev_ops;
6390	int err;
6391
6392	if (!ops->ndo_set_mac_address)
6393		return -EOPNOTSUPP;
6394	if (sa->sa_family != dev->type)
6395		return -EINVAL;
6396	if (!netif_device_present(dev))
6397		return -ENODEV;
6398	err = ops->ndo_set_mac_address(dev, sa);
6399	if (err)
6400		return err;
6401	dev->addr_assign_type = NET_ADDR_SET;
6402	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6403	add_device_randomness(dev->dev_addr, dev->addr_len);
6404	return 0;
6405}
6406EXPORT_SYMBOL(dev_set_mac_address);
6407
6408/**
6409 *	dev_change_carrier - Change device carrier
6410 *	@dev: device
6411 *	@new_carrier: new value
6412 *
6413 *	Change device carrier
6414 */
6415int dev_change_carrier(struct net_device *dev, bool new_carrier)
6416{
6417	const struct net_device_ops *ops = dev->netdev_ops;
6418
6419	if (!ops->ndo_change_carrier)
6420		return -EOPNOTSUPP;
6421	if (!netif_device_present(dev))
6422		return -ENODEV;
6423	return ops->ndo_change_carrier(dev, new_carrier);
6424}
6425EXPORT_SYMBOL(dev_change_carrier);
6426
6427/**
6428 *	dev_get_phys_port_id - Get device physical port ID
6429 *	@dev: device
6430 *	@ppid: port ID
6431 *
6432 *	Get device physical port ID
6433 */
6434int dev_get_phys_port_id(struct net_device *dev,
6435			 struct netdev_phys_item_id *ppid)
6436{
6437	const struct net_device_ops *ops = dev->netdev_ops;
6438
6439	if (!ops->ndo_get_phys_port_id)
6440		return -EOPNOTSUPP;
6441	return ops->ndo_get_phys_port_id(dev, ppid);
6442}
6443EXPORT_SYMBOL(dev_get_phys_port_id);
6444
6445/**
6446 *	dev_get_phys_port_name - Get device physical port name
6447 *	@dev: device
6448 *	@name: port name
6449 *	@len: limit of bytes to copy to name
6450 *
6451 *	Get device physical port name
6452 */
6453int dev_get_phys_port_name(struct net_device *dev,
6454			   char *name, size_t len)
6455{
6456	const struct net_device_ops *ops = dev->netdev_ops;
6457
6458	if (!ops->ndo_get_phys_port_name)
6459		return -EOPNOTSUPP;
6460	return ops->ndo_get_phys_port_name(dev, name, len);
6461}
6462EXPORT_SYMBOL(dev_get_phys_port_name);
6463
6464/**
6465 *	dev_change_proto_down - update protocol port state information
6466 *	@dev: device
6467 *	@proto_down: new value
6468 *
6469 *	This info can be used by switch drivers to set the phys state of the
6470 *	port.
6471 */
6472int dev_change_proto_down(struct net_device *dev, bool proto_down)
6473{
6474	const struct net_device_ops *ops = dev->netdev_ops;
6475
6476	if (!ops->ndo_change_proto_down)
6477		return -EOPNOTSUPP;
6478	if (!netif_device_present(dev))
6479		return -ENODEV;
6480	return ops->ndo_change_proto_down(dev, proto_down);
6481}
6482EXPORT_SYMBOL(dev_change_proto_down);
6483
6484/**
6485 *	dev_new_index	-	allocate an ifindex
6486 *	@net: the applicable net namespace
6487 *
6488 *	Returns a suitable unique value for a new device interface
6489 *	number.  The caller must hold the rtnl semaphore or the
6490 *	dev_base_lock to be sure it remains unique.
6491 */
6492static int dev_new_index(struct net *net)
6493{
6494	int ifindex = net->ifindex;
6495	for (;;) {
6496		if (++ifindex <= 0)
6497			ifindex = 1;
6498		if (!__dev_get_by_index(net, ifindex))
6499			return net->ifindex = ifindex;
6500	}
6501}
6502
/* Delayed registration/unregistration: devices are queued on
 * net_todo_list by net_set_todo().
 */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6506
6507static void net_set_todo(struct net_device *dev)
6508{
6509	list_add_tail(&dev->todo_list, &net_todo_list);
6510	dev_net(dev)->dev_unreg_count++;
6511}
6512
/* Tear down every device queued on @head through its unreg_list member.
 *
 * Must not run during the boot phase and asserts that RTNL is held.
 * Devices that were never registered are warned about and dropped from
 * the list; every other device must be in NETREG_REGISTERED state. The
 * remaining devices are closed, unlisted, marked NETREG_UNREGISTERING and
 * then dismantled one by one (qdisc shutdown, NETDEV_UNREGISTER notifiers,
 * address list flush, driver ndo_uninit, sysfs removal), with
 * synchronize_net() between the phases. A dev_put() is done on each device
 * at the end.
 */
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* Phase 1: validate the list; the _safe iterator is needed because
	 * unregistered entries are deleted while walking.
	 */
	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head, true);

	/* Phase 2: take the devices off the device chain and flush
	 * per-cpu backlog entries for them.
	 */
	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
		on_each_cpu(flush_backlog, dev, 1);
	}

	synchronize_net();

	/* Phase 3: per-device dismantling. */
	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);


		/* Notify protocols, that we are about to destroy
		   this device. They should clean all the things.
		*/
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/* Build the RTM_DELLINK message now; it is only sent after
		 * ndo_uninit below.
		 */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
						     GFP_KERNEL);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	/* Phase 4: drop one reference per device. */
	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}
6598
6599static void rollback_registered(struct net_device *dev)
6600{
6601	LIST_HEAD(single);
6602
6603	list_add(&dev->unreg_list, &single);
6604	rollback_registered_many(&single);
6605	list_del(&single);
6606}
6607
6608static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6609	struct net_device *upper, netdev_features_t features)
6610{
6611	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6612	netdev_features_t feature;
6613	int feature_bit;
6614
6615	for_each_netdev_feature(&upper_disables, feature_bit) {
6616		feature = __NETIF_F_BIT(feature_bit);
6617		if (!(upper->wanted_features & feature)
6618		    && (features & feature)) {
6619			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6620				   &feature, upper->name);
6621			features &= ~feature;
6622		}
6623	}
6624
6625	return features;
6626}
6627
6628static void netdev_sync_lower_features(struct net_device *upper,
6629	struct net_device *lower, netdev_features_t features)
6630{
6631	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6632	netdev_features_t feature;
6633	int feature_bit;
6634
6635	for_each_netdev_feature(&upper_disables, feature_bit) {
6636		feature = __NETIF_F_BIT(feature_bit);
6637		if (!(features & feature) && (lower->features & feature)) {
6638			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6639				   &feature, lower->name);
6640			lower->wanted_features &= ~feature;
6641			netdev_update_features(lower);
6642
6643			if (unlikely(lower->features & feature))
6644				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6645					    &feature, lower->name);
6646		}
6647	}
6648}
6649
6650static netdev_features_t netdev_fix_features(struct net_device *dev,
6651	netdev_features_t features)
6652{
6653	/* Fix illegal checksum combinations */
6654	if ((features & NETIF_F_HW_CSUM) &&
6655	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6656		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6657		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6658	}
6659
6660	/* TSO requires that SG is present as well. */
6661	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6662		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6663		features &= ~NETIF_F_ALL_TSO;
6664	}
6665
6666	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6667					!(features & NETIF_F_IP_CSUM)) {
6668		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6669		features &= ~NETIF_F_TSO;
6670		features &= ~NETIF_F_TSO_ECN;
6671	}
6672
6673	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6674					 !(features & NETIF_F_IPV6_CSUM)) {
6675		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6676		features &= ~NETIF_F_TSO6;
6677	}
6678
6679	/* TSO ECN requires that TSO is present as well. */
6680	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6681		features &= ~NETIF_F_TSO_ECN;
6682
6683	/* Software GSO depends on SG. */
6684	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6685		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6686		features &= ~NETIF_F_GSO;
6687	}
6688
6689	/* UFO needs SG and checksumming */
6690	if (features & NETIF_F_UFO) {
6691		/* maybe split UFO into V4 and V6? */
6692		if (!(features & NETIF_F_HW_CSUM) &&
6693		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6694		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6695			netdev_dbg(dev,
6696				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6697			features &= ~NETIF_F_UFO;
6698		}
6699
6700		if (!(features & NETIF_F_SG)) {
6701			netdev_dbg(dev,
6702				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6703			features &= ~NETIF_F_UFO;
6704		}
6705	}
6706
6707#ifdef CONFIG_NET_RX_BUSY_POLL
6708	if (dev->netdev_ops->ndo_busy_poll)
6709		features |= NETIF_F_BUSY_POLL;
6710	else
6711#endif
6712		features &= ~NETIF_F_BUSY_POLL;
6713
6714	return features;
6715}
6716
/*
 * Recompute the feature set of @dev and push it to the driver.
 *
 * The wanted features are passed through the driver's ndo_fix_features
 * (if any), netdev_fix_features() and the restrictions of all upper
 * devices.  If the result differs from dev->features it is handed to
 * ndo_set_features (if any).  Lower devices are then synchronised.
 * Asserts that the RTNL is held.
 *
 * Returns 0 when the computed set equals dev->features, -1 when
 * ndo_set_features failed, and 1 otherwise.
 */
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	/* -1 doubles as "nothing changed": it survives only on the
	 * goto sync_lower shortcut and turns into a return value of 0.
	 */
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	/* dev->features is only stored here when err is exactly 0 */
	if (!err)
		dev->features = features;

	return err < 0 ? 0 : 1;
}
6771
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate dev->features and, when __netdev_update_features()
 *	reports a non-zero result, send the feature-change notification.
 *	Should be called after driver or hardware dependent conditions
 *	that influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (!changed)
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6786
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate dev->features and always send the feature-change
 *	notification, whether or not anything changed.  Use this rather
 *	than netdev_update_features() when dev->vlan_features might have
 *	changed as well, so that the change is propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* the result is deliberately ignored: notify unconditionally */
	(void)__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6803
6804/**
6805 *	netif_stacked_transfer_operstate -	transfer operstate
6806 *	@rootdev: the root or lower level device to transfer state from
6807 *	@dev: the device to transfer operstate to
6808 *
6809 *	Transfer operational state from root to device. This is normally
6810 *	called when a stacking relationship exists between the root
6811 *	device and the device(a leaf device).
6812 */
6813void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6814					struct net_device *dev)
6815{
6816	if (rootdev->operstate == IF_OPER_DORMANT)
6817		netif_dormant_on(dev);
6818	else
6819		netif_dormant_off(dev);
6820
6821	if (netif_carrier_ok(rootdev)) {
6822		if (!netif_carrier_ok(dev))
6823			netif_carrier_on(dev);
6824	} else {
6825		if (netif_carrier_ok(dev))
6826			netif_carrier_off(dev);
6827	}
6828}
6829EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6830
#ifdef CONFIG_SYSFS
/* Allocate a zeroed array of dev->num_rx_queues RX queue structures
 * (kzalloc first, vzalloc as fallback), store it in dev->_rx and point
 * every entry back at @dev.  Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int count = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;
	size_t bytes;

	BUG_ON(count < 1);

	bytes = count * sizeof(*queues);
	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;
	for (q = queues; q < queues + count; q++)
		q->dev = dev;

	return 0;
}
#endif
6853
6854static void netdev_init_one_queue(struct net_device *dev,
6855				  struct netdev_queue *queue, void *_unused)
6856{
6857	/* Initialize queue lock */
6858	spin_lock_init(&queue->_xmit_lock);
6859	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6860	queue->xmit_lock_owner = -1;
6861	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6862	queue->dev = dev;
6863#ifdef CONFIG_BQL
6864	dql_init(&queue->dql, HZ);
6865#endif
6866}
6867
6868static void netif_free_tx_queues(struct net_device *dev)
6869{
6870	kvfree(dev->_tx);
6871}
6872
6873static int netif_alloc_netdev_queues(struct net_device *dev)
6874{
6875	unsigned int count = dev->num_tx_queues;
6876	struct netdev_queue *tx;
6877	size_t sz = count * sizeof(*tx);
6878
6879	if (count < 1 || count > 0xffff)
6880		return -EINVAL;
6881
6882	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6883	if (!tx) {
6884		tx = vzalloc(sz);
6885		if (!tx)
6886			return -ENOMEM;
6887	}
6888	dev->_tx = tx;
6889
6890	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6891	spin_lock_init(&dev->tx_global_lock);
6892
6893	return 0;
6894}
6895
6896void netif_tx_stop_all_queues(struct net_device *dev)
6897{
6898	unsigned int i;
6899
6900	for (i = 0; i < dev->num_tx_queues; i++) {
6901		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6902		netif_tx_stop_queue(txq);
6903	}
6904}
6905EXPORT_SYMBOL(netif_tx_stop_all_queues);
6906
/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			/* only negative errnos leave this function:
			 * a positive ndo_init result becomes -EIO
			 */
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	/* Advertising CTAG filtering requires both VLAN add/kill hooks */
	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	/* Pick a fresh ifindex unless one was preset; a preset index that
	 * is already in use in @net fails with -EBUSY.
	 */
	ret = -EBUSY;
	if (!dev->ifindex)
		dev->ifindex = dev_new_index(net);
	else if (__dev_get_by_index(net, dev->ifindex))
		goto err_uninit;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= NETIF_F_SOFT_FEATURES;
	dev->features |= NETIF_F_SOFT_FEATURES;
	dev->wanted_features = dev->features & dev->hw_features;

	if (!(dev->flags & IFF_LOOPBACK)) {
		dev->hw_features |= NETIF_F_NOCACHE_COPY;
	}

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	__netdev_update_features(dev);

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* a notifier returned an error: undo the registration */
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

out:
	return ret;

err_uninit:
	/* reached only from failures after ndo_init and before the
	 * device was listed: give the driver its ndo_uninit call
	 */
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
7050
7051/**
7052 *	init_dummy_netdev	- init a dummy network device for NAPI
7053 *	@dev: device to init
7054 *
7055 *	This takes a network device structure and initialize the minimum
7056 *	amount of fields so it can be used to schedule NAPI polls without
7057 *	registering a full blown interface. This is to be used by drivers
7058 *	that need to tie several hardware interfaces to a single NAPI
7059 *	poll scheduler due to HW limitations.
7060 */
7061int init_dummy_netdev(struct net_device *dev)
7062{
7063	/* Clear everything. Note we don't initialize spinlocks
7064	 * are they aren't supposed to be taken by any of the
7065	 * NAPI code and this dummy netdev is supposed to be
7066	 * only ever used for NAPI polls
7067	 */
7068	memset(dev, 0, sizeof(struct net_device));
7069
7070	/* make sure we BUG if trying to hit standard
7071	 * register/unregister code path
7072	 */
7073	dev->reg_state = NETREG_DUMMY;
7074
7075	/* NAPI wants this */
7076	INIT_LIST_HEAD(&dev->napi_list);
7077
7078	/* a dummy interface is started by default */
7079	set_bit(__LINK_STATE_PRESENT, &dev->state);
7080	set_bit(__LINK_STATE_START, &dev->state);
7081
7082	/* Note : We dont allocate pcpu_refcnt for dummy devices,
7083	 * because users of this 'device' dont need to change
7084	 * its refcount.
7085	 */
7086
7087	return 0;
7088}
7089EXPORT_SYMBOL_GPL(init_dummy_netdev);
7090
7091
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Wrapper around register_netdevice() that takes the rtnl semaphore
 *	for the duration of the call and hands back its result: 0 on
 *	success, a negative errno code on a failure to set up the device
 *	or if the name is a duplicate.  A %NETDEV_REGISTER message is sent
 *	to the netdev notifier chain, and the device name is expanded if
 *	a format string was passed to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
7115
7116int netdev_refcnt_read(const struct net_device *dev)
7117{
7118	int i, refcnt = 0;
7119
7120	for_each_possible_cpu(i)
7121		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7122	return refcnt;
7123}
7124EXPORT_SYMBOL(netdev_refcnt_read);
7125
/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * The reference count is polled every 250 ms.  While it stays non-zero,
 * the UNREGISTER and UNREGISTER_FINAL notifications are re-sent once more
 * than a second has passed since the last broadcast, and a pr_emerg
 * message is logged once more than ten seconds have passed since the last one.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			/* the RTNL is released around rcu_barrier() */
			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
7187
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *      ...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	/* the RTNL is dropped here; it is retaken only briefly below */
	__rtnl_unlock();


	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		rtnl_lock();
		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
		__rtnl_unlock();

		/* anything not in UNREGISTERING state is reported and skipped */
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* sleeps until the reference count has dropped to zero */
		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
7267
7268/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7269 * all the same fields in the same order as net_device_stats, with only
7270 * the type differing, but rtnl_link_stats64 may have additional fields
7271 * at the end for newer counters.
7272 */
7273void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7274			     const struct net_device_stats *netdev_stats)
7275{
7276#if BITS_PER_LONG == 64
7277	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7278	memcpy(stats64, netdev_stats, sizeof(*stats64));
7279	/* zero out counters that only exist in rtnl_link_stats64 */
7280	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7281	       sizeof(*stats64) - sizeof(*netdev_stats));
7282#else
7283	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7284	const unsigned long *src = (const unsigned long *)netdev_stats;
7285	u64 *dst = (u64 *)stats64;
7286
7287	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7288	for (i = 0; i < n; i++)
7289		dst[i] = src[i];
7290	/* zero out counters that only exist in rtnl_link_stats64 */
7291	memset((char *)stats64 + n * sizeof(u64), 0,
7292	       sizeof(*stats64) - n * sizeof(u64));
7293#endif
7294}
7295EXPORT_SYMBOL(netdev_stats_to_stats64);
7296
7297/**
7298 *	dev_get_stats	- get network device statistics
7299 *	@dev: device to get statistics from
7300 *	@storage: place to store stats
7301 *
7302 *	Get network statistics from device. Return @storage.
7303 *	The device driver may provide its own method by setting
7304 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7305 *	otherwise the internal statistics structure is used.
7306 */
7307struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7308					struct rtnl_link_stats64 *storage)
7309{
7310	const struct net_device_ops *ops = dev->netdev_ops;
7311
7312	if (ops->ndo_get_stats64) {
7313		memset(storage, 0, sizeof(*storage));
7314		ops->ndo_get_stats64(dev, storage);
7315	} else if (ops->ndo_get_stats) {
7316		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7317	} else {
7318		netdev_stats_to_stats64(storage, &dev->stats);
7319	}
7320	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7321	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7322	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7323	return storage;
7324}
7325EXPORT_SYMBOL(dev_get_stats);
7326
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* publish only after the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
7344
7345static const struct ethtool_ops default_ethtool_ops;
7346
7347void netdev_set_default_ethtool_ops(struct net_device *dev,
7348				    const struct ethtool_ops *ops)
7349{
7350	if (dev->ethtool_ops == &default_ethtool_ops)
7351		dev->ethtool_ops = ops;
7352}
7353EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7354
7355void netdev_freemem(struct net_device *dev)
7356{
7357	char *addr = (char *)dev - dev->padded;
7358
7359	kvfree(addr);
7360}
7361
/**
 *	alloc_netdev_mqs - allocate network device
 *	@sizeof_priv:		size of private data to allocate space for
 *	@name:			device name format string
 *	@name_assign_type: 	origin of device name
 *	@setup:			callback to initialize device
 *	@txqs:			the number of TX subqueues to allocate
 *	@rxqs:			the number of RX subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device.
 *
 *	Returns the new device, or NULL when @txqs is zero, when @rxqs is
 *	zero (with CONFIG_SYSFS), or when any allocation fails.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	/* this check is what makes the strcpy() into dev->name below safe */
	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

#ifdef CONFIG_SYSFS
	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}
#endif

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!p)
		p = vzalloc(alloc_size);
	if (!p)
		return NULL;

	/* dev is the aligned pointer inside the allocation; padded records
	 * how far it is from the start so netdev_freemem() can undo it
	 */
	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->gso_min_segs = 0;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->all_adj_list.upper);
	INIT_LIST_HEAD(&dev->all_adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	setup(dev);

	/* a tx_queue_len still zero after setup() marks the device
	 * IFF_NO_QUEUE and the length is forced to 1
	 */
	if (!dev->tx_queue_len) {
		dev->priv_flags |= IFF_NO_QUEUE;
		dev->tx_queue_len = 1;
	}

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

#ifdef CONFIG_SYSFS
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;
#endif

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	/* keep any ethtool_ops that setup() installed */
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_ingress_init(dev);

	return dev;

free_all:
	/* past setup(): let free_netdev() release everything */
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
7483
/**
 *	free_netdev - free network device
 *	@dev: device
 *
 *	This function does the last stage of destroying an allocated device
 * 	interface. The reference to the device object is released.
 *	If this is the last reference then it will be freed.
 *	Must be called in process context.
 *
 *	A device that was never registered (NETREG_UNINITIALIZED) has its
 *	memory freed directly; any other device must be in state
 *	NETREG_UNREGISTERED.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	might_sleep();
	netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
	kvfree(dev->_rx);
#endif

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	/* delete every NAPI context still on the device's napi_list */
	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/*  Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		netdev_freemem(dev);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
7527
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.  The expedited RCU
 *	grace period is used when the RTNL is locked, the normal one
 *	otherwise.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7543
7544/**
7545 *	unregister_netdevice_queue - remove device from the kernel
7546 *	@dev: device
7547 *	@head: list
7548 *
7549 *	This function shuts down a device interface and removes it
7550 *	from the kernel tables.
7551 *	If head not NULL, device is queued to be unregistered later.
7552 *
7553 *	Callers must hold the rtnl semaphore.  You may want
7554 *	unregister_netdev() instead of this.
7555 */
7556
7557void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7558{
7559	ASSERT_RTNL();
7560
7561	if (head) {
7562		list_move_tail(&dev->unreg_list, head);
7563	} else {
7564		rollback_registered(dev);
7565		/* Finish processing unregister after unlock */
7566		net_set_todo(dev);
7567	}
7568}
7569EXPORT_SYMBOL(unregister_netdevice_queue);
7570
7571/**
7572 *	unregister_netdevice_many - unregister many devices
7573 *	@head: list of devices
7574 *
7575 *  Note: As most callers use a stack allocated list_head,
7576 *  we force a list_del() to make sure stack wont be corrupted later.
7577 */
7578void unregister_netdevice_many(struct list_head *head)
7579{
7580	struct net_device *dev;
7581
7582	if (!list_empty(head)) {
7583		rollback_registered_many(head);
7584		list_for_each_entry(dev, head, unreg_list)
7585			net_set_todo(dev);
7586		list_del(head);
7587	}
7588}
7589EXPORT_SYMBOL(unregister_netdevice_many);
7590
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel
 *	tables.
 *
 *	This simply brackets unregister_netdevice() with rtnl_lock() and
 *	rtnl_unlock().  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7609
7610/**
7611 *	dev_change_net_namespace - move device to different nethost namespace
7612 *	@dev: device
7613 *	@net: network namespace
7614 *	@pat: If not NULL name pattern to try if the current device name
7615 *	      is already taken in the destination network namespace.
7616 *
7617 *	This function shuts down a device interface and moves it
7618 *	to a new network namespace. On success 0 is returned, on
7619 *	a failure a netagive errno code is returned.
7620 *
7621 *	Callers must hold the rtnl semaphore.
7622 */
7623
7624int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7625{
7626	int err;
7627
7628	ASSERT_RTNL();
7629
7630	/* Don't allow namespace local devices to be moved. */
7631	err = -EINVAL;
7632	if (dev->features & NETIF_F_NETNS_LOCAL)
7633		goto out;
7634
7635	/* Ensure the device has been registrered */
7636	if (dev->reg_state != NETREG_REGISTERED)
7637		goto out;
7638
7639	/* Get out if there is nothing todo */
7640	err = 0;
7641	if (net_eq(dev_net(dev), net))
7642		goto out;
7643
7644	/* Pick the destination device name, and ensure
7645	 * we can use it in the destination network namespace.
7646	 */
7647	err = -EEXIST;
7648	if (__dev_get_by_name(net, dev->name)) {
7649		/* We get here if we can't use the current device name */
7650		if (!pat)
7651			goto out;
7652		if (dev_get_valid_name(net, dev, pat) < 0)
7653			goto out;
7654	}
7655
7656	/*
7657	 * And now a mini version of register_netdevice unregister_netdevice.
7658	 */
7659
7660	/* If device is running close it first. */
7661	dev_close(dev);
7662
7663	/* And unlink it from device chain */
7664	err = -ENODEV;
7665	unlist_netdevice(dev);
7666
7667	synchronize_net();
7668
7669	/* Shutdown queueing discipline. */
7670	dev_shutdown(dev);
7671
7672	/* Notify protocols, that we are about to destroy
7673	   this device. They should clean all the things.
7674
7675	   Note that dev->reg_state stays at NETREG_REGISTERED.
7676	   This is wanted because this way 8021q and macvlan know
7677	   the device is just moving and can keep their slaves up.
7678	*/
7679	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7680	rcu_barrier();
7681	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7682	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7683
7684	/*
7685	 *	Flush the unicast and multicast chains
7686	 */
7687	dev_uc_flush(dev);
7688	dev_mc_flush(dev);
7689
7690	/* Send a netdev-removed uevent to the old namespace */
7691	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7692	netdev_adjacent_del_links(dev);
7693
7694	/* Actually switch the network namespace */
7695	dev_net_set(dev, net);
7696
7697	/* If there is an ifindex conflict assign a new one */
7698	if (__dev_get_by_index(net, dev->ifindex))
7699		dev->ifindex = dev_new_index(net);
7700
7701	/* Send a netdev-add uevent to the new namespace */
7702	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7703	netdev_adjacent_add_links(dev);
7704
7705	/* Fixup kobjects */
7706	err = device_rename(&dev->dev, dev->name);
7707	WARN_ON(err);
7708
7709	/* Add the device back in the hashes */
7710	list_netdevice(dev);
7711
7712	/* Notify protocols, that a new device appeared. */
7713	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7714
7715	/*
7716	 *	Prevent userspace races by waiting until the network
7717	 *	device is fully setup before sending notifications.
7718	 */
7719	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7720
7721	synchronize_net();
7722	err = 0;
7723out:
7724	return err;
7725}
7726EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7727
7728static int dev_cpu_callback(struct notifier_block *nfb,
7729			    unsigned long action,
7730			    void *ocpu)
7731{
7732	struct sk_buff **list_skb;
7733	struct sk_buff *skb;
7734	unsigned int cpu, oldcpu = (unsigned long)ocpu;
7735	struct softnet_data *sd, *oldsd;
7736
7737	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7738		return NOTIFY_OK;
7739
7740	local_irq_disable();
7741	cpu = smp_processor_id();
7742	sd = &per_cpu(softnet_data, cpu);
7743	oldsd = &per_cpu(softnet_data, oldcpu);
7744
7745	/* Find end of our completion_queue. */
7746	list_skb = &sd->completion_queue;
7747	while (*list_skb)
7748		list_skb = &(*list_skb)->next;
7749	/* Append completion queue from offline CPU. */
7750	*list_skb = oldsd->completion_queue;
7751	oldsd->completion_queue = NULL;
7752
7753	/* Append output queue from offline CPU. */
7754	if (oldsd->output_queue) {
7755		*sd->output_queue_tailp = oldsd->output_queue;
7756		sd->output_queue_tailp = oldsd->output_queue_tailp;
7757		oldsd->output_queue = NULL;
7758		oldsd->output_queue_tailp = &oldsd->output_queue;
7759	}
7760	/* Append NAPI poll list from offline CPU, with one exception :
7761	 * process_backlog() must be called by cpu owning percpu backlog.
7762	 * We properly handle process_queue & input_pkt_queue later.
7763	 */
7764	while (!list_empty(&oldsd->poll_list)) {
7765		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7766							    struct napi_struct,
7767							    poll_list);
7768
7769		list_del_init(&napi->poll_list);
7770		if (napi->poll == process_backlog)
7771			napi->state = 0;
7772		else
7773			____napi_schedule(sd, napi);
7774	}
7775
7776	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7777	local_irq_enable();
7778
7779	/* Process offline CPU's input_pkt_queue */
7780	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7781		netif_rx_ni(skb);
7782		input_queue_head_incr(oldsd);
7783	}
7784	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7785		netif_rx_ni(skb);
7786		input_queue_head_incr(oldsd);
7787	}
7788
7789	return NOTIFY_OK;
7790}
7791
7792
7793/**
7794 *	netdev_increment_features - increment feature set by one
7795 *	@all: current feature set
7796 *	@one: new feature set
7797 *	@mask: mask feature set
7798 *
7799 *	Computes a new feature set after adding a device with feature set
7800 *	@one to the master device with current feature set @all.  Will not
7801 *	enable anything that is off in @mask. Returns the new feature set.
7802 */
7803netdev_features_t netdev_increment_features(netdev_features_t all,
7804	netdev_features_t one, netdev_features_t mask)
7805{
7806	if (mask & NETIF_F_HW_CSUM)
7807		mask |= NETIF_F_CSUM_MASK;
7808	mask |= NETIF_F_VLAN_CHALLENGED;
7809
7810	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7811	all &= one | ~NETIF_F_ALL_FOR_ALL;
7812
7813	/* If one device supports hw checksumming, set for all. */
7814	if (all & NETIF_F_HW_CSUM)
7815		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7816
7817	return all;
7818}
7819EXPORT_SYMBOL(netdev_increment_features);
7820
7821static struct hlist_head * __net_init netdev_create_hash(void)
7822{
7823	int i;
7824	struct hlist_head *hash;
7825
7826	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7827	if (hash != NULL)
7828		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7829			INIT_HLIST_HEAD(&hash[i]);
7830
7831	return hash;
7832}
7833
7834/* Initialize per network namespace state */
7835static int __net_init netdev_init(struct net *net)
7836{
7837	if (net != &init_net)
7838		INIT_LIST_HEAD(&net->dev_base_head);
7839
7840	net->dev_name_head = netdev_create_hash();
7841	if (net->dev_name_head == NULL)
7842		goto err_name;
7843
7844	net->dev_index_head = netdev_create_hash();
7845	if (net->dev_index_head == NULL)
7846		goto err_idx;
7847
7848	return 0;
7849
7850err_idx:
7851	kfree(net->dev_name_head);
7852err_name:
7853	return -ENOMEM;
7854}
7855
7856/**
7857 *	netdev_drivername - network driver for the device
7858 *	@dev: network device
7859 *
7860 *	Determine network driver for device.
7861 */
7862const char *netdev_drivername(const struct net_device *dev)
7863{
7864	const struct device_driver *driver;
7865	const struct device *parent;
7866	const char *empty = "";
7867
7868	parent = dev->dev.parent;
7869	if (!parent)
7870		return empty;
7871
7872	driver = parent->driver;
7873	if (driver && driver->name)
7874		return driver->name;
7875	return empty;
7876}
7877
7878static void __netdev_printk(const char *level, const struct net_device *dev,
7879			    struct va_format *vaf)
7880{
7881	if (dev && dev->dev.parent) {
7882		dev_printk_emit(level[1] - '0',
7883				dev->dev.parent,
7884				"%s %s %s%s: %pV",
7885				dev_driver_string(dev->dev.parent),
7886				dev_name(dev->dev.parent),
7887				netdev_name(dev), netdev_reg_state(dev),
7888				vaf);
7889	} else if (dev) {
7890		printk("%s%s%s: %pV",
7891		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7892	} else {
7893		printk("%s(NULL net_device): %pV", level, vaf);
7894	}
7895}
7896
7897void netdev_printk(const char *level, const struct net_device *dev,
7898		   const char *format, ...)
7899{
7900	struct va_format vaf;
7901	va_list args;
7902
7903	va_start(args, format);
7904
7905	vaf.fmt = format;
7906	vaf.va = &args;
7907
7908	__netdev_printk(level, dev, &vaf);
7909
7910	va_end(args);
7911}
7912EXPORT_SYMBOL(netdev_printk);
7913
7914#define define_netdev_printk_level(func, level)			\
7915void func(const struct net_device *dev, const char *fmt, ...)	\
7916{								\
7917	struct va_format vaf;					\
7918	va_list args;						\
7919								\
7920	va_start(args, fmt);					\
7921								\
7922	vaf.fmt = fmt;						\
7923	vaf.va = &args;						\
7924								\
7925	__netdev_printk(level, dev, &vaf);			\
7926								\
7927	va_end(args);						\
7928}								\
7929EXPORT_SYMBOL(func);
7930
7931define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7932define_netdev_printk_level(netdev_alert, KERN_ALERT);
7933define_netdev_printk_level(netdev_crit, KERN_CRIT);
7934define_netdev_printk_level(netdev_err, KERN_ERR);
7935define_netdev_printk_level(netdev_warn, KERN_WARNING);
7936define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7937define_netdev_printk_level(netdev_info, KERN_INFO);
7938
/* Per network namespace teardown: free the name and ifindex hash tables
 * that netdev_init() allocated for @net.
 */
7939static void __net_exit netdev_exit(struct net *net)
7940{
7941	kfree(net->dev_name_head);
7942	kfree(net->dev_index_head);
7943}
7944
/* Per network namespace hooks for the core device state: netdev_init()
 * sets up a namespace's device hashes and netdev_exit() frees them.
 * Registered from net_dev_init() with register_pernet_subsys().
 */
7945static struct pernet_operations __net_initdata netdev_net_ops = {
7946	.init = netdev_init,
7947	.exit = netdev_exit,
7948};
7949
7950static void __net_exit default_device_exit(struct net *net)
7951{
7952	struct net_device *dev, *aux;
7953	/*
7954	 * Push all migratable network devices back to the
7955	 * initial network namespace
7956	 */
7957	rtnl_lock();
7958	for_each_netdev_safe(net, dev, aux) {
7959		int err;
7960		char fb_name[IFNAMSIZ];
7961
7962		/* Ignore unmoveable devices (i.e. loopback) */
7963		if (dev->features & NETIF_F_NETNS_LOCAL)
7964			continue;
7965
7966		/* Leave virtual devices for the generic cleanup */
7967		if (dev->rtnl_link_ops)
7968			continue;
7969
7970		/* Push remaining network devices to init_net */
7971		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7972		err = dev_change_net_namespace(dev, &init_net, fb_name);
7973		if (err) {
7974			pr_emerg("%s: failed to move %s to init_net: %d\n",
7975				 __func__, dev->name, err);
7976			BUG();
7977		}
7978	}
7979	rtnl_unlock();
7980}
7981
7982static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7983{
7984	/* Return with the rtnl_lock held when there are no network
7985	 * devices unregistering in any network namespace in net_list.
7986	 */
7987	struct net *net;
7988	bool unregistering;
7989	DEFINE_WAIT_FUNC(wait, woken_wake_function);
7990
7991	add_wait_queue(&netdev_unregistering_wq, &wait);
7992	for (;;) {
7993		unregistering = false;
7994		rtnl_lock();
7995		list_for_each_entry(net, net_list, exit_list) {
7996			if (net->dev_unreg_count > 0) {
7997				unregistering = true;
7998				break;
7999			}
8000		}
8001		if (!unregistering)
8002			break;
8003		__rtnl_unlock();
8004
8005		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8006	}
8007	remove_wait_queue(&netdev_unregistering_wq, &wait);
8008}
8009
8010static void __net_exit default_device_exit_batch(struct list_head *net_list)
8011{
8012	/* At exit all network devices most be removed from a network
8013	 * namespace.  Do this in the reverse order of registration.
8014	 * Do this across as many network namespaces as possible to
8015	 * improve batching efficiency.
8016	 */
8017	struct net_device *dev;
8018	struct net *net;
8019	LIST_HEAD(dev_kill_list);
8020
8021	/* To prevent network device cleanup code from dereferencing
8022	 * loopback devices or network devices that have been freed
8023	 * wait here for all pending unregistrations to complete,
8024	 * before unregistring the loopback device and allowing the
8025	 * network namespace be freed.
8026	 *
8027	 * The netdev todo list containing all network devices
8028	 * unregistrations that happen in default_device_exit_batch
8029	 * will run in the rtnl_unlock() at the end of
8030	 * default_device_exit_batch.
8031	 */
8032	rtnl_lock_unregistering(net_list);
8033	list_for_each_entry(net, net_list, exit_list) {
8034		for_each_netdev_reverse(net, dev) {
8035			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8036				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8037			else
8038				unregister_netdevice_queue(dev, &dev_kill_list);
8039		}
8040	}
8041	unregister_netdevice_many(&dev_kill_list);
8042	rtnl_unlock();
8043}
8044
/* Namespace exit hooks for the devices themselves: default_device_exit()
 * moves migratable devices back to init_net, default_device_exit_batch()
 * unregisters what is left.  Registered from net_dev_init() with
 * register_pernet_device().
 */
8045static struct pernet_operations __net_initdata default_device_ops = {
8046	.exit = default_device_exit,
8047	.exit_batch = default_device_exit_batch,
8048};
8049
8050/*
8051 *	Initialize the DEV module. At boot time this walks the device list and
8052 *	unhooks any devices that fail to initialise (normally hardware not
8053 *	present) and leaves us with a valid list of present and active devices.
8054 *
8055 */
8056
8057/*
8058 *       This is called single threaded during boot, so no need
8059 *       to take the rtnl semaphore.
8060 */
8061static int __init net_dev_init(void)
8062{
8063	int i, rc = -ENOMEM;
8064
8065	BUG_ON(!dev_boot_phase);
8066
8067	if (dev_proc_init())
8068		goto out;
8069
8070	if (netdev_kobject_init())
8071		goto out;
8072
8073	INIT_LIST_HEAD(&ptype_all);
8074	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8075		INIT_LIST_HEAD(&ptype_base[i]);
8076
8077	INIT_LIST_HEAD(&offload_base);
8078
8079	if (register_pernet_subsys(&netdev_net_ops))
8080		goto out;
8081
8082	/*
8083	 *	Initialise the packet receive queues.
8084	 */
8085
8086	for_each_possible_cpu(i) {
8087		struct softnet_data *sd = &per_cpu(softnet_data, i);
8088
8089		skb_queue_head_init(&sd->input_pkt_queue);
8090		skb_queue_head_init(&sd->process_queue);
8091		INIT_LIST_HEAD(&sd->poll_list);
8092		sd->output_queue_tailp = &sd->output_queue;
8093#ifdef CONFIG_RPS
8094		sd->csd.func = rps_trigger_softirq;
8095		sd->csd.info = sd;
8096		sd->cpu = i;
8097#endif
8098
8099		sd->backlog.poll = process_backlog;
8100		sd->backlog.weight = weight_p;
8101	}
8102
8103	dev_boot_phase = 0;
8104
8105	/* The loopback device is special if any other network devices
8106	 * is present in a network namespace the loopback device must
8107	 * be present. Since we now dynamically allocate and free the
8108	 * loopback device ensure this invariant is maintained by
8109	 * keeping the loopback device as the first device on the
8110	 * list of network devices.  Ensuring the loopback devices
8111	 * is the first device that appears and the last network device
8112	 * that disappears.
8113	 */
8114	if (register_pernet_device(&loopback_net_ops))
8115		goto out;
8116
8117	if (register_pernet_device(&default_device_ops))
8118		goto out;
8119
8120	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8121	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8122
8123	hotcpu_notifier(dev_cpu_callback, 0);
8124	dst_subsys_init();
8125	rc = 0;
8126out:
8127	return rc;
8128}
8129
8130subsys_initcall(net_dev_init);

    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
    4 *
    5 *	Derived from the non IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitops.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/mm.h>
   81#include <linux/mutex.h>
   82#include <linux/rwsem.h>
   83#include <linux/string.h>
   84#include <linux/mm.h>
   85#include <linux/socket.h>
   86#include <linux/sockios.h>
   87#include <linux/errno.h>
   88#include <linux/interrupt.h>
   89#include <linux/if_ether.h>
   90#include <linux/netdevice.h>
   91#include <linux/etherdevice.h>
   92#include <linux/ethtool.h>
   93#include <linux/skbuff.h>
   94#include <linux/bpf.h>
   95#include <linux/bpf_trace.h>
   96#include <net/net_namespace.h>
   97#include <net/sock.h>
   98#include <net/busy_poll.h>
   99#include <linux/rtnetlink.h>
  100#include <linux/stat.h>
  101#include <net/dst.h>
  102#include <net/dst_metadata.h>
  103#include <net/pkt_sched.h>
  104#include <net/pkt_cls.h>
  105#include <net/checksum.h>
  106#include <net/xfrm.h>
  107#include <linux/highmem.h>
  108#include <linux/init.h>
  109#include <linux/module.h>
  110#include <linux/netpoll.h>
  111#include <linux/rcupdate.h>
  112#include <linux/delay.h>
  113#include <net/iw_handler.h>
  114#include <asm/current.h>
  115#include <linux/audit.h>
  116#include <linux/dmaengine.h>
  117#include <linux/err.h>
  118#include <linux/ctype.h>
  119#include <linux/if_arp.h>
  120#include <linux/if_vlan.h>
  121#include <linux/ip.h>
  122#include <net/ip.h>
  123#include <net/mpls.h>
  124#include <linux/ipv6.h>
  125#include <linux/in.h>
  126#include <linux/jhash.h>
  127#include <linux/random.h>
  128#include <trace/events/napi.h>
  129#include <trace/events/net.h>
  130#include <trace/events/skb.h>
  131#include <linux/inetdevice.h>
  132#include <linux/cpu_rmap.h>
  133#include <linux/static_key.h>
  134#include <linux/hashtable.h>
  135#include <linux/vmalloc.h>
  136#include <linux/if_macvlan.h>
  137#include <linux/errqueue.h>
  138#include <linux/hrtimer.h>
  139#include <linux/netfilter_ingress.h>
  140#include <linux/crash_dump.h>
  141#include <linux/sctp.h>
  142#include <net/udp_tunnel.h>
  143#include <linux/net_namespace.h>
  144#include <linux/indirect_call_wrapper.h>
  145#include <net/devlink.h>
  146#include <linux/pm_runtime.h>
  147
  148#include "net-sysfs.h"
  149
  /* NOTE(review): going by the name, a cap on the number of skbs held for
   * GRO; the users are outside this chunk -- confirm at the use site.
   */
  150#define MAX_GRO_SKBS 8
  151
  152/* This should be increased if a protocol with a bigger head is added. */
  153#define GRO_MAX_HEAD (MAX_HEADER + 128)
  154
  /* NOTE(review): ptype_lock and offload_lock presumably serialise updates
   * of the ptype_base/ptype_all and offload_base lists declared right
   * below -- the code taking them is outside this chunk, confirm there.
   */
  155static DEFINE_SPINLOCK(ptype_lock);
  156static DEFINE_SPINLOCK(offload_lock);
  157struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  158struct list_head ptype_all __read_mostly;	/* Taps */
  159static struct list_head offload_base __read_mostly;
  160
/* Forward declarations of static helpers used before their definition. */
  161static int netif_rx_internal(struct sk_buff *skb);
  162static int call_netdevice_notifiers_info(unsigned long val,
  163					 struct netdev_notifier_info *info);
  164static int call_netdevice_notifiers_extack(unsigned long val,
  165					   struct net_device *dev,
  166					   struct netlink_ext_ack *extack);
  167static struct napi_struct *napi_by_id(unsigned int napi_id);
  168
  169/*
  170 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  171 * semaphore.
  172 *
  173 * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
  174 *
  175 * Writers must hold the rtnl semaphore while they loop through the
  176 * dev_base_head list, and hold dev_base_lock for writing when they do the
  177 * actual updates.  This allows pure readers to access the list even
  178 * while a writer is preparing to update it.
  179 *
  180 * To put it another way, dev_base_lock is held for writing only to
  181 * protect against pure readers; the rtnl semaphore provides the
  182 * protection against other writers.
  183 *
  184 * For example usages see register_netdevice() and
  185 * unregister_netdevice(), which must be called with the rtnl
  186 * semaphore held.
  187 */
  188DEFINE_RWLOCK(dev_base_lock);
  189EXPORT_SYMBOL(dev_base_lock);
  190
  /* NOTE(review): by its name, serialises updates of a device's ifalias;
   * the users are outside this chunk -- confirm.
   */
  191static DEFINE_MUTEX(ifalias_mutex);
  192
  193/* protects napi_hash addition/deletion and napi_gen_id */
  194static DEFINE_SPINLOCK(napi_hash_lock);
  195
/* The NAPI id generator starts at NR_CPUS; napi_hash is declared with
 * 8 hash bits.
 */
  196static unsigned int napi_gen_id = NR_CPUS;
  197static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  198
/* NOTE(review): by its name, guards device renames -- confirm at users. */
  199static DECLARE_RWSEM(devnet_rename_sem);
  200
  201static inline void dev_base_seq_inc(struct net *net)
  202{
  203	while (++net->dev_base_seq == 0)
  204		;
  205}
  206
  207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  208{
  209	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  210
  211	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  212}
  213
  214static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  215{
  216	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  217}
  218
  219static inline void rps_lock(struct softnet_data *sd)
  220{
  221#ifdef CONFIG_RPS
  222	spin_lock(&sd->input_pkt_queue.lock);
  223#endif
  224}
  225
  226static inline void rps_unlock(struct softnet_data *sd)
  227{
  228#ifdef CONFIG_RPS
  229	spin_unlock(&sd->input_pkt_queue.lock);
  230#endif
  231}
  232
  233static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
  234						       const char *name)
  235{
  236	struct netdev_name_node *name_node;
  237
  238	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
  239	if (!name_node)
  240		return NULL;
  241	INIT_HLIST_NODE(&name_node->hlist);
  242	name_node->dev = dev;
  243	name_node->name = name;
  244	return name_node;
  245}
  246
  247static struct netdev_name_node *
  248netdev_name_node_head_alloc(struct net_device *dev)
  249{
  250	struct netdev_name_node *name_node;
  251
  252	name_node = netdev_name_node_alloc(dev, dev->name);
  253	if (!name_node)
  254		return NULL;
  255	INIT_LIST_HEAD(&name_node->list);
  256	return name_node;
  257}
  258
  /* Free @name_node itself; the name string it points to is not freed here. */
  259static void netdev_name_node_free(struct netdev_name_node *name_node)
  260{
  261	kfree(name_node);
  262}
  263
  264static void netdev_name_node_add(struct net *net,
  265				 struct netdev_name_node *name_node)
  266{
  267	hlist_add_head_rcu(&name_node->hlist,
  268			   dev_name_hash(net, name_node->name));
  269}
  270
  /* Unlink @name_node from its name hash chain with hlist_del_rcu();
   * the node itself is not freed here.
   */
  271static void netdev_name_node_del(struct netdev_name_node *name_node)
  272{
  273	hlist_del_rcu(&name_node->hlist);
  274}
  275
  276static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
  277							const char *name)
  278{
  279	struct hlist_head *head = dev_name_hash(net, name);
  280	struct netdev_name_node *name_node;
  281
  282	hlist_for_each_entry(name_node, head, hlist)
  283		if (!strcmp(name_node->name, name))
  284			return name_node;
  285	return NULL;
  286}
  287
  288static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
  289							    const char *name)
  290{
  291	struct hlist_head *head = dev_name_hash(net, name);
  292	struct netdev_name_node *name_node;
  293
  294	hlist_for_each_entry_rcu(name_node, head, hlist)
  295		if (!strcmp(name_node->name, name))
  296			return name_node;
  297	return NULL;
  298}
  299
  300int netdev_name_node_alt_create(struct net_device *dev, const char *name)
  301{
  302	struct netdev_name_node *name_node;
  303	struct net *net = dev_net(dev);
  304
  305	name_node = netdev_name_node_lookup(net, name);
  306	if (name_node)
  307		return -EEXIST;
  308	name_node = netdev_name_node_alloc(dev, name);
  309	if (!name_node)
  310		return -ENOMEM;
  311	netdev_name_node_add(net, name_node);
  312	/* The node that holds dev->name acts as a head of per-device list. */
  313	list_add_tail(&name_node->list, &dev->name_node->list);
  314
  315	return 0;
  316}
  317EXPORT_SYMBOL(netdev_name_node_alt_create);
  318
  /* Tear down one alternative name node: take it off the per-device list,
   * unlink it from the name hash, then free the name string and finally
   * the node.  The name is freed first because the node holds the only
   * pointer to it that is used here.
   */
  319static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
  320{
  321	list_del(&name_node->list);
  322	netdev_name_node_del(name_node);
  323	kfree(name_node->name);
  324	netdev_name_node_free(name_node);
  325}
  326
  327int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  328{
  329	struct netdev_name_node *name_node;
  330	struct net *net = dev_net(dev);
  331
  332	name_node = netdev_name_node_lookup(net, name);
  333	if (!name_node)
  334		return -ENOENT;
  335	/* lookup might have found our primary name or a name belonging
  336	 * to another device.
  337	 */
  338	if (name_node == dev->name_node || name_node->dev != dev)
  339		return -EINVAL;
  340
  341	__netdev_name_node_alt_destroy(name_node);
  342
  343	return 0;
  344}
  345EXPORT_SYMBOL(netdev_name_node_alt_destroy);
  346
  347static void netdev_name_node_alt_flush(struct net_device *dev)
  348{
  349	struct netdev_name_node *name_node, *tmp;
  350
  351	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
  352		__netdev_name_node_alt_destroy(name_node);
  353}
  354
  355/* Device list insertion */
  356static void list_netdevice(struct net_device *dev)
  357{
  358	struct net *net = dev_net(dev);
  359
  360	ASSERT_RTNL();
  361
  362	write_lock_bh(&dev_base_lock);
  363	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  364	netdev_name_node_add(net, dev->name_node);
  365	hlist_add_head_rcu(&dev->index_hlist,
  366			   dev_index_hash(net, dev->ifindex));
  367	write_unlock_bh(&dev_base_lock);
  368
  369	dev_base_seq_inc(net);
  370}
  371
  372/* Device list removal
  373 * Unlinks dev from the device list, the name hash and the ifindex hash.
  374 * The caller must hold the rtnl semaphore and must respect a RCU grace
  374 * period before freeing/reusing dev.
  374 */
  375static void unlist_netdevice(struct net_device *dev)
  376{
  377	ASSERT_RTNL();
  378
  379	/* Unlink dev from the device chain */
  380	write_lock_bh(&dev_base_lock);
  381	list_del_rcu(&dev->dev_list);
  382	netdev_name_node_del(dev->name_node);
  383	hlist_del_rcu(&dev->index_hlist);
  384	write_unlock_bh(&dev_base_lock);
  385
	/* The set of devices changed: bump the namespace's dev_base_seq */
  386	dev_base_seq_inc(dev_net(dev));
  387}
  388
  389/*
  390 *	Our notifier list
  390 *
  390 *	NOTE(review): declared as a raw notifier head; the locking that
  390 *	protects it is not visible in this chunk -- presumably the rtnl
  390 *	semaphore, confirm at the register/call sites.
  391 */
  392
  393static RAW_NOTIFIER_HEAD(netdev_chain);
  394
  395/*
  396 *	Device drivers call our routines to queue packets here. We empty the
  397 *	queue in the local softnet handler.
  397 *
  397 *	One struct softnet_data exists per CPU; the symbol is exported.
  398 */
  399
  400DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  401EXPORT_PER_CPU_SYMBOL(softnet_data);
  402
  403#ifdef CONFIG_LOCKDEP
  404/*
  405 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  406 * according to dev->type
  406 *
  406 * netdev_lock_type[] lists the device types that get a lock class of
  406 * their own.  netdev_lock_pos() maps a type to its position in this
  406 * table and falls back to the last entry for any type not listed.
  407 */
  408static const unsigned short netdev_lock_type[] = {
  409	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
  410	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
  411	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
  412	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
  413	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
  414	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
  415	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
  416	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
  417	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
  418	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
  419	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
  420	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
  421	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
  422	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
  423	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
  424
  /* Lock class names, indexed by the same position as netdev_lock_type[]:
   * entry i names the class used for the device type netdev_lock_type[i].
   */
  425static const char *const netdev_lock_name[] = {
  426	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
  427	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
  428	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
  429	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
  430	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
  431	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
  432	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
  433	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
  434	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
  435	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
  436	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
  437	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
  438	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
  439	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
  440	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
  441
  /* One lockdep class key per netdev_lock_type[] entry, kept separately
   * for the xmit locks and for the address list locks.
   */
  442static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  443static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
  444
  445static inline unsigned short netdev_lock_pos(unsigned short dev_type)
  446{
  447	int i;
  448
  449	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
  450		if (netdev_lock_type[i] == dev_type)
  451			return i;
  452	/* the last key is used by default */
  453	return ARRAY_SIZE(netdev_lock_type) - 1;
  454}
  455
  456static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  457						 unsigned short dev_type)
  458{
  459	int i;
  460
  461	i = netdev_lock_pos(dev_type);
  462	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
  463				   netdev_lock_name[i]);
  464}
  465
  466static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  467{
  468	int i;
  469
  470	i = netdev_lock_pos(dev->type);
  471	lockdep_set_class_and_name(&dev->addr_list_lock,
  472				   &netdev_addr_lock_key[i],
  473				   netdev_lock_name[i]);
  474}
  475#else
  476static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  477						 unsigned short dev_type)
  478{
  479}
  480
  481static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  482{
  483}
  484#endif
  485
  486/*******************************************************************************
  487 *
  488 *		Protocol management and registration routines
  489 *
  490 *******************************************************************************/
  491
  492
  493/*
  494 *	Add a protocol ID to the list. Now that the input handler is
  495 *	smarter we can dispense with all the messy stuff that used to be
  496 *	here.
  497 *
  498 *	BEWARE!!! Protocol handlers, mangling input packets,
  499 *	MUST BE last in hash buckets and checking protocol handlers
  500 *	MUST start from promiscuous ptype_all chain in net_bh.
  501 *	It is true now, do not change it.
  502 *	Explanation follows: if protocol handler, mangling packet, will
  503 *	be the first on list, it is not able to sense, that packet
  504 *	is cloned and should be copied-on-write, so that it will
  505 *	change it and subsequent readers will get broken packet.
  506 *							--ANK (980803)
  507 */
  508
  509static inline struct list_head *ptype_head(const struct packet_type *pt)
  510{
  511	if (pt->type == htons(ETH_P_ALL))
  512		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
  513	else
  514		return pt->dev ? &pt->dev->ptype_specific :
  515				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  516}
  517
  518/**
  519 *	dev_add_pack - add packet handler
  520 *	@pt: packet type declaration
  521 *
  522 *	Add a protocol handler to the networking stack. The passed &packet_type
  523 *	is linked into kernel lists and may not be freed until it has been
  524 *	removed from the kernel lists.
  525 *
  526 *	This call does not sleep therefore it can not
  527 *	guarantee all CPU's that are in middle of receiving packets
  528 *	will see the new packet type (until the next received packet).
  529 */
  530
  531void dev_add_pack(struct packet_type *pt)
  532{
  533	struct list_head *head = ptype_head(pt);
  534
  535	spin_lock(&ptype_lock);
  536	list_add_rcu(&pt->list, head);
  537	spin_unlock(&ptype_lock);
  538}
  539EXPORT_SYMBOL(dev_add_pack);
  540
  541/**
  542 *	__dev_remove_pack	 - remove packet handler
  543 *	@pt: packet type declaration
  544 *
  545 *	Remove a protocol handler that was previously added to the kernel
  546 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
   547 *	from the kernel lists, but this function does not wait for receivers
   548 *	that may still be walking those lists.
  549 *
  550 *      The packet type might still be in use by receivers
  551 *	and must not be freed until after all the CPU's have gone
  552 *	through a quiescent state.
  553 */
  554void __dev_remove_pack(struct packet_type *pt)
  555{
  556	struct list_head *head = ptype_head(pt);
  557	struct packet_type *pt1;
  558
  559	spin_lock(&ptype_lock);
  560
  561	list_for_each_entry(pt1, head, list) {
  562		if (pt == pt1) {
  563			list_del_rcu(&pt->list);
  564			goto out;
  565		}
  566	}
  567
  568	pr_warn("dev_remove_pack: %p not found\n", pt);
  569out:
  570	spin_unlock(&ptype_lock);
  571}
  572EXPORT_SYMBOL(__dev_remove_pack);
  573
  574/**
  575 *	dev_remove_pack	 - remove packet handler
  576 *	@pt: packet type declaration
  577 *
  578 *	Remove a protocol handler that was previously added to the kernel
  579 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  580 *	from the kernel lists and can be freed or reused once this function
  581 *	returns.
  582 *
  583 *	This call sleeps to guarantee that no CPU is looking at the packet
  584 *	type after return.
  585 */
  586void dev_remove_pack(struct packet_type *pt)
  587{
  588	__dev_remove_pack(pt);
  589
  590	synchronize_net();
  591}
  592EXPORT_SYMBOL(dev_remove_pack);
  593
  594
  595/**
  596 *	dev_add_offload - register offload handlers
  597 *	@po: protocol offload declaration
  598 *
  599 *	Add protocol offload handlers to the networking stack. The passed
   600 *	&packet_offload is linked into kernel lists and may not be freed until
  601 *	it has been removed from the kernel lists.
  602 *
  603 *	This call does not sleep therefore it can not
  604 *	guarantee all CPU's that are in middle of receiving packets
  605 *	will see the new offload handlers (until the next received packet).
  606 */
  607void dev_add_offload(struct packet_offload *po)
  608{
  609	struct packet_offload *elem;
  610
  611	spin_lock(&offload_lock);
  612	list_for_each_entry(elem, &offload_base, list) {
  613		if (po->priority < elem->priority)
  614			break;
  615	}
  616	list_add_rcu(&po->list, elem->list.prev);
  617	spin_unlock(&offload_lock);
  618}
  619EXPORT_SYMBOL(dev_add_offload);
  620
  621/**
  622 *	__dev_remove_offload	 - remove offload handler
  623 *	@po: packet offload declaration
  624 *
  625 *	Remove a protocol offload handler that was previously added to the
   626 *	kernel offload handlers by dev_add_offload(). The passed &packet_offload
   627 *	is removed from the kernel lists, but this function does not wait for
   628 *	receivers that may still be walking those lists.
  629 *
  630 *      The packet type might still be in use by receivers
  631 *	and must not be freed until after all the CPU's have gone
  632 *	through a quiescent state.
  633 */
  634static void __dev_remove_offload(struct packet_offload *po)
  635{
  636	struct list_head *head = &offload_base;
  637	struct packet_offload *po1;
  638
  639	spin_lock(&offload_lock);
  640
  641	list_for_each_entry(po1, head, list) {
  642		if (po == po1) {
  643			list_del_rcu(&po->list);
  644			goto out;
  645		}
  646	}
  647
  648	pr_warn("dev_remove_offload: %p not found\n", po);
  649out:
  650	spin_unlock(&offload_lock);
  651}
  652
  653/**
  654 *	dev_remove_offload	 - remove packet offload handler
  655 *	@po: packet offload declaration
  656 *
  657 *	Remove a packet offload handler that was previously added to the kernel
   658 *	offload handlers by dev_add_offload(). The passed &packet_offload is
  659 *	removed from the kernel lists and can be freed or reused once this
  660 *	function returns.
  661 *
  662 *	This call sleeps to guarantee that no CPU is looking at the packet
  663 *	type after return.
  664 */
  665void dev_remove_offload(struct packet_offload *po)
  666{
  667	__dev_remove_offload(po);
  668
  669	synchronize_net();
  670}
  671EXPORT_SYMBOL(dev_remove_offload);
  672
  673/******************************************************************************
  674 *
  675 *		      Device Boot-time Settings Routines
  676 *
  677 ******************************************************************************/
  678
  679/* Boot time configuration table */
  680static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
  681
  682/**
  683 *	netdev_boot_setup_add	- add new setup entry
  684 *	@name: name of the device
  685 *	@map: configured settings for the device
  686 *
  687 *	Adds new setup entry to the dev_boot_setup list.  The function
  688 *	returns 0 on error and 1 on success.  This is a generic routine to
  689 *	all netdevices.
  690 */
  691static int netdev_boot_setup_add(char *name, struct ifmap *map)
  692{
  693	struct netdev_boot_setup *s;
  694	int i;
  695
  696	s = dev_boot_setup;
  697	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  698		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
  699			memset(s[i].name, 0, sizeof(s[i].name));
  700			strlcpy(s[i].name, name, IFNAMSIZ);
  701			memcpy(&s[i].map, map, sizeof(s[i].map));
  702			break;
  703		}
  704	}
  705
  706	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
  707}
  708
  709/**
  710 * netdev_boot_setup_check	- check boot time settings
  711 * @dev: the netdevice
  712 *
  713 * Check boot time settings for the device.
  714 * The found settings are set for the device to be used
  715 * later in the device probing.
  716 * Returns 0 if no settings found, 1 if they are.
  717 */
  718int netdev_boot_setup_check(struct net_device *dev)
  719{
  720	struct netdev_boot_setup *s = dev_boot_setup;
  721	int i;
  722
  723	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  724		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
  725		    !strcmp(dev->name, s[i].name)) {
  726			dev->irq = s[i].map.irq;
  727			dev->base_addr = s[i].map.base_addr;
  728			dev->mem_start = s[i].map.mem_start;
  729			dev->mem_end = s[i].map.mem_end;
  730			return 1;
  731		}
  732	}
  733	return 0;
  734}
  735EXPORT_SYMBOL(netdev_boot_setup_check);
  736
  737
  738/**
  739 * netdev_boot_base	- get address from boot time settings
  740 * @prefix: prefix for network device
  741 * @unit: id for network device
  742 *
  743 * Check boot time settings for the base address of device.
  744 * The found settings are set for the device to be used
  745 * later in the device probing.
  746 * Returns 0 if no settings found.
  747 */
  748unsigned long netdev_boot_base(const char *prefix, int unit)
  749{
  750	const struct netdev_boot_setup *s = dev_boot_setup;
  751	char name[IFNAMSIZ];
  752	int i;
  753
  754	sprintf(name, "%s%d", prefix, unit);
  755
  756	/*
  757	 * If device already registered then return base of 1
  758	 * to indicate not to probe for this interface
  759	 */
  760	if (__dev_get_by_name(&init_net, name))
  761		return 1;
  762
  763	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
  764		if (!strcmp(name, s[i].name))
  765			return s[i].map.base_addr;
  766	return 0;
  767}
  768
  769/*
  770 * Saves at boot time configured settings for any netdevice.
  771 */
  772int __init netdev_boot_setup(char *str)
  773{
  774	int ints[5];
  775	struct ifmap map;
  776
  777	str = get_options(str, ARRAY_SIZE(ints), ints);
  778	if (!str || !*str)
  779		return 0;
  780
  781	/* Save settings */
  782	memset(&map, 0, sizeof(map));
  783	if (ints[0] > 0)
  784		map.irq = ints[1];
  785	if (ints[0] > 1)
  786		map.base_addr = ints[2];
  787	if (ints[0] > 2)
  788		map.mem_start = ints[3];
  789	if (ints[0] > 3)
  790		map.mem_end = ints[4];
  791
  792	/* Add new entry to the list */
  793	return netdev_boot_setup_add(str, &map);
  794}
  795
  796__setup("netdev=", netdev_boot_setup);
  797
  798/*******************************************************************************
  799 *
  800 *			    Device Interface Subroutines
  801 *
  802 *******************************************************************************/
  803
  804/**
   805 *	dev_get_iflink	- get 'iflink' value of an interface
  806 *	@dev: targeted interface
  807 *
  808 *	Indicates the ifindex the interface is linked to.
  809 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
  810 */
  811
  812int dev_get_iflink(const struct net_device *dev)
  813{
  814	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  815		return dev->netdev_ops->ndo_get_iflink(dev);
  816
  817	return dev->ifindex;
  818}
  819EXPORT_SYMBOL(dev_get_iflink);
  820
  821/**
  822 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  823 *	@dev: targeted interface
  824 *	@skb: The packet.
  825 *
  826 *	For better visibility of tunnel traffic OVS needs to retrieve
  827 *	egress tunnel information for a packet. Following API allows
  828 *	user to get this info.
  829 */
  830int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  831{
  832	struct ip_tunnel_info *info;
  833
  834	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  835		return -EINVAL;
  836
  837	info = skb_tunnel_info_unclone(skb);
  838	if (!info)
  839		return -ENOMEM;
  840	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  841		return -EINVAL;
  842
  843	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  844}
  845EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  846
  847/**
  848 *	__dev_get_by_name	- find a device by its name
  849 *	@net: the applicable net namespace
  850 *	@name: name to find
  851 *
  852 *	Find an interface by name. Must be called under RTNL semaphore
  853 *	or @dev_base_lock. If the name is found a pointer to the device
  854 *	is returned. If the name is not found then %NULL is returned. The
  855 *	reference counters are not incremented so the caller must be
  856 *	careful with locks.
  857 */
  858
  859struct net_device *__dev_get_by_name(struct net *net, const char *name)
  860{
  861	struct netdev_name_node *node_name;
  862
  863	node_name = netdev_name_node_lookup(net, name);
  864	return node_name ? node_name->dev : NULL;
  865}
  866EXPORT_SYMBOL(__dev_get_by_name);
  867
  868/**
  869 * dev_get_by_name_rcu	- find a device by its name
  870 * @net: the applicable net namespace
  871 * @name: name to find
  872 *
  873 * Find an interface by name.
  874 * If the name is found a pointer to the device is returned.
  875 * If the name is not found then %NULL is returned.
  876 * The reference counters are not incremented so the caller must be
  877 * careful with locks. The caller must hold RCU lock.
  878 */
  879
  880struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  881{
  882	struct netdev_name_node *node_name;
  883
  884	node_name = netdev_name_node_lookup_rcu(net, name);
  885	return node_name ? node_name->dev : NULL;
  886}
  887EXPORT_SYMBOL(dev_get_by_name_rcu);
  888
  889/**
  890 *	dev_get_by_name		- find a device by its name
  891 *	@net: the applicable net namespace
  892 *	@name: name to find
  893 *
  894 *	Find an interface by name. This can be called from any
  895 *	context and does its own locking. The returned handle has
  896 *	the usage count incremented and the caller must use dev_put() to
  897 *	release it when it is no longer needed. %NULL is returned if no
  898 *	matching device is found.
  899 */
  900
  901struct net_device *dev_get_by_name(struct net *net, const char *name)
  902{
  903	struct net_device *dev;
  904
  905	rcu_read_lock();
  906	dev = dev_get_by_name_rcu(net, name);
  907	if (dev)
  908		dev_hold(dev);
  909	rcu_read_unlock();
  910	return dev;
  911}
  912EXPORT_SYMBOL(dev_get_by_name);
  913
  914/**
  915 *	__dev_get_by_index - find a device by its ifindex
  916 *	@net: the applicable net namespace
  917 *	@ifindex: index of device
  918 *
  919 *	Search for an interface by index. Returns %NULL if the device
  920 *	is not found or a pointer to the device. The device has not
  921 *	had its reference counter increased so the caller must be careful
  922 *	about locking. The caller must hold either the RTNL semaphore
  923 *	or @dev_base_lock.
  924 */
  925
  926struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  927{
  928	struct net_device *dev;
  929	struct hlist_head *head = dev_index_hash(net, ifindex);
  930
  931	hlist_for_each_entry(dev, head, index_hlist)
  932		if (dev->ifindex == ifindex)
  933			return dev;
  934
  935	return NULL;
  936}
  937EXPORT_SYMBOL(__dev_get_by_index);
  938
  939/**
  940 *	dev_get_by_index_rcu - find a device by its ifindex
  941 *	@net: the applicable net namespace
  942 *	@ifindex: index of device
  943 *
  944 *	Search for an interface by index. Returns %NULL if the device
  945 *	is not found or a pointer to the device. The device has not
  946 *	had its reference counter increased so the caller must be careful
  947 *	about locking. The caller must hold RCU lock.
  948 */
  949
  950struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  951{
  952	struct net_device *dev;
  953	struct hlist_head *head = dev_index_hash(net, ifindex);
  954
  955	hlist_for_each_entry_rcu(dev, head, index_hlist)
  956		if (dev->ifindex == ifindex)
  957			return dev;
  958
  959	return NULL;
  960}
  961EXPORT_SYMBOL(dev_get_by_index_rcu);
  962
  963
  964/**
  965 *	dev_get_by_index - find a device by its ifindex
  966 *	@net: the applicable net namespace
  967 *	@ifindex: index of device
  968 *
  969 *	Search for an interface by index. Returns NULL if the device
  970 *	is not found or a pointer to the device. The device returned has
  971 *	had a reference added and the pointer is safe until the user calls
  972 *	dev_put to indicate they have finished with it.
  973 */
  974
  975struct net_device *dev_get_by_index(struct net *net, int ifindex)
  976{
  977	struct net_device *dev;
  978
  979	rcu_read_lock();
  980	dev = dev_get_by_index_rcu(net, ifindex);
  981	if (dev)
  982		dev_hold(dev);
  983	rcu_read_unlock();
  984	return dev;
  985}
  986EXPORT_SYMBOL(dev_get_by_index);
  987
  988/**
  989 *	dev_get_by_napi_id - find a device by napi_id
  990 *	@napi_id: ID of the NAPI struct
  991 *
  992 *	Search for an interface by NAPI ID. Returns %NULL if the device
  993 *	is not found or a pointer to the device. The device has not had
  994 *	its reference counter increased so the caller must be careful
  995 *	about locking. The caller must hold RCU lock.
  996 */
  997
  998struct net_device *dev_get_by_napi_id(unsigned int napi_id)
  999{
 1000	struct napi_struct *napi;
 1001
 1002	WARN_ON_ONCE(!rcu_read_lock_held());
 1003
 1004	if (napi_id < MIN_NAPI_ID)
 1005		return NULL;
 1006
 1007	napi = napi_by_id(napi_id);
 1008
 1009	return napi ? napi->dev : NULL;
 1010}
 1011EXPORT_SYMBOL(dev_get_by_napi_id);
 1012
 1013/**
 1014 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 1015 *	@net: network namespace
 1016 *	@name: a pointer to the buffer where the name will be stored.
 1017 *	@ifindex: the ifindex of the interface to get the name from.
 1018 */
 1019int netdev_get_name(struct net *net, char *name, int ifindex)
 1020{
 1021	struct net_device *dev;
 1022	int ret;
 1023
 1024	down_read(&devnet_rename_sem);
 1025	rcu_read_lock();
 1026
 1027	dev = dev_get_by_index_rcu(net, ifindex);
 1028	if (!dev) {
 1029		ret = -ENODEV;
 1030		goto out;
 1031	}
 1032
 1033	strcpy(name, dev->name);
 1034
 1035	ret = 0;
 1036out:
 1037	rcu_read_unlock();
 1038	up_read(&devnet_rename_sem);
 1039	return ret;
 1040}
 1041
 1042/**
 1043 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 1044 *	@net: the applicable net namespace
 1045 *	@type: media type of device
 1046 *	@ha: hardware address
 1047 *
 1048 *	Search for an interface by MAC address. Returns NULL if the device
 1049 *	is not found or a pointer to the device.
 1050 *	The caller must hold RCU or RTNL.
 1051 *	The returned device has not had its ref count increased
 1052 *	and the caller must therefore be careful about locking
 1053 *
 1054 */
 1055
 1056struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 1057				       const char *ha)
 1058{
 1059	struct net_device *dev;
 1060
 1061	for_each_netdev_rcu(net, dev)
 1062		if (dev->type == type &&
 1063		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 1064			return dev;
 1065
 1066	return NULL;
 1067}
 1068EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 1069
 1070struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1071{
 1072	struct net_device *dev;
 1073
 1074	ASSERT_RTNL();
 1075	for_each_netdev(net, dev)
 1076		if (dev->type == type)
 1077			return dev;
 1078
 1079	return NULL;
 1080}
 1081EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 1082
 1083struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1084{
 1085	struct net_device *dev, *ret = NULL;
 1086
 1087	rcu_read_lock();
 1088	for_each_netdev_rcu(net, dev)
 1089		if (dev->type == type) {
 1090			dev_hold(dev);
 1091			ret = dev;
 1092			break;
 1093		}
 1094	rcu_read_unlock();
 1095	return ret;
 1096}
 1097EXPORT_SYMBOL(dev_getfirstbyhwtype);
 1098
 1099/**
 1100 *	__dev_get_by_flags - find any device with given flags
 1101 *	@net: the applicable net namespace
 1102 *	@if_flags: IFF_* values
 1103 *	@mask: bitmask of bits in if_flags to check
 1104 *
 1105 *	Search for any interface with the given flags. Returns NULL if a device
 1106 *	is not found or a pointer to the device. Must be called inside
 1107 *	rtnl_lock(), and result refcount is unchanged.
 1108 */
 1109
 1110struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 1111				      unsigned short mask)
 1112{
 1113	struct net_device *dev, *ret;
 1114
 1115	ASSERT_RTNL();
 1116
 1117	ret = NULL;
 1118	for_each_netdev(net, dev) {
 1119		if (((dev->flags ^ if_flags) & mask) == 0) {
 1120			ret = dev;
 1121			break;
 1122		}
 1123	}
 1124	return ret;
 1125}
 1126EXPORT_SYMBOL(__dev_get_by_flags);
 1127
 1128/**
 1129 *	dev_valid_name - check if name is okay for network device
 1130 *	@name: name string
 1131 *
  1132 *	Network device names need to be valid file names
  1133 *	to allow sysfs to work.  We also disallow any kind of
 1134 *	whitespace.
 1135 */
 1136bool dev_valid_name(const char *name)
 1137{
 1138	if (*name == '\0')
 1139		return false;
 1140	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 1141		return false;
 1142	if (!strcmp(name, ".") || !strcmp(name, ".."))
 1143		return false;
 1144
 1145	while (*name) {
 1146		if (*name == '/' || *name == ':' || isspace(*name))
 1147			return false;
 1148		name++;
 1149	}
 1150	return true;
 1151}
 1152EXPORT_SYMBOL(dev_valid_name);
 1153
 1154/**
 1155 *	__dev_alloc_name - allocate a name for a device
 1156 *	@net: network namespace to allocate the device name in
 1157 *	@name: name format string
 1158 *	@buf:  scratch buffer and result name string
 1159 *
 1160 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1161 *	id. It scans list of devices to build up a free map, then chooses
 1162 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1163 *	while allocating the name and adding the device in order to avoid
 1164 *	duplicates.
 1165 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1166 *	Returns the number of the unit assigned or a negative errno code.
 1167 */
 1168
 1169static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 1170{
 1171	int i = 0;
 1172	const char *p;
 1173	const int max_netdevices = 8*PAGE_SIZE;
 1174	unsigned long *inuse;
 1175	struct net_device *d;
 1176
 1177	if (!dev_valid_name(name))
 1178		return -EINVAL;
 1179
 1180	p = strchr(name, '%');
 1181	if (p) {
 1182		/*
 1183		 * Verify the string as this thing may have come from
 1184		 * the user.  There must be either one "%d" and no other "%"
 1185		 * characters.
 1186		 */
 1187		if (p[1] != 'd' || strchr(p + 2, '%'))
 1188			return -EINVAL;
 1189
 1190		/* Use one page as a bit array of possible slots */
 1191		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 1192		if (!inuse)
 1193			return -ENOMEM;
 1194
 1195		for_each_netdev(net, d) {
 1196			if (!sscanf(d->name, name, &i))
 1197				continue;
 1198			if (i < 0 || i >= max_netdevices)
 1199				continue;
 1200
 1201			/*  avoid cases where sscanf is not exact inverse of printf */
 1202			snprintf(buf, IFNAMSIZ, name, i);
 1203			if (!strncmp(buf, d->name, IFNAMSIZ))
 1204				set_bit(i, inuse);
 1205		}
 1206
 1207		i = find_first_zero_bit(inuse, max_netdevices);
 1208		free_page((unsigned long) inuse);
 1209	}
 1210
 1211	snprintf(buf, IFNAMSIZ, name, i);
 1212	if (!__dev_get_by_name(net, buf))
 1213		return i;
 1214
 1215	/* It is possible to run out of possible slots
 1216	 * when the name is long and there isn't enough space left
 1217	 * for the digits, or if all bits are used.
 1218	 */
 1219	return -ENFILE;
 1220}
 1221
 1222static int dev_alloc_name_ns(struct net *net,
 1223			     struct net_device *dev,
 1224			     const char *name)
 1225{
 1226	char buf[IFNAMSIZ];
 1227	int ret;
 1228
 1229	BUG_ON(!net);
 1230	ret = __dev_alloc_name(net, name, buf);
 1231	if (ret >= 0)
 1232		strlcpy(dev->name, buf, IFNAMSIZ);
 1233	return ret;
 1234}
 1235
 1236/**
 1237 *	dev_alloc_name - allocate a name for a device
 1238 *	@dev: device
 1239 *	@name: name format string
 1240 *
 1241 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1242 *	id. It scans list of devices to build up a free map, then chooses
 1243 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1244 *	while allocating the name and adding the device in order to avoid
 1245 *	duplicates.
 1246 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1247 *	Returns the number of the unit assigned or a negative errno code.
 1248 */
 1249
 1250int dev_alloc_name(struct net_device *dev, const char *name)
 1251{
 1252	return dev_alloc_name_ns(dev_net(dev), dev, name);
 1253}
 1254EXPORT_SYMBOL(dev_alloc_name);
 1255
 1256static int dev_get_valid_name(struct net *net, struct net_device *dev,
 1257			      const char *name)
 1258{
 1259	BUG_ON(!net);
 1260
 1261	if (!dev_valid_name(name))
 1262		return -EINVAL;
 1263
 1264	if (strchr(name, '%'))
 1265		return dev_alloc_name_ns(net, dev, name);
 1266	else if (__dev_get_by_name(net, name))
 1267		return -EEXIST;
 1268	else if (dev->name != name)
 1269		strlcpy(dev->name, name, IFNAMSIZ);
 1270
 1271	return 0;
 1272}
 1273
 1274/**
 1275 *	dev_change_name - change name of a device
 1276 *	@dev: device
  1277 *	@newname: new name (or format string); buffer must be at least IFNAMSIZ bytes
 1278 *
 1279 *	Change name of a device, can pass format strings "eth%d".
 1280 *	for wildcarding.
 1281 */
 1282int dev_change_name(struct net_device *dev, const char *newname)
 1283{
 1284	unsigned char old_assign_type;
 1285	char oldname[IFNAMSIZ];
 1286	int err = 0;
 1287	int ret;
 1288	struct net *net;
 1289
 1290	ASSERT_RTNL();
 1291	BUG_ON(!dev_net(dev));
 1292
 1293	net = dev_net(dev);
 1294
 1295	/* Some auto-enslaved devices e.g. failover slaves are
 1296	 * special, as userspace might rename the device after
 1297	 * the interface had been brought up and running since
 1298	 * the point kernel initiated auto-enslavement. Allow
 1299	 * live name change even when these slave devices are
 1300	 * up and running.
 1301	 *
 1302	 * Typically, users of these auto-enslaving devices
 1303	 * don't actually care about slave name change, as
 1304	 * they are supposed to operate on master interface
 1305	 * directly.
 1306	 */
 1307	if (dev->flags & IFF_UP &&
 1308	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
 1309		return -EBUSY;
 1310
 1311	down_write(&devnet_rename_sem);
 1312
 1313	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
 1314		up_write(&devnet_rename_sem);
 1315		return 0;
 1316	}
 1317
 1318	memcpy(oldname, dev->name, IFNAMSIZ);
 1319
 1320	err = dev_get_valid_name(net, dev, newname);
 1321	if (err < 0) {
 1322		up_write(&devnet_rename_sem);
 1323		return err;
 1324	}
 1325
 1326	if (oldname[0] && !strchr(oldname, '%'))
 1327		netdev_info(dev, "renamed from %s\n", oldname);
 1328
 1329	old_assign_type = dev->name_assign_type;
 1330	dev->name_assign_type = NET_NAME_RENAMED;
 1331
 1332rollback:
 1333	ret = device_rename(&dev->dev, dev->name);
 1334	if (ret) {
 1335		memcpy(dev->name, oldname, IFNAMSIZ);
 1336		dev->name_assign_type = old_assign_type;
 1337		up_write(&devnet_rename_sem);
 1338		return ret;
 1339	}
 1340
 1341	up_write(&devnet_rename_sem);
 1342
 1343	netdev_adjacent_rename_links(dev, oldname);
 1344
 1345	write_lock_bh(&dev_base_lock);
 1346	netdev_name_node_del(dev->name_node);
 1347	write_unlock_bh(&dev_base_lock);
 1348
 1349	synchronize_rcu();
 1350
 1351	write_lock_bh(&dev_base_lock);
 1352	netdev_name_node_add(net, dev->name_node);
 1353	write_unlock_bh(&dev_base_lock);
 1354
 1355	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 1356	ret = notifier_to_errno(ret);
 1357
 1358	if (ret) {
 1359		/* err >= 0 after dev_alloc_name() or stores the first errno */
 1360		if (err >= 0) {
 1361			err = ret;
 1362			down_write(&devnet_rename_sem);
 1363			memcpy(dev->name, oldname, IFNAMSIZ);
 1364			memcpy(oldname, newname, IFNAMSIZ);
 1365			dev->name_assign_type = old_assign_type;
 1366			old_assign_type = NET_NAME_RENAMED;
 1367			goto rollback;
 1368		} else {
 1369			pr_err("%s: name change rollback failed: %d\n",
 1370			       dev->name, ret);
 1371		}
 1372	}
 1373
 1374	return err;
 1375}
 1376
 1377/**
 1378 *	dev_set_alias - change ifalias of a device
 1379 *	@dev: device
 1380 *	@alias: name up to IFALIASZ
 1381 *	@len: limit of bytes to copy from info
 1382 *
 1383 *	Set ifalias for a device,
 1384 */
 1385int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1386{
 1387	struct dev_ifalias *new_alias = NULL;
 1388
 1389	if (len >= IFALIASZ)
 1390		return -EINVAL;
 1391
 1392	if (len) {
 1393		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1394		if (!new_alias)
 1395			return -ENOMEM;
 1396
 1397		memcpy(new_alias->ifalias, alias, len);
 1398		new_alias->ifalias[len] = 0;
 1399	}
 1400
 1401	mutex_lock(&ifalias_mutex);
 1402	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
 1403					mutex_is_locked(&ifalias_mutex));
 1404	mutex_unlock(&ifalias_mutex);
 1405
 1406	if (new_alias)
 1407		kfree_rcu(new_alias, rcuhead);
 1408
 1409	return len;
 1410}
 1411EXPORT_SYMBOL(dev_set_alias);
 1412
 1413/**
 1414 *	dev_get_alias - get ifalias of a device
 1415 *	@dev: device
 1416 *	@name: buffer to store name of ifalias
 1417 *	@len: size of buffer
 1418 *
 1419 *	get ifalias for a device.  Caller must make sure dev cannot go
 1420 *	away,  e.g. rcu read lock or own a reference count to device.
 1421 */
 1422int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1423{
 1424	const struct dev_ifalias *alias;
 1425	int ret = 0;
 1426
 1427	rcu_read_lock();
 1428	alias = rcu_dereference(dev->ifalias);
 1429	if (alias)
 1430		ret = snprintf(name, len, "%s", alias->ifalias);
 1431	rcu_read_unlock();
 1432
 1433	return ret;
 1434}
 1435
 1436/**
 1437 *	netdev_features_change - device changes features
 1438 *	@dev: device to cause notification
 1439 *
 1440 *	Called to indicate a device has changed features.
 1441 */
 1442void netdev_features_change(struct net_device *dev)
 1443{
 1444	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 1445}
 1446EXPORT_SYMBOL(netdev_features_change);
 1447
 1448/**
 1449 *	netdev_state_change - device changes state
 1450 *	@dev: device to cause notification
 1451 *
 1452 *	Called to indicate a device has changed state. This function calls
 1453 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1454 *	to the routing socket.
 1455 */
 1456void netdev_state_change(struct net_device *dev)
 1457{
 1458	if (dev->flags & IFF_UP) {
 1459		struct netdev_notifier_change_info change_info = {
 1460			.info.dev = dev,
 1461		};
 1462
 1463		call_netdevice_notifiers_info(NETDEV_CHANGE,
 1464					      &change_info.info);
 1465		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
 1466	}
 1467}
 1468EXPORT_SYMBOL(netdev_state_change);
 1469
 1470/**
 1471 * netdev_notify_peers - notify network peers about existence of @dev
 1472 * @dev: network device
 1473 *
 1474 * Generate traffic such that interested network peers are aware of
 1475 * @dev, such as by generating a gratuitous ARP. This may be used when
 1476 * a device wants to inform the rest of the network about some sort of
 1477 * reconfiguration such as a failover event or virtual machine
 1478 * migration.
 1479 */
 1480void netdev_notify_peers(struct net_device *dev)
 1481{
 1482	rtnl_lock();
 1483	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 1484	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
 1485	rtnl_unlock();
 1486}
 1487EXPORT_SYMBOL(netdev_notify_peers);
 1488
 1489static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1490{
 1491	const struct net_device_ops *ops = dev->netdev_ops;
 1492	int ret;
 1493
 1494	ASSERT_RTNL();
 1495
 1496	if (!netif_device_present(dev)) {
 1497		/* may be detached because parent is runtime-suspended */
 1498		if (dev->dev.parent)
 1499			pm_runtime_resume(dev->dev.parent);
 1500		if (!netif_device_present(dev))
 1501			return -ENODEV;
 1502	}
 1503
 1504	/* Block netpoll from trying to do any rx path servicing.
 1505	 * If we don't do this there is a chance ndo_poll_controller
 1506	 * or ndo_poll may be running while we open the device
 1507	 */
 1508	netpoll_poll_disable(dev);
 1509
 1510	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1511	ret = notifier_to_errno(ret);
 1512	if (ret)
 1513		return ret;
 1514
 1515	set_bit(__LINK_STATE_START, &dev->state);
 1516
 1517	if (ops->ndo_validate_addr)
 1518		ret = ops->ndo_validate_addr(dev);
 1519
 1520	if (!ret && ops->ndo_open)
 1521		ret = ops->ndo_open(dev);
 1522
 1523	netpoll_poll_enable(dev);
 1524
 1525	if (ret)
 1526		clear_bit(__LINK_STATE_START, &dev->state);
 1527	else {
 1528		dev->flags |= IFF_UP;
 1529		dev_set_rx_mode(dev);
 1530		dev_activate(dev);
 1531		add_device_randomness(dev->dev_addr, dev->addr_len);
 1532	}
 1533
 1534	return ret;
 1535}
 1536
 1537/**
 1538 *	dev_open	- prepare an interface for use.
 1539 *	@dev: device to open
 1540 *	@extack: netlink extended ack
 1541 *
 1542 *	Takes a device from down to up state. The device's private open
 1543 *	function is invoked and then the multicast lists are loaded. Finally
 1544 *	the device is moved into the up state and a %NETDEV_UP message is
 1545 *	sent to the netdev notifier chain.
 1546 *
 1547 *	Calling this function on an active interface is a nop. On a failure
 1548 *	a negative errno code is returned.
 1549 */
 1550int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1551{
 1552	int ret;
 1553
 1554	if (dev->flags & IFF_UP)
 1555		return 0;
 1556
 1557	ret = __dev_open(dev, extack);
 1558	if (ret < 0)
 1559		return ret;
 1560
 1561	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1562	call_netdevice_notifiers(NETDEV_UP, dev);
 1563
 1564	return ret;
 1565}
 1566EXPORT_SYMBOL(dev_open);
 1567
 1568static void __dev_close_many(struct list_head *head)
 1569{
 1570	struct net_device *dev;
 1571
 1572	ASSERT_RTNL();
 1573	might_sleep();
 1574
 1575	list_for_each_entry(dev, head, close_list) {
 1576		/* Temporarily disable netpoll until the interface is down */
 1577		netpoll_poll_disable(dev);
 1578
 1579		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
 1580
 1581		clear_bit(__LINK_STATE_START, &dev->state);
 1582
 1583		/* Synchronize to scheduled poll. We cannot touch poll list, it
 1584		 * can be even on different cpu. So just clear netif_running().
 1585		 *
 1586		 * dev->stop() will invoke napi_disable() on all of it's
 1587		 * napi_struct instances on this device.
 1588		 */
 1589		smp_mb__after_atomic(); /* Commit netif_running(). */
 1590	}
 1591
 1592	dev_deactivate_many(head);
 1593
 1594	list_for_each_entry(dev, head, close_list) {
 1595		const struct net_device_ops *ops = dev->netdev_ops;
 1596
 1597		/*
 1598		 *	Call the device specific close. This cannot fail.
 1599		 *	Only if device is UP
 1600		 *
 1601		 *	We allow it to be called even after a DETACH hot-plug
 1602		 *	event.
 1603		 */
 1604		if (ops->ndo_stop)
 1605			ops->ndo_stop(dev);
 1606
 1607		dev->flags &= ~IFF_UP;
 1608		netpoll_poll_enable(dev);
 1609	}
 1610}
 1611
 1612static void __dev_close(struct net_device *dev)
 1613{
 1614	LIST_HEAD(single);
 1615
 1616	list_add(&dev->close_list, &single);
 1617	__dev_close_many(&single);
 1618	list_del(&single);
 1619}
 1620
 1621void dev_close_many(struct list_head *head, bool unlink)
 1622{
 1623	struct net_device *dev, *tmp;
 1624
 1625	/* Remove the devices that don't need to be closed */
 1626	list_for_each_entry_safe(dev, tmp, head, close_list)
 1627		if (!(dev->flags & IFF_UP))
 1628			list_del_init(&dev->close_list);
 1629
 1630	__dev_close_many(head);
 1631
 1632	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1633		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1634		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1635		if (unlink)
 1636			list_del_init(&dev->close_list);
 1637	}
 1638}
 1639EXPORT_SYMBOL(dev_close_many);
 1640
 1641/**
 1642 *	dev_close - shutdown an interface.
 1643 *	@dev: device to shutdown
 1644 *
 1645 *	This function moves an active device into down state. A
 1646 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1647 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1648 *	chain.
 1649 */
 1650void dev_close(struct net_device *dev)
 1651{
 1652	if (dev->flags & IFF_UP) {
 1653		LIST_HEAD(single);
 1654
 1655		list_add(&dev->close_list, &single);
 1656		dev_close_many(&single, true);
 1657		list_del(&single);
 1658	}
 1659}
 1660EXPORT_SYMBOL(dev_close);
 1661
 1662
 1663/**
 1664 *	dev_disable_lro - disable Large Receive Offload on a device
 1665 *	@dev: device
 1666 *
 1667 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1668 *	called under RTNL.  This is needed if received packets may be
 1669 *	forwarded to another interface.
 1670 */
 1671void dev_disable_lro(struct net_device *dev)
 1672{
 1673	struct net_device *lower_dev;
 1674	struct list_head *iter;
 1675
 1676	dev->wanted_features &= ~NETIF_F_LRO;
 1677	netdev_update_features(dev);
 1678
 1679	if (unlikely(dev->features & NETIF_F_LRO))
 1680		netdev_WARN(dev, "failed to disable LRO!\n");
 1681
 1682	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1683		dev_disable_lro(lower_dev);
 1684}
 1685EXPORT_SYMBOL(dev_disable_lro);
 1686
 1687/**
 1688 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1689 *	@dev: device
 1690 *
 1691 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1692 *	called under RTNL.  This is needed if Generic XDP is installed on
 1693 *	the device.
 1694 */
 1695static void dev_disable_gro_hw(struct net_device *dev)
 1696{
 1697	dev->wanted_features &= ~NETIF_F_GRO_HW;
 1698	netdev_update_features(dev);
 1699
 1700	if (unlikely(dev->features & NETIF_F_GRO_HW))
 1701		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 1702}
 1703
 1704const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 1705{
 1706#define N(val) 						\
 1707	case NETDEV_##val:				\
 1708		return "NETDEV_" __stringify(val);
 1709	switch (cmd) {
 1710	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 1711	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 1712	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
 1713	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
 1714	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
 1715	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
 1716	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 1717	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 1718	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 1719	N(PRE_CHANGEADDR)
 1720	}
 1721#undef N
 1722	return "UNKNOWN_NETDEV_EVENT";
 1723}
 1724EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1725
 1726static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 1727				   struct net_device *dev)
 1728{
 1729	struct netdev_notifier_info info = {
 1730		.dev = dev,
 1731	};
 1732
 1733	return nb->notifier_call(nb, val, &info);
 1734}
 1735
 1736static int call_netdevice_register_notifiers(struct notifier_block *nb,
 1737					     struct net_device *dev)
 1738{
 1739	int err;
 1740
 1741	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1742	err = notifier_to_errno(err);
 1743	if (err)
 1744		return err;
 1745
 1746	if (!(dev->flags & IFF_UP))
 1747		return 0;
 1748
 1749	call_netdevice_notifier(nb, NETDEV_UP, dev);
 1750	return 0;
 1751}
 1752
 1753static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
 1754						struct net_device *dev)
 1755{
 1756	if (dev->flags & IFF_UP) {
 1757		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1758					dev);
 1759		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1760	}
 1761	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1762}
 1763
 1764static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
 1765						 struct net *net)
 1766{
 1767	struct net_device *dev;
 1768	int err;
 1769
 1770	for_each_netdev(net, dev) {
 1771		err = call_netdevice_register_notifiers(nb, dev);
 1772		if (err)
 1773			goto rollback;
 1774	}
 1775	return 0;
 1776
 1777rollback:
 1778	for_each_netdev_continue_reverse(net, dev)
 1779		call_netdevice_unregister_notifiers(nb, dev);
 1780	return err;
 1781}
 1782
 1783static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
 1784						    struct net *net)
 1785{
 1786	struct net_device *dev;
 1787
 1788	for_each_netdev(net, dev)
 1789		call_netdevice_unregister_notifiers(nb, dev);
 1790}
 1791
/*
 * Starts out as 1. While it is non-zero, the notifier registration
 * helpers in this file only link the block into its chain and skip the
 * replay of register/up events.
 * NOTE(review): the place where this is cleared is not visible in this
 * part of the file - presumably at the end of net device init; confirm.
 */
static int dev_boot_phase = 1;
 1793
 1794/**
 1795 * register_netdevice_notifier - register a network notifier block
 1796 * @nb: notifier
 1797 *
 1798 * Register a notifier to be called when network device events occur.
 1799 * The notifier passed is linked into the kernel structures and must
 1800 * not be reused until it has been unregistered. A negative errno code
 1801 * is returned on a failure.
 1802 *
 1803 * When registered all registration and up events are replayed
 1804 * to the new notifier to allow device to have a race free
 1805 * view of the network device list.
 1806 */
 1807
 1808int register_netdevice_notifier(struct notifier_block *nb)
 1809{
 1810	struct net *net;
 1811	int err;
 1812
 1813	/* Close race with setup_net() and cleanup_net() */
 1814	down_write(&pernet_ops_rwsem);
 1815	rtnl_lock();
 1816	err = raw_notifier_chain_register(&netdev_chain, nb);
 1817	if (err)
 1818		goto unlock;
 1819	if (dev_boot_phase)
 1820		goto unlock;
 1821	for_each_net(net) {
 1822		err = call_netdevice_register_net_notifiers(nb, net);
 1823		if (err)
 1824			goto rollback;
 1825	}
 1826
 1827unlock:
 1828	rtnl_unlock();
 1829	up_write(&pernet_ops_rwsem);
 1830	return err;
 1831
 1832rollback:
 1833	for_each_net_continue_reverse(net)
 1834		call_netdevice_unregister_net_notifiers(nb, net);
 1835
 1836	raw_notifier_chain_unregister(&netdev_chain, nb);
 1837	goto unlock;
 1838}
 1839EXPORT_SYMBOL(register_netdevice_notifier);
 1840
 1841/**
 1842 * unregister_netdevice_notifier - unregister a network notifier block
 1843 * @nb: notifier
 1844 *
 1845 * Unregister a notifier previously registered by
 1846 * register_netdevice_notifier(). The notifier is unlinked into the
 1847 * kernel structures and may then be reused. A negative errno code
 1848 * is returned on a failure.
 1849 *
 1850 * After unregistering unregister and down device events are synthesized
 1851 * for all devices on the device list to the removed notifier to remove
 1852 * the need for special case cleanup code.
 1853 */
 1854
 1855int unregister_netdevice_notifier(struct notifier_block *nb)
 1856{
 1857	struct net *net;
 1858	int err;
 1859
 1860	/* Close race with setup_net() and cleanup_net() */
 1861	down_write(&pernet_ops_rwsem);
 1862	rtnl_lock();
 1863	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1864	if (err)
 1865		goto unlock;
 1866
 1867	for_each_net(net)
 1868		call_netdevice_unregister_net_notifiers(nb, net);
 1869
 1870unlock:
 1871	rtnl_unlock();
 1872	up_write(&pernet_ops_rwsem);
 1873	return err;
 1874}
 1875EXPORT_SYMBOL(unregister_netdevice_notifier);
 1876
 1877static int __register_netdevice_notifier_net(struct net *net,
 1878					     struct notifier_block *nb,
 1879					     bool ignore_call_fail)
 1880{
 1881	int err;
 1882
 1883	err = raw_notifier_chain_register(&net->netdev_chain, nb);
 1884	if (err)
 1885		return err;
 1886	if (dev_boot_phase)
 1887		return 0;
 1888
 1889	err = call_netdevice_register_net_notifiers(nb, net);
 1890	if (err && !ignore_call_fail)
 1891		goto chain_unregister;
 1892
 1893	return 0;
 1894
 1895chain_unregister:
 1896	raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1897	return err;
 1898}
 1899
 1900static int __unregister_netdevice_notifier_net(struct net *net,
 1901					       struct notifier_block *nb)
 1902{
 1903	int err;
 1904
 1905	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1906	if (err)
 1907		return err;
 1908
 1909	call_netdevice_unregister_net_notifiers(nb, net);
 1910	return 0;
 1911}
 1912
 1913/**
 1914 * register_netdevice_notifier_net - register a per-netns network notifier block
 1915 * @net: network namespace
 1916 * @nb: notifier
 1917 *
 1918 * Register a notifier to be called when network device events occur.
 1919 * The notifier passed is linked into the kernel structures and must
 1920 * not be reused until it has been unregistered. A negative errno code
 1921 * is returned on a failure.
 1922 *
 1923 * When registered all registration and up events are replayed
 1924 * to the new notifier to allow device to have a race free
 1925 * view of the network device list.
 1926 */
 1927
 1928int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
 1929{
 1930	int err;
 1931
 1932	rtnl_lock();
 1933	err = __register_netdevice_notifier_net(net, nb, false);
 1934	rtnl_unlock();
 1935	return err;
 1936}
 1937EXPORT_SYMBOL(register_netdevice_notifier_net);
 1938
 1939/**
 1940 * unregister_netdevice_notifier_net - unregister a per-netns
 1941 *                                     network notifier block
 1942 * @net: network namespace
 1943 * @nb: notifier
 1944 *
 1945 * Unregister a notifier previously registered by
 1946 * register_netdevice_notifier(). The notifier is unlinked into the
 1947 * kernel structures and may then be reused. A negative errno code
 1948 * is returned on a failure.
 1949 *
 1950 * After unregistering unregister and down device events are synthesized
 1951 * for all devices on the device list to the removed notifier to remove
 1952 * the need for special case cleanup code.
 1953 */
 1954
 1955int unregister_netdevice_notifier_net(struct net *net,
 1956				      struct notifier_block *nb)
 1957{
 1958	int err;
 1959
 1960	rtnl_lock();
 1961	err = __unregister_netdevice_notifier_net(net, nb);
 1962	rtnl_unlock();
 1963	return err;
 1964}
 1965EXPORT_SYMBOL(unregister_netdevice_notifier_net);
 1966
 1967int register_netdevice_notifier_dev_net(struct net_device *dev,
 1968					struct notifier_block *nb,
 1969					struct netdev_net_notifier *nn)
 1970{
 1971	int err;
 1972
 1973	rtnl_lock();
 1974	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
 1975	if (!err) {
 1976		nn->nb = nb;
 1977		list_add(&nn->list, &dev->net_notifier_list);
 1978	}
 1979	rtnl_unlock();
 1980	return err;
 1981}
 1982EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
 1983
 1984int unregister_netdevice_notifier_dev_net(struct net_device *dev,
 1985					  struct notifier_block *nb,
 1986					  struct netdev_net_notifier *nn)
 1987{
 1988	int err;
 1989
 1990	rtnl_lock();
 1991	list_del(&nn->list);
 1992	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
 1993	rtnl_unlock();
 1994	return err;
 1995}
 1996EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
 1997
 1998static void move_netdevice_notifiers_dev_net(struct net_device *dev,
 1999					     struct net *net)
 2000{
 2001	struct netdev_net_notifier *nn;
 2002
 2003	list_for_each_entry(nn, &dev->net_notifier_list, list) {
 2004		__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
 2005		__register_netdevice_notifier_net(net, nn->nb, true);
 2006	}
 2007}
 2008
 2009/**
 2010 *	call_netdevice_notifiers_info - call all network notifier blocks
 2011 *	@val: value passed unmodified to notifier function
 2012 *	@info: notifier information data
 2013 *
 2014 *	Call all network notifier blocks.  Parameters and return value
 2015 *	are as for raw_notifier_call_chain().
 2016 */
 2017
 2018static int call_netdevice_notifiers_info(unsigned long val,
 2019					 struct netdev_notifier_info *info)
 2020{
 2021	struct net *net = dev_net(info->dev);
 2022	int ret;
 2023
 2024	ASSERT_RTNL();
 2025
 2026	/* Run per-netns notifier block chain first, then run the global one.
 2027	 * Hopefully, one day, the global one is going to be removed after
 2028	 * all notifier block registrators get converted to be per-netns.
 2029	 */
 2030	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
 2031	if (ret & NOTIFY_STOP_MASK)
 2032		return ret;
 2033	return raw_notifier_call_chain(&netdev_chain, val, info);
 2034}
 2035
 2036static int call_netdevice_notifiers_extack(unsigned long val,
 2037					   struct net_device *dev,
 2038					   struct netlink_ext_ack *extack)
 2039{
 2040	struct netdev_notifier_info info = {
 2041		.dev = dev,
 2042		.extack = extack,
 2043	};
 2044
 2045	return call_netdevice_notifiers_info(val, &info);
 2046}
 2047
 2048/**
 2049 *	call_netdevice_notifiers - call all network notifier blocks
 2050 *      @val: value passed unmodified to notifier function
 2051 *      @dev: net_device pointer passed unmodified to notifier function
 2052 *
 2053 *	Call all network notifier blocks.  Parameters and return value
 2054 *	are as for raw_notifier_call_chain().
 2055 */
 2056
 2057int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 2058{
 2059	return call_netdevice_notifiers_extack(val, dev, NULL);
 2060}
 2061EXPORT_SYMBOL(call_netdevice_notifiers);
 2062
 2063/**
 2064 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 2065 *	@val: value passed unmodified to notifier function
 2066 *	@dev: net_device pointer passed unmodified to notifier function
 2067 *	@arg: additional u32 argument passed to the notifier function
 2068 *
 2069 *	Call all network notifier blocks.  Parameters and return value
 2070 *	are as for raw_notifier_call_chain().
 2071 */
 2072static int call_netdevice_notifiers_mtu(unsigned long val,
 2073					struct net_device *dev, u32 arg)
 2074{
 2075	struct netdev_notifier_info_ext info = {
 2076		.info.dev = dev,
 2077		.ext.mtu = arg,
 2078	};
 2079
 2080	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 2081
 2082	return call_netdevice_notifiers_info(val, &info.info);
 2083}
 2084
 2085#ifdef CONFIG_NET_INGRESS
 2086static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
 2087
 2088void net_inc_ingress_queue(void)
 2089{
 2090	static_branch_inc(&ingress_needed_key);
 2091}
 2092EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
 2093
 2094void net_dec_ingress_queue(void)
 2095{
 2096	static_branch_dec(&ingress_needed_key);
 2097}
 2098EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
 2099#endif
 2100
 2101#ifdef CONFIG_NET_EGRESS
 2102static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
 2103
 2104void net_inc_egress_queue(void)
 2105{
 2106	static_branch_inc(&egress_needed_key);
 2107}
 2108EXPORT_SYMBOL_GPL(net_inc_egress_queue);
 2109
 2110void net_dec_egress_queue(void)
 2111{
 2112	static_branch_dec(&egress_needed_key);
 2113}
 2114EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 2115#endif
 2116
 2117static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 2118#ifdef CONFIG_JUMP_LABEL
 2119static atomic_t netstamp_needed_deferred;
 2120static atomic_t netstamp_wanted;
 2121static void netstamp_clear(struct work_struct *work)
 2122{
 2123	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 2124	int wanted;
 2125
 2126	wanted = atomic_add_return(deferred, &netstamp_wanted);
 2127	if (wanted > 0)
 2128		static_branch_enable(&netstamp_needed_key);
 2129	else
 2130		static_branch_disable(&netstamp_needed_key);
 2131}
 2132static DECLARE_WORK(netstamp_work, netstamp_clear);
 2133#endif
 2134
 2135void net_enable_timestamp(void)
 2136{
 2137#ifdef CONFIG_JUMP_LABEL
 2138	int wanted;
 2139
 2140	while (1) {
 2141		wanted = atomic_read(&netstamp_wanted);
 2142		if (wanted <= 0)
 2143			break;
 2144		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
 2145			return;
 2146	}
 2147	atomic_inc(&netstamp_needed_deferred);
 2148	schedule_work(&netstamp_work);
 2149#else
 2150	static_branch_inc(&netstamp_needed_key);
 2151#endif
 2152}
 2153EXPORT_SYMBOL(net_enable_timestamp);
 2154
 2155void net_disable_timestamp(void)
 2156{
 2157#ifdef CONFIG_JUMP_LABEL
 2158	int wanted;
 2159
 2160	while (1) {
 2161		wanted = atomic_read(&netstamp_wanted);
 2162		if (wanted <= 1)
 2163			break;
 2164		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
 2165			return;
 2166	}
 2167	atomic_dec(&netstamp_needed_deferred);
 2168	schedule_work(&netstamp_work);
 2169#else
 2170	static_branch_dec(&netstamp_needed_key);
 2171#endif
 2172}
 2173EXPORT_SYMBOL(net_disable_timestamp);
 2174
 2175static inline void net_timestamp_set(struct sk_buff *skb)
 2176{
 2177	skb->tstamp = 0;
 2178	if (static_branch_unlikely(&netstamp_needed_key))
 2179		__net_timestamp(skb);
 2180}
 2181
 2182#define net_timestamp_check(COND, SKB)				\
 2183	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 2184		if ((COND) && !(SKB)->tstamp)			\
 2185			__net_timestamp(SKB);			\
 2186	}							\
 2187
 2188bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 2189{
 2190	unsigned int len;
 2191
 2192	if (!(dev->flags & IFF_UP))
 2193		return false;
 2194
 2195	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
 2196	if (skb->len <= len)
 2197		return true;
 2198
 2199	/* if TSO is enabled, we don't care about the length as the packet
 2200	 * could be forwarded without being segmented before
 2201	 */
 2202	if (skb_is_gso(skb))
 2203		return true;
 2204
 2205	return false;
 2206}
 2207EXPORT_SYMBOL_GPL(is_skb_forwardable);
 2208
 2209int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2210{
 2211	int ret = ____dev_forward_skb(dev, skb);
 2212
 2213	if (likely(!ret)) {
 2214		skb->protocol = eth_type_trans(skb, dev);
 2215		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 2216	}
 2217
 2218	return ret;
 2219}
 2220EXPORT_SYMBOL_GPL(__dev_forward_skb);
 2221
 2222/**
 2223 * dev_forward_skb - loopback an skb to another netif
 2224 *
 2225 * @dev: destination network device
 2226 * @skb: buffer to forward
 2227 *
 2228 * return values:
 2229 *	NET_RX_SUCCESS	(no congestion)
 2230 *	NET_RX_DROP     (packet was dropped, but freed)
 2231 *
 2232 * dev_forward_skb can be used for injecting an skb from the
 2233 * start_xmit function of one device into the receive queue
 2234 * of another device.
 2235 *
 2236 * The receiving device may be in another namespace, so
 2237 * we have to clear all information in the skb that could
 2238 * impact namespace isolation.
 2239 */
 2240int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2241{
 2242	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 2243}
 2244EXPORT_SYMBOL_GPL(dev_forward_skb);
 2245
 2246static inline int deliver_skb(struct sk_buff *skb,
 2247			      struct packet_type *pt_prev,
 2248			      struct net_device *orig_dev)
 2249{
 2250	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 2251		return -ENOMEM;
 2252	refcount_inc(&skb->users);
 2253	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 2254}
 2255
 2256static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 2257					  struct packet_type **pt,
 2258					  struct net_device *orig_dev,
 2259					  __be16 type,
 2260					  struct list_head *ptype_list)
 2261{
 2262	struct packet_type *ptype, *pt_prev = *pt;
 2263
 2264	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2265		if (ptype->type != type)
 2266			continue;
 2267		if (pt_prev)
 2268			deliver_skb(skb, pt_prev, orig_dev);
 2269		pt_prev = ptype;
 2270	}
 2271	*pt = pt_prev;
 2272}
 2273
 2274static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 2275{
 2276	if (!ptype->af_packet_priv || !skb->sk)
 2277		return false;
 2278
 2279	if (ptype->id_match)
 2280		return ptype->id_match(ptype, skb->sk);
 2281	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 2282		return true;
 2283
 2284	return false;
 2285}
 2286
 2287/**
 2288 * dev_nit_active - return true if any network interface taps are in use
 2289 *
 2290 * @dev: network device to check for the presence of taps
 2291 */
 2292bool dev_nit_active(struct net_device *dev)
 2293{
 2294	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
 2295}
 2296EXPORT_SYMBOL_GPL(dev_nit_active);
 2297
 2298/*
 2299 *	Support routine. Sends outgoing frames to any network
 2300 *	taps currently in use.
 2301 */
 2302
 2303void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 2304{
 2305	struct packet_type *ptype;
 2306	struct sk_buff *skb2 = NULL;
 2307	struct packet_type *pt_prev = NULL;
 2308	struct list_head *ptype_list = &ptype_all;
 2309
 2310	rcu_read_lock();
 2311again:
 2312	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2313		if (ptype->ignore_outgoing)
 2314			continue;
 2315
 2316		/* Never send packets back to the socket
 2317		 * they originated from - MvS (miquels@drinkel.ow.org)
 2318		 */
 2319		if (skb_loop_sk(ptype, skb))
 2320			continue;
 2321
 2322		if (pt_prev) {
 2323			deliver_skb(skb2, pt_prev, skb->dev);
 2324			pt_prev = ptype;
 2325			continue;
 2326		}
 2327
 2328		/* need to clone skb, done only once */
 2329		skb2 = skb_clone(skb, GFP_ATOMIC);
 2330		if (!skb2)
 2331			goto out_unlock;
 2332
 2333		net_timestamp_set(skb2);
 2334
 2335		/* skb->nh should be correctly
 2336		 * set by sender, so that the second statement is
 2337		 * just protection against buggy protocols.
 2338		 */
 2339		skb_reset_mac_header(skb2);
 2340
 2341		if (skb_network_header(skb2) < skb2->data ||
 2342		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
 2343			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
 2344					     ntohs(skb2->protocol),
 2345					     dev->name);
 2346			skb_reset_network_header(skb2);
 2347		}
 2348
 2349		skb2->transport_header = skb2->network_header;
 2350		skb2->pkt_type = PACKET_OUTGOING;
 2351		pt_prev = ptype;
 2352	}
 2353
 2354	if (ptype_list == &ptype_all) {
 2355		ptype_list = &dev->ptype_all;
 2356		goto again;
 2357	}
 2358out_unlock:
 2359	if (pt_prev) {
 2360		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
 2361			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 2362		else
 2363			kfree_skb(skb2);
 2364	}
 2365	rcu_read_unlock();
 2366}
 2367EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2368
 2369/**
 2370 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2371 * @dev: Network device
 2372 * @txq: number of queues available
 2373 *
 2374 * If real_num_tx_queues is changed the tc mappings may no longer be
 2375 * valid. To resolve this verify the tc mapping remains valid and if
 2376 * not NULL the mapping. With no priorities mapping to this
 2377 * offset/count pair it will no longer be used. In the worst case TC0
 2378 * is invalid nothing can be done so disable priority mappings. If is
 2379 * expected that drivers will fix this mapping if they can before
 2380 * calling netif_set_real_num_tx_queues.
 2381 */
 2382static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2383{
 2384	int i;
 2385	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2386
 2387	/* If TC0 is invalidated disable TC mapping */
 2388	if (tc->offset + tc->count > txq) {
 2389		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2390		dev->num_tc = 0;
 2391		return;
 2392	}
 2393
 2394	/* Invalidated prio to tc mappings set to TC0 */
 2395	for (i = 1; i < TC_BITMASK + 1; i++) {
 2396		int q = netdev_get_prio_tc_map(dev, i);
 2397
 2398		tc = &dev->tc_to_txq[q];
 2399		if (tc->offset + tc->count > txq) {
 2400			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2401				i, q);
 2402			netdev_set_prio_tc_map(dev, i, 0);
 2403		}
 2404	}
 2405}
 2406
 2407int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2408{
 2409	if (dev->num_tc) {
 2410		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2411		int i;
 2412
 2413		/* walk through the TCs and see if it falls into any of them */
 2414		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2415			if ((txq - tc->offset) < tc->count)
 2416				return i;
 2417		}
 2418
 2419		/* didn't find it, just return -1 to indicate no match */
 2420		return -1;
 2421	}
 2422
 2423	return 0;
 2424}
 2425EXPORT_SYMBOL(netdev_txq_to_tc);
 2426
 2427#ifdef CONFIG_XPS
/* Static key tested before any XPS map reset work is attempted. */
struct static_key xps_needed __read_mostly;
EXPORT_SYMBOL(xps_needed);
/* Static key tested before the rx-queue based XPS map is looked at. */
struct static_key xps_rxqs_needed __read_mostly;
EXPORT_SYMBOL(xps_rxqs_needed);
/* Held around modifications of the per-device XPS maps. */
static DEFINE_MUTEX(xps_map_mutex);
/* Update-side fetch of an XPS pointer; lockdep checks xps_map_mutex is held. */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2435
 2436static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 2437			     int tci, u16 index)
 2438{
 2439	struct xps_map *map = NULL;
 2440	int pos;
 2441
 2442	if (dev_maps)
 2443		map = xmap_dereference(dev_maps->attr_map[tci]);
 2444	if (!map)
 2445		return false;
 2446
 2447	for (pos = map->len; pos--;) {
 2448		if (map->queues[pos] != index)
 2449			continue;
 2450
 2451		if (map->len > 1) {
 2452			map->queues[pos] = map->queues[--map->len];
 2453			break;
 2454		}
 2455
 2456		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2457		kfree_rcu(map, rcu);
 2458		return false;
 2459	}
 2460
 2461	return true;
 2462}
 2463
 2464static bool remove_xps_queue_cpu(struct net_device *dev,
 2465				 struct xps_dev_maps *dev_maps,
 2466				 int cpu, u16 offset, u16 count)
 2467{
 2468	int num_tc = dev->num_tc ? : 1;
 2469	bool active = false;
 2470	int tci;
 2471
 2472	for (tci = cpu * num_tc; num_tc--; tci++) {
 2473		int i, j;
 2474
 2475		for (i = count, j = offset; i--; j++) {
 2476			if (!remove_xps_queue(dev_maps, tci, j))
 2477				break;
 2478		}
 2479
 2480		active |= i < 0;
 2481	}
 2482
 2483	return active;
 2484}
 2485
 2486static void reset_xps_maps(struct net_device *dev,
 2487			   struct xps_dev_maps *dev_maps,
 2488			   bool is_rxqs_map)
 2489{
 2490	if (is_rxqs_map) {
 2491		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 2492		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
 2493	} else {
 2494		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
 2495	}
 2496	static_key_slow_dec_cpuslocked(&xps_needed);
 2497	kfree_rcu(dev_maps, rcu);
 2498}
 2499
 2500static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
 2501			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
 2502			   u16 offset, u16 count, bool is_rxqs_map)
 2503{
 2504	bool active = false;
 2505	int i, j;
 2506
 2507	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
 2508	     j < nr_ids;)
 2509		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
 2510					       count);
 2511	if (!active)
 2512		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2513
 2514	if (!is_rxqs_map) {
 2515		for (i = offset + (count - 1); count--; i--) {
 2516			netdev_queue_numa_node_write(
 2517				netdev_get_tx_queue(dev, i),
 2518				NUMA_NO_NODE);
 2519		}
 2520	}
 2521}
 2522
 2523static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 2524				   u16 count)
 2525{
 2526	const unsigned long *possible_mask = NULL;
 2527	struct xps_dev_maps *dev_maps;
 2528	unsigned int nr_ids;
 2529
 2530	if (!static_key_false(&xps_needed))
 2531		return;
 2532
 2533	cpus_read_lock();
 2534	mutex_lock(&xps_map_mutex);
 2535
 2536	if (static_key_false(&xps_rxqs_needed)) {
 2537		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2538		if (dev_maps) {
 2539			nr_ids = dev->num_rx_queues;
 2540			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
 2541				       offset, count, true);
 2542		}
 2543	}
 2544
 2545	dev_maps = xmap_dereference(dev->xps_cpus_map);
 2546	if (!dev_maps)
 2547		goto out_no_maps;
 2548
 2549	if (num_possible_cpus() > 1)
 2550		possible_mask = cpumask_bits(cpu_possible_mask);
 2551	nr_ids = nr_cpu_ids;
 2552	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
 2553		       false);
 2554
 2555out_no_maps:
 2556	mutex_unlock(&xps_map_mutex);
 2557	cpus_read_unlock();
 2558}
 2559
 2560static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 2561{
 2562	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 2563}
 2564
 2565static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 2566				      u16 index, bool is_rxqs_map)
 2567{
 2568	struct xps_map *new_map;
 2569	int alloc_len = XPS_MIN_MAP_ALLOC;
 2570	int i, pos;
 2571
 2572	for (pos = 0; map && pos < map->len; pos++) {
 2573		if (map->queues[pos] != index)
 2574			continue;
 2575		return map;
 2576	}
 2577
 2578	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 2579	if (map) {
 2580		if (pos < map->alloc_len)
 2581			return map;
 2582
 2583		alloc_len = map->alloc_len * 2;
 2584	}
 2585
 2586	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
 2587	 *  map
 2588	 */
 2589	if (is_rxqs_map)
 2590		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
 2591	else
 2592		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
 2593				       cpu_to_node(attr_index));
 2594	if (!new_map)
 2595		return NULL;
 2596
 2597	for (i = 0; i < pos; i++)
 2598		new_map->queues[i] = map->queues[i];
 2599	new_map->alloc_len = alloc_len;
 2600	new_map->len = pos;
 2601
 2602	return new_map;
 2603}
 2604
 2605/* Must be called under cpus_read_lock */
 2606int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 2607			  u16 index, bool is_rxqs_map)
 2608{
 2609	const unsigned long *online_mask = NULL, *possible_mask = NULL;
 2610	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
 2611	int i, j, tci, numa_node_id = -2;
 2612	int maps_sz, num_tc = 1, tc = 0;
 2613	struct xps_map *map, *new_map;
 2614	bool active = false;
 2615	unsigned int nr_ids;
 2616
 2617	if (dev->num_tc) {
 2618		/* Do not allow XPS on subordinate device directly */
 2619		num_tc = dev->num_tc;
 2620		if (num_tc < 0)
 2621			return -EINVAL;
 2622
 2623		/* If queue belongs to subordinate dev use its map */
 2624		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
 2625
 2626		tc = netdev_txq_to_tc(dev, index);
 2627		if (tc < 0)
 2628			return -EINVAL;
 2629	}
 2630
 2631	mutex_lock(&xps_map_mutex);
 2632	if (is_rxqs_map) {
 2633		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
 2634		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2635		nr_ids = dev->num_rx_queues;
 2636	} else {
 2637		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
 2638		if (num_possible_cpus() > 1) {
 2639			online_mask = cpumask_bits(cpu_online_mask);
 2640			possible_mask = cpumask_bits(cpu_possible_mask);
 2641		}
 2642		dev_maps = xmap_dereference(dev->xps_cpus_map);
 2643		nr_ids = nr_cpu_ids;
 2644	}
 2645
 2646	if (maps_sz < L1_CACHE_BYTES)
 2647		maps_sz = L1_CACHE_BYTES;
 2648
 2649	/* allocate memory for queue storage */
 2650	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
 2651	     j < nr_ids;) {
 2652		if (!new_dev_maps)
 2653			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 2654		if (!new_dev_maps) {
 2655			mutex_unlock(&xps_map_mutex);
 2656			return -ENOMEM;
 2657		}
 2658
 2659		tci = j * num_tc + tc;
 2660		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
 2661				 NULL;
 2662
 2663		map = expand_xps_map(map, j, index, is_rxqs_map);
 2664		if (!map)
 2665			goto error;
 2666
 2667		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2668	}
 2669
 2670	if (!new_dev_maps)
 2671		goto out_no_new_maps;
 2672
 2673	if (!dev_maps) {
 2674		/* Increment static keys at most once per type */
 2675		static_key_slow_inc_cpuslocked(&xps_needed);
 2676		if (is_rxqs_map)
 2677			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
 2678	}
 2679
 2680	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2681	     j < nr_ids;) {
 2682		/* copy maps belonging to foreign traffic classes */
 2683		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
 2684			/* fill in the new device map from the old device map */
 2685			map = xmap_dereference(dev_maps->attr_map[tci]);
 2686			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2687		}
 2688
 2689		/* We need to explicitly update tci as prevous loop
 2690		 * could break out early if dev_maps is NULL.
 2691		 */
 2692		tci = j * num_tc + tc;
 2693
 2694		if (netif_attr_test_mask(j, mask, nr_ids) &&
 2695		    netif_attr_test_online(j, online_mask, nr_ids)) {
 2696			/* add tx-queue to CPU/rx-queue maps */
 2697			int pos = 0;
 2698
 2699			map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2700			while ((pos < map->len) && (map->queues[pos] != index))
 2701				pos++;
 2702
 2703			if (pos == map->len)
 2704				map->queues[map->len++] = index;
 2705#ifdef CONFIG_NUMA
 2706			if (!is_rxqs_map) {
 2707				if (numa_node_id == -2)
 2708					numa_node_id = cpu_to_node(j);
 2709				else if (numa_node_id != cpu_to_node(j))
 2710					numa_node_id = -1;
 2711			}
 2712#endif
 2713		} else if (dev_maps) {
 2714			/* fill in the new device map from the old device map */
 2715			map = xmap_dereference(dev_maps->attr_map[tci]);
 2716			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2717		}
 2718
 2719		/* copy maps belonging to foreign traffic classes */
 2720		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
 2721			/* fill in the new device map from the old device map */
 2722			map = xmap_dereference(dev_maps->attr_map[tci]);
 2723			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2724		}
 2725	}
 2726
 2727	if (is_rxqs_map)
 2728		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
 2729	else
 2730		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
 2731
 2732	/* Cleanup old maps */
 2733	if (!dev_maps)
 2734		goto out_no_old_maps;
 2735
 2736	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2737	     j < nr_ids;) {
 2738		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2739			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2740			map = xmap_dereference(dev_maps->attr_map[tci]);
 2741			if (map && map != new_map)
 2742				kfree_rcu(map, rcu);
 2743		}
 2744	}
 2745
 2746	kfree_rcu(dev_maps, rcu);
 2747
 2748out_no_old_maps:
 2749	dev_maps = new_dev_maps;
 2750	active = true;
 2751
 2752out_no_new_maps:
 2753	if (!is_rxqs_map) {
 2754		/* update Tx queue numa node */
 2755		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
 2756					     (numa_node_id >= 0) ?
 2757					     numa_node_id : NUMA_NO_NODE);
 2758	}
 2759
 2760	if (!dev_maps)
 2761		goto out_no_maps;
 2762
 2763	/* removes tx-queue from unused CPUs/rx-queues */
 2764	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2765	     j < nr_ids;) {
 2766		for (i = tc, tci = j * num_tc; i--; tci++)
 2767			active |= remove_xps_queue(dev_maps, tci, index);
 2768		if (!netif_attr_test_mask(j, mask, nr_ids) ||
 2769		    !netif_attr_test_online(j, online_mask, nr_ids))
 2770			active |= remove_xps_queue(dev_maps, tci, index);
 2771		for (i = num_tc - tc, tci++; --i; tci++)
 2772			active |= remove_xps_queue(dev_maps, tci, index);
 2773	}
 2774
 2775	/* free map if not active */
 2776	if (!active)
 2777		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2778
 2779out_no_maps:
 2780	mutex_unlock(&xps_map_mutex);
 2781
 2782	return 0;
 2783error:
 2784	/* remove any maps that we added */
 2785	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2786	     j < nr_ids;) {
 2787		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2788			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2789			map = dev_maps ?
 2790			      xmap_dereference(dev_maps->attr_map[tci]) :
 2791			      NULL;
 2792			if (new_map && new_map != map)
 2793				kfree(new_map);
 2794		}
 2795	}
 2796
 2797	mutex_unlock(&xps_map_mutex);
 2798
 2799	kfree(new_dev_maps);
 2800	return -ENOMEM;
 2801}
 2802EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2803
 2804int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2805			u16 index)
 2806{
 2807	int ret;
 2808
 2809	cpus_read_lock();
 2810	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
 2811	cpus_read_unlock();
 2812
 2813	return ret;
 2814}
 2815EXPORT_SYMBOL(netif_set_xps_queue);
 2816
 2817#endif
 2818static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2819{
 2820	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2821
 2822	/* Unbind any subordinate channels */
 2823	while (txq-- != &dev->_tx[0]) {
 2824		if (txq->sb_dev)
 2825			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2826	}
 2827}
 2828
 2829void netdev_reset_tc(struct net_device *dev)
 2830{
 2831#ifdef CONFIG_XPS
 2832	netif_reset_xps_queues_gt(dev, 0);
 2833#endif
 2834	netdev_unbind_all_sb_channels(dev);
 2835
 2836	/* Reset TC configuration of device */
 2837	dev->num_tc = 0;
 2838	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 2839	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
 2840}
 2841EXPORT_SYMBOL(netdev_reset_tc);
 2842
 2843int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2844{
 2845	if (tc >= dev->num_tc)
 2846		return -EINVAL;
 2847
 2848#ifdef CONFIG_XPS
 2849	netif_reset_xps_queues(dev, offset, count);
 2850#endif
 2851	dev->tc_to_txq[tc].count = count;
 2852	dev->tc_to_txq[tc].offset = offset;
 2853	return 0;
 2854}
 2855EXPORT_SYMBOL(netdev_set_tc_queue);
 2856
 2857int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 2858{
 2859	if (num_tc > TC_MAX_QUEUE)
 2860		return -EINVAL;
 2861
 2862#ifdef CONFIG_XPS
 2863	netif_reset_xps_queues_gt(dev, 0);
 2864#endif
 2865	netdev_unbind_all_sb_channels(dev);
 2866
 2867	dev->num_tc = num_tc;
 2868	return 0;
 2869}
 2870EXPORT_SYMBOL(netdev_set_num_tc);
 2871
 2872void netdev_unbind_sb_channel(struct net_device *dev,
 2873			      struct net_device *sb_dev)
 2874{
 2875	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2876
 2877#ifdef CONFIG_XPS
 2878	netif_reset_xps_queues_gt(sb_dev, 0);
 2879#endif
 2880	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2881	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2882
 2883	while (txq-- != &dev->_tx[0]) {
 2884		if (txq->sb_dev == sb_dev)
 2885			txq->sb_dev = NULL;
 2886	}
 2887}
 2888EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2889
 2890int netdev_bind_sb_channel_queue(struct net_device *dev,
 2891				 struct net_device *sb_dev,
 2892				 u8 tc, u16 count, u16 offset)
 2893{
 2894	/* Make certain the sb_dev and dev are already configured */
 2895	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2896		return -EINVAL;
 2897
 2898	/* We cannot hand out queues we don't have */
 2899	if ((offset + count) > dev->real_num_tx_queues)
 2900		return -EINVAL;
 2901
 2902	/* Record the mapping */
 2903	sb_dev->tc_to_txq[tc].count = count;
 2904	sb_dev->tc_to_txq[tc].offset = offset;
 2905
 2906	/* Provide a way for Tx queue to find the tc_to_txq map or
 2907	 * XPS map for itself.
 2908	 */
 2909	while (count--)
 2910		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2911
 2912	return 0;
 2913}
 2914EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2915
 2916int netdev_set_sb_channel(struct net_device *dev, u16 channel)
 2917{
 2918	/* Do not use a multiqueue device to represent a subordinate channel */
 2919	if (netif_is_multiqueue(dev))
 2920		return -ENODEV;
 2921
 2922	/* We allow channels 1 - 32767 to be used for subordinate channels.
 2923	 * Channel 0 is meant to be "native" mode and used only to represent
 2924	 * the main root device. We allow writing 0 to reset the device back
 2925	 * to normal mode after being used as a subordinate channel.
 2926	 */
 2927	if (channel > S16_MAX)
 2928		return -EINVAL;
 2929
 2930	dev->num_tc = -channel;
 2931
 2932	return 0;
 2933}
 2934EXPORT_SYMBOL(netdev_set_sb_channel);
 2935
 2936/*
 2937 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2938 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 2939 */
 2940int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2941{
 2942	bool disabling;
 2943	int rc;
 2944
 2945	disabling = txq < dev->real_num_tx_queues;
 2946
 2947	if (txq < 1 || txq > dev->num_tx_queues)
 2948		return -EINVAL;
 2949
 2950	if (dev->reg_state == NETREG_REGISTERED ||
 2951	    dev->reg_state == NETREG_UNREGISTERING) {
 2952		ASSERT_RTNL();
 2953
 2954		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 2955						  txq);
 2956		if (rc)
 2957			return rc;
 2958
 2959		if (dev->num_tc)
 2960			netif_setup_tc(dev, txq);
 2961
 2962		dev->real_num_tx_queues = txq;
 2963
 2964		if (disabling) {
 2965			synchronize_net();
 2966			qdisc_reset_all_tx_gt(dev, txq);
 2967#ifdef CONFIG_XPS
 2968			netif_reset_xps_queues_gt(dev, txq);
 2969#endif
 2970		}
 2971	} else {
 2972		dev->real_num_tx_queues = txq;
 2973	}
 2974
 2975	return 0;
 2976}
 2977EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 2978
 2979#ifdef CONFIG_SYSFS
 2980/**
 2981 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 2982 *	@dev: Network device
 2983 *	@rxq: Actual number of RX queues
 2984 *
 2985 *	This must be called either with the rtnl_lock held or before
 2986 *	registration of the net device.  Returns 0 on success, or a
 2987 *	negative error code.  If called before registration, it always
 2988 *	succeeds.
 2989 */
 2990int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 2991{
 2992	int rc;
 2993
 2994	if (rxq < 1 || rxq > dev->num_rx_queues)
 2995		return -EINVAL;
 2996
 2997	if (dev->reg_state == NETREG_REGISTERED) {
 2998		ASSERT_RTNL();
 2999
 3000		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 3001						  rxq);
 3002		if (rc)
 3003			return rc;
 3004	}
 3005
 3006	dev->real_num_rx_queues = rxq;
 3007	return 0;
 3008}
 3009EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 3010#endif
 3011
 3012/**
 3013 * netif_get_num_default_rss_queues - default number of RSS queues
 3014 *
 3015 * This routine should set an upper limit on the number of RSS queues
 3016 * used by default by multiqueue devices.
 3017 */
 3018int netif_get_num_default_rss_queues(void)
 3019{
 3020	return is_kdump_kernel() ?
 3021		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
 3022}
 3023EXPORT_SYMBOL(netif_get_num_default_rss_queues);
 3024
 3025static void __netif_reschedule(struct Qdisc *q)
 3026{
 3027	struct softnet_data *sd;
 3028	unsigned long flags;
 3029
 3030	local_irq_save(flags);
 3031	sd = this_cpu_ptr(&softnet_data);
 3032	q->next_sched = NULL;
 3033	*sd->output_queue_tailp = q;
 3034	sd->output_queue_tailp = &q->next_sched;
 3035	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3036	local_irq_restore(flags);
 3037}
 3038
 3039void __netif_schedule(struct Qdisc *q)
 3040{
 3041	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 3042		__netif_reschedule(q);
 3043}
 3044EXPORT_SYMBOL(__netif_schedule);
 3045
 3046struct dev_kfree_skb_cb {
 3047	enum skb_free_reason reason;
 3048};
 3049
 3050static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 3051{
 3052	return (struct dev_kfree_skb_cb *)skb->cb;
 3053}
 3054
 3055void netif_schedule_queue(struct netdev_queue *txq)
 3056{
 3057	rcu_read_lock();
 3058	if (!netif_xmit_stopped(txq)) {
 3059		struct Qdisc *q = rcu_dereference(txq->qdisc);
 3060
 3061		__netif_schedule(q);
 3062	}
 3063	rcu_read_unlock();
 3064}
 3065EXPORT_SYMBOL(netif_schedule_queue);
 3066
 3067void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 3068{
 3069	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 3070		struct Qdisc *q;
 3071
 3072		rcu_read_lock();
 3073		q = rcu_dereference(dev_queue->qdisc);
 3074		__netif_schedule(q);
 3075		rcu_read_unlock();
 3076	}
 3077}
 3078EXPORT_SYMBOL(netif_tx_wake_queue);
 3079
 3080void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
 3081{
 3082	unsigned long flags;
 3083
 3084	if (unlikely(!skb))
 3085		return;
 3086
 3087	if (likely(refcount_read(&skb->users) == 1)) {
 3088		smp_rmb();
 3089		refcount_set(&skb->users, 0);
 3090	} else if (likely(!refcount_dec_and_test(&skb->users))) {
 3091		return;
 3092	}
 3093	get_kfree_skb_cb(skb)->reason = reason;
 3094	local_irq_save(flags);
 3095	skb->next = __this_cpu_read(softnet_data.completion_queue);
 3096	__this_cpu_write(softnet_data.completion_queue, skb);
 3097	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3098	local_irq_restore(flags);
 3099}
 3100EXPORT_SYMBOL(__dev_kfree_skb_irq);
 3101
 3102void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
 3103{
 3104	if (in_irq() || irqs_disabled())
 3105		__dev_kfree_skb_irq(skb, reason);
 3106	else
 3107		dev_kfree_skb(skb);
 3108}
 3109EXPORT_SYMBOL(__dev_kfree_skb_any);
 3110
 3111
 3112/**
 3113 * netif_device_detach - mark device as removed
 3114 * @dev: network device
 3115 *
 3116 * Mark device as removed from system and therefore no longer available.
 3117 */
 3118void netif_device_detach(struct net_device *dev)
 3119{
 3120	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3121	    netif_running(dev)) {
 3122		netif_tx_stop_all_queues(dev);
 3123	}
 3124}
 3125EXPORT_SYMBOL(netif_device_detach);
 3126
 3127/**
 3128 * netif_device_attach - mark device as attached
 3129 * @dev: network device
 3130 *
 3131 * Mark device as attached from system and restart if needed.
 3132 */
 3133void netif_device_attach(struct net_device *dev)
 3134{
 3135	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3136	    netif_running(dev)) {
 3137		netif_tx_wake_all_queues(dev);
 3138		__netdev_watchdog_up(dev);
 3139	}
 3140}
 3141EXPORT_SYMBOL(netif_device_attach);
 3142
 3143/*
 3144 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
 3145 * to be used as a distribution range.
 3146 */
 3147static u16 skb_tx_hash(const struct net_device *dev,
 3148		       const struct net_device *sb_dev,
 3149		       struct sk_buff *skb)
 3150{
 3151	u32 hash;
 3152	u16 qoffset = 0;
 3153	u16 qcount = dev->real_num_tx_queues;
 3154
 3155	if (dev->num_tc) {
 3156		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 3157
 3158		qoffset = sb_dev->tc_to_txq[tc].offset;
 3159		qcount = sb_dev->tc_to_txq[tc].count;
 3160	}
 3161
 3162	if (skb_rx_queue_recorded(skb)) {
 3163		hash = skb_get_rx_queue(skb);
 3164		if (hash >= qoffset)
 3165			hash -= qoffset;
 3166		while (unlikely(hash >= qcount))
 3167			hash -= qcount;
 3168		return hash + qoffset;
 3169	}
 3170
 3171	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 3172}
 3173
 3174static void skb_warn_bad_offload(const struct sk_buff *skb)
 3175{
 3176	static const netdev_features_t null_features;
 3177	struct net_device *dev = skb->dev;
 3178	const char *name = "";
 3179
 3180	if (!net_ratelimit())
 3181		return;
 3182
 3183	if (dev) {
 3184		if (dev->dev.parent)
 3185			name = dev_driver_string(dev->dev.parent);
 3186		else
 3187			name = netdev_name(dev);
 3188	}
 3189	skb_dump(KERN_WARNING, skb, false);
 3190	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 3191	     name, dev ? &dev->features : &null_features,
 3192	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 3193}
 3194
 3195/*
 3196 * Invalidate hardware checksum when packet is to be mangled, and
 3197 * complete checksum manually on outgoing path.
 3198 */
 3199int skb_checksum_help(struct sk_buff *skb)
 3200{
 3201	__wsum csum;
 3202	int ret = 0, offset;
 3203
 3204	if (skb->ip_summed == CHECKSUM_COMPLETE)
 3205		goto out_set_summed;
 3206
 3207	if (unlikely(skb_shinfo(skb)->gso_size)) {
 3208		skb_warn_bad_offload(skb);
 3209		return -EINVAL;
 3210	}
 3211
 3212	/* Before computing a checksum, we should make sure no frag could
 3213	 * be modified by an external entity : checksum could be wrong.
 3214	 */
 3215	if (skb_has_shared_frag(skb)) {
 3216		ret = __skb_linearize(skb);
 3217		if (ret)
 3218			goto out;
 3219	}
 3220
 3221	offset = skb_checksum_start_offset(skb);
 3222	BUG_ON(offset >= skb_headlen(skb));
 3223	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 3224
 3225	offset += skb->csum_offset;
 3226	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
 3227
 3228	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
 3229	if (ret)
 3230		goto out;
 3231
 3232	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 3233out_set_summed:
 3234	skb->ip_summed = CHECKSUM_NONE;
 3235out:
 3236	return ret;
 3237}
 3238EXPORT_SYMBOL(skb_checksum_help);
 3239
 3240int skb_crc32c_csum_help(struct sk_buff *skb)
 3241{
 3242	__le32 crc32c_csum;
 3243	int ret = 0, offset, start;
 3244
 3245	if (skb->ip_summed != CHECKSUM_PARTIAL)
 3246		goto out;
 3247
 3248	if (unlikely(skb_is_gso(skb)))
 3249		goto out;
 3250
 3251	/* Before computing a checksum, we should make sure no frag could
 3252	 * be modified by an external entity : checksum could be wrong.
 3253	 */
 3254	if (unlikely(skb_has_shared_frag(skb))) {
 3255		ret = __skb_linearize(skb);
 3256		if (ret)
 3257			goto out;
 3258	}
 3259	start = skb_checksum_start_offset(skb);
 3260	offset = start + offsetof(struct sctphdr, checksum);
 3261	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 3262		ret = -EINVAL;
 3263		goto out;
 3264	}
 3265
 3266	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
 3267	if (ret)
 3268		goto out;
 3269
 3270	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 3271						  skb->len - start, ~(__u32)0,
 3272						  crc32c_csum_stub));
 3273	*(__le32 *)(skb->data + offset) = crc32c_csum;
 3274	skb->ip_summed = CHECKSUM_NONE;
 3275	skb->csum_not_inet = 0;
 3276out:
 3277	return ret;
 3278}
 3279
 3280__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 3281{
 3282	__be16 type = skb->protocol;
 3283
 3284	/* Tunnel gso handlers can set protocol to ethernet. */
 3285	if (type == htons(ETH_P_TEB)) {
 3286		struct ethhdr *eth;
 3287
 3288		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 3289			return 0;
 3290
 3291		eth = (struct ethhdr *)skb->data;
 3292		type = eth->h_proto;
 3293	}
 3294
 3295	return __vlan_get_protocol(skb, type, depth);
 3296}
 3297
 3298/**
 3299 *	skb_mac_gso_segment - mac layer segmentation handler.
 3300 *	@skb: buffer to segment
 3301 *	@features: features for the output path (see dev->features)
 3302 */
 3303struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 3304				    netdev_features_t features)
 3305{
 3306	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
 3307	struct packet_offload *ptype;
 3308	int vlan_depth = skb->mac_len;
 3309	__be16 type = skb_network_protocol(skb, &vlan_depth);
 3310
 3311	if (unlikely(!type))
 3312		return ERR_PTR(-EINVAL);
 3313
 3314	__skb_pull(skb, vlan_depth);
 3315
 3316	rcu_read_lock();
 3317	list_for_each_entry_rcu(ptype, &offload_base, list) {
 3318		if (ptype->type == type && ptype->callbacks.gso_segment) {
 3319			segs = ptype->callbacks.gso_segment(skb, features);
 3320			break;
 3321		}
 3322	}
 3323	rcu_read_unlock();
 3324
 3325	__skb_push(skb, skb->data - skb_mac_header(skb));
 3326
 3327	return segs;
 3328}
 3329EXPORT_SYMBOL(skb_mac_gso_segment);
 3330
 3331
 3332/* openvswitch calls this on rx path, so we need a different check.
 3333 */
 3334static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 3335{
 3336	if (tx_path)
 3337		return skb->ip_summed != CHECKSUM_PARTIAL &&
 3338		       skb->ip_summed != CHECKSUM_UNNECESSARY;
 3339
 3340	return skb->ip_summed == CHECKSUM_NONE;
 3341}
 3342
 3343/**
 3344 *	__skb_gso_segment - Perform segmentation on skb.
 3345 *	@skb: buffer to segment
 3346 *	@features: features for the output path (see dev->features)
 3347 *	@tx_path: whether it is called in TX path
 3348 *
 3349 *	This function segments the given skb and returns a list of segments.
 3350 *
 3351 *	It may return NULL if the skb requires no segmentation.  This is
 3352 *	only possible when GSO is used for verifying header integrity.
 3353 *
 3354 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
 3355 */
 3356struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 3357				  netdev_features_t features, bool tx_path)
 3358{
 3359	struct sk_buff *segs;
 3360
 3361	if (unlikely(skb_needs_check(skb, tx_path))) {
 3362		int err;
 3363
 3364		/* We're going to init ->check field in TCP or UDP header */
 3365		err = skb_cow_head(skb, 0);
 3366		if (err < 0)
 3367			return ERR_PTR(err);
 3368	}
 3369
 3370	/* Only report GSO partial support if it will enable us to
 3371	 * support segmentation on this frame without needing additional
 3372	 * work.
 3373	 */
 3374	if (features & NETIF_F_GSO_PARTIAL) {
 3375		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
 3376		struct net_device *dev = skb->dev;
 3377
 3378		partial_features |= dev->features & dev->gso_partial_features;
 3379		if (!skb_gso_ok(skb, features | partial_features))
 3380			features &= ~NETIF_F_GSO_PARTIAL;
 3381	}
 3382
 3383	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
 3384		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 3385
 3386	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
 3387	SKB_GSO_CB(skb)->encap_level = 0;
 3388
 3389	skb_reset_mac_header(skb);
 3390	skb_reset_mac_len(skb);
 3391
 3392	segs = skb_mac_gso_segment(skb, features);
 3393
 3394	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 3395		skb_warn_bad_offload(skb);
 3396
 3397	return segs;
 3398}
 3399EXPORT_SYMBOL(__skb_gso_segment);
 3400
 3401/* Take action when hardware reception checksum errors are detected. */
 3402#ifdef CONFIG_BUG
 3403void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 3404{
 3405	if (net_ratelimit()) {
 3406		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
 3407		skb_dump(KERN_ERR, skb, true);
 3408		dump_stack();
 3409	}
 3410}
 3411EXPORT_SYMBOL(netdev_rx_csum_fault);
 3412#endif
 3413
 3414/* XXX: check that highmem exists at all on the given machine. */
 3415static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 3416{
 3417#ifdef CONFIG_HIGHMEM
 3418	int i;
 3419
 3420	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3421		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3422			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 3423
 3424			if (PageHighMem(skb_frag_page(frag)))
 3425				return 1;
 3426		}
 3427	}
 3428#endif
 3429	return 0;
 3430}
 3431
 3432/* If MPLS offload request, verify we are testing hardware MPLS features
 3433 * instead of standard features for the netdev.
 3434 */
 3435#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3436static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3437					   netdev_features_t features,
 3438					   __be16 type)
 3439{
 3440	if (eth_p_mpls(type))
 3441		features &= skb->dev->mpls_features;
 3442
 3443	return features;
 3444}
 3445#else
 3446static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3447					   netdev_features_t features,
 3448					   __be16 type)
 3449{
 3450	return features;
 3451}
 3452#endif
 3453
 3454static netdev_features_t harmonize_features(struct sk_buff *skb,
 3455	netdev_features_t features)
 3456{
 3457	__be16 type;
 3458
 3459	type = skb_network_protocol(skb, NULL);
 3460	features = net_mpls_features(skb, features, type);
 3461
 3462	if (skb->ip_summed != CHECKSUM_NONE &&
 3463	    !can_checksum_protocol(features, type)) {
 3464		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3465	}
 3466	if (illegal_highdma(skb->dev, skb))
 3467		features &= ~NETIF_F_SG;
 3468
 3469	return features;
 3470}
 3471
 3472netdev_features_t passthru_features_check(struct sk_buff *skb,
 3473					  struct net_device *dev,
 3474					  netdev_features_t features)
 3475{
 3476	return features;
 3477}
 3478EXPORT_SYMBOL(passthru_features_check);
 3479
 3480static netdev_features_t dflt_features_check(struct sk_buff *skb,
 3481					     struct net_device *dev,
 3482					     netdev_features_t features)
 3483{
 3484	return vlan_features_check(skb, features);
 3485}
 3486
 3487static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3488					    struct net_device *dev,
 3489					    netdev_features_t features)
 3490{
 3491	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3492
 3493	if (gso_segs > dev->gso_max_segs)
 3494		return features & ~NETIF_F_GSO_MASK;
 3495
 3496	/* Support for GSO partial features requires software
 3497	 * intervention before we can actually process the packets
 3498	 * so we need to strip support for any partial features now
 3499	 * and we can pull them back in after we have partially
 3500	 * segmented the frame.
 3501	 */
 3502	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3503		features &= ~dev->gso_partial_features;
 3504
 3505	/* Make sure to clear the IPv4 ID mangling feature if the
 3506	 * IPv4 header has the potential to be fragmented.
 3507	 */
 3508	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3509		struct iphdr *iph = skb->encapsulation ?
 3510				    inner_ip_hdr(skb) : ip_hdr(skb);
 3511
 3512		if (!(iph->frag_off & htons(IP_DF)))
 3513			features &= ~NETIF_F_TSO_MANGLEID;
 3514	}
 3515
 3516	return features;
 3517}
 3518
 3519netdev_features_t netif_skb_features(struct sk_buff *skb)
 3520{
 3521	struct net_device *dev = skb->dev;
 3522	netdev_features_t features = dev->features;
 3523
 3524	if (skb_is_gso(skb))
 3525		features = gso_features_check(skb, dev, features);
 3526
 3527	/* If encapsulation offload request, verify we are testing
 3528	 * hardware encapsulation features instead of standard
 3529	 * features for the netdev
 3530	 */
 3531	if (skb->encapsulation)
 3532		features &= dev->hw_enc_features;
 3533
 3534	if (skb_vlan_tagged(skb))
 3535		features = netdev_intersect_features(features,
 3536						     dev->vlan_features |
 3537						     NETIF_F_HW_VLAN_CTAG_TX |
 3538						     NETIF_F_HW_VLAN_STAG_TX);
 3539
 3540	if (dev->netdev_ops->ndo_features_check)
 3541		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3542								features);
 3543	else
 3544		features &= dflt_features_check(skb, dev, features);
 3545
 3546	return harmonize_features(skb, features);
 3547}
 3548EXPORT_SYMBOL(netif_skb_features);
 3549
 3550static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3551		    struct netdev_queue *txq, bool more)
 3552{
 3553	unsigned int len;
 3554	int rc;
 3555
 3556	if (dev_nit_active(dev))
 3557		dev_queue_xmit_nit(skb, dev);
 3558
 3559	len = skb->len;
 3560	trace_net_dev_start_xmit(skb, dev);
 3561	rc = netdev_start_xmit(skb, dev, txq, more);
 3562	trace_net_dev_xmit(skb, rc, dev, len);
 3563
 3564	return rc;
 3565}
 3566
 3567struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3568				    struct netdev_queue *txq, int *ret)
 3569{
 3570	struct sk_buff *skb = first;
 3571	int rc = NETDEV_TX_OK;
 3572
 3573	while (skb) {
 3574		struct sk_buff *next = skb->next;
 3575
 3576		skb_mark_not_on_list(skb);
 3577		rc = xmit_one(skb, dev, txq, next != NULL);
 3578		if (unlikely(!dev_xmit_complete(rc))) {
 3579			skb->next = next;
 3580			goto out;
 3581		}
 3582
 3583		skb = next;
 3584		if (netif_tx_queue_stopped(txq) && skb) {
 3585			rc = NETDEV_TX_BUSY;
 3586			break;
 3587		}
 3588	}
 3589
 3590out:
 3591	*ret = rc;
 3592	return skb;
 3593}
 3594
 3595static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3596					  netdev_features_t features)
 3597{
 3598	if (skb_vlan_tag_present(skb) &&
 3599	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3600		skb = __vlan_hwaccel_push_inside(skb);
 3601	return skb;
 3602}
 3603
 3604int skb_csum_hwoffload_help(struct sk_buff *skb,
 3605			    const netdev_features_t features)
 3606{
 3607	if (unlikely(skb->csum_not_inet))
 3608		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3609			skb_crc32c_csum_help(skb);
 3610
 3611	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
 3612}
 3613EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3614
 3615static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 3616{
 3617	netdev_features_t features;
 3618
 3619	features = netif_skb_features(skb);
 3620	skb = validate_xmit_vlan(skb, features);
 3621	if (unlikely(!skb))
 3622		goto out_null;
 3623
 3624	skb = sk_validate_xmit_skb(skb, dev);
 3625	if (unlikely(!skb))
 3626		goto out_null;
 3627
 3628	if (netif_needs_gso(skb, features)) {
 3629		struct sk_buff *segs;
 3630
 3631		segs = skb_gso_segment(skb, features);
 3632		if (IS_ERR(segs)) {
 3633			goto out_kfree_skb;
 3634		} else if (segs) {
 3635			consume_skb(skb);
 3636			skb = segs;
 3637		}
 3638	} else {
 3639		if (skb_needs_linearize(skb, features) &&
 3640		    __skb_linearize(skb))
 3641			goto out_kfree_skb;
 3642
 3643		/* If packet is not checksummed and device does not
 3644		 * support checksumming for this protocol, complete
 3645		 * checksumming here.
 3646		 */
 3647		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3648			if (skb->encapsulation)
 3649				skb_set_inner_transport_header(skb,
 3650							       skb_checksum_start_offset(skb));
 3651			else
 3652				skb_set_transport_header(skb,
 3653							 skb_checksum_start_offset(skb));
 3654			if (skb_csum_hwoffload_help(skb, features))
 3655				goto out_kfree_skb;
 3656		}
 3657	}
 3658
 3659	skb = validate_xmit_xfrm(skb, features, again);
 3660
 3661	return skb;
 3662
 3663out_kfree_skb:
 3664	kfree_skb(skb);
 3665out_null:
 3666	atomic_long_inc(&dev->tx_dropped);
 3667	return NULL;
 3668}
 3669
 3670struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 3671{
 3672	struct sk_buff *next, *head = NULL, *tail;
 3673
 3674	for (; skb != NULL; skb = next) {
 3675		next = skb->next;
 3676		skb_mark_not_on_list(skb);
 3677
 3678		/* in case skb wont be segmented, point to itself */
 3679		skb->prev = skb;
 3680
 3681		skb = validate_xmit_skb(skb, dev, again);
 3682		if (!skb)
 3683			continue;
 3684
 3685		if (!head)
 3686			head = skb;
 3687		else
 3688			tail->next = skb;
 3689		/* If skb was segmented, skb->prev points to
 3690		 * the last segment. If not, it still contains skb.
 3691		 */
 3692		tail = skb->prev;
 3693	}
 3694	return head;
 3695}
 3696EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3697
 3698static void qdisc_pkt_len_init(struct sk_buff *skb)
 3699{
 3700	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 3701
 3702	qdisc_skb_cb(skb)->pkt_len = skb->len;
 3703
 3704	/* To get more precise estimation of bytes sent on wire,
 3705	 * we add to pkt_len the headers size of all segments
 3706	 */
 3707	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3708		unsigned int hdr_len;
 3709		u16 gso_segs = shinfo->gso_segs;
 3710
 3711		/* mac layer + network layer */
 3712		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
 3713
 3714		/* + transport layer */
 3715		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3716			const struct tcphdr *th;
 3717			struct tcphdr _tcphdr;
 3718
 3719			th = skb_header_pointer(skb, skb_transport_offset(skb),
 3720						sizeof(_tcphdr), &_tcphdr);
 3721			if (likely(th))
 3722				hdr_len += __tcp_hdrlen(th);
 3723		} else {
 3724			struct udphdr _udphdr;
 3725
 3726			if (skb_header_pointer(skb, skb_transport_offset(skb),
 3727					       sizeof(_udphdr), &_udphdr))
 3728				hdr_len += sizeof(struct udphdr);
 3729		}
 3730
 3731		if (shinfo->gso_type & SKB_GSO_DODGY)
 3732			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
 3733						shinfo->gso_size);
 3734
 3735		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 3736	}
 3737}
 3738
 3739static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 3740				 struct net_device *dev,
 3741				 struct netdev_queue *txq)
 3742{
 3743	spinlock_t *root_lock = qdisc_lock(q);
 3744	struct sk_buff *to_free = NULL;
 3745	bool contended;
 3746	int rc;
 3747
 3748	qdisc_calculate_pkt_len(skb, q);
 3749
 3750	if (q->flags & TCQ_F_NOLOCK) {
 3751		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
 3752		qdisc_run(q);
 3753
 3754		if (unlikely(to_free))
 3755			kfree_skb_list(to_free);
 3756		return rc;
 3757	}
 3758
 3759	/*
 3760	 * Heuristic to force contended enqueues to serialize on a
 3761	 * separate lock before trying to get qdisc main lock.
 3762	 * This permits qdisc->running owner to get the lock more
 3763	 * often and dequeue packets faster.
 3764	 */
 3765	contended = qdisc_is_running(q);
 3766	if (unlikely(contended))
 3767		spin_lock(&q->busylock);
 3768
 3769	spin_lock(root_lock);
 3770	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 3771		__qdisc_drop(skb, &to_free);
 3772		rc = NET_XMIT_DROP;
 3773	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
 3774		   qdisc_run_begin(q)) {
 3775		/*
 3776		 * This is a work-conserving queue; there are no old skbs
 3777		 * waiting to be sent out; and the qdisc is not running -
 3778		 * xmit the skb directly.
 3779		 */
 3780
 3781		qdisc_bstats_update(q, skb);
 3782
 3783		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
 3784			if (unlikely(contended)) {
 3785				spin_unlock(&q->busylock);
 3786				contended = false;
 3787			}
 3788			__qdisc_run(q);
 3789		}
 3790
 3791		qdisc_run_end(q);
 3792		rc = NET_XMIT_SUCCESS;
 3793	} else {
 3794		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
 3795		if (qdisc_run_begin(q)) {
 3796			if (unlikely(contended)) {
 3797				spin_unlock(&q->busylock);
 3798				contended = false;
 3799			}
 3800			__qdisc_run(q);
 3801			qdisc_run_end(q);
 3802		}
 3803	}
 3804	spin_unlock(root_lock);
 3805	if (unlikely(to_free))
 3806		kfree_skb_list(to_free);
 3807	if (unlikely(contended))
 3808		spin_unlock(&q->busylock);
 3809	return rc;
 3810}
 3811
 3812#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 3813static void skb_update_prio(struct sk_buff *skb)
 3814{
 3815	const struct netprio_map *map;
 3816	const struct sock *sk;
 3817	unsigned int prioidx;
 3818
 3819	if (skb->priority)
 3820		return;
 3821	map = rcu_dereference_bh(skb->dev->priomap);
 3822	if (!map)
 3823		return;
 3824	sk = skb_to_full_sk(skb);
 3825	if (!sk)
 3826		return;
 3827
 3828	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 3829
 3830	if (prioidx < map->priomap_len)
 3831		skb->priority = map->priomap[prioidx];
 3832}
 3833#else
 3834#define skb_update_prio(skb)
 3835#endif
 3836
 3837/**
 3838 *	dev_loopback_xmit - loop back @skb
 3839 *	@net: network namespace this loopback is happening in
 3840 *	@sk:  sk needed to be a netfilter okfn
 3841 *	@skb: buffer to transmit
 3842 */
 3843int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 3844{
 3845	skb_reset_mac_header(skb);
 3846	__skb_pull(skb, skb_network_offset(skb));
 3847	skb->pkt_type = PACKET_LOOPBACK;
 3848	skb->ip_summed = CHECKSUM_UNNECESSARY;
 3849	WARN_ON(!skb_dst(skb));
 3850	skb_dst_force(skb);
 3851	netif_rx_ni(skb);
 3852	return 0;
 3853}
 3854EXPORT_SYMBOL(dev_loopback_xmit);
 3855
 3856#ifdef CONFIG_NET_EGRESS
 3857static struct sk_buff *
 3858sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 3859{
 3860	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
 3861	struct tcf_result cl_res;
 3862
 3863	if (!miniq)
 3864		return skb;
 3865
 3866	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 3867	mini_qdisc_bstats_cpu_update(miniq, skb);
 3868
 3869	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
 3870	case TC_ACT_OK:
 3871	case TC_ACT_RECLASSIFY:
 3872		skb->tc_index = TC_H_MIN(cl_res.classid);
 3873		break;
 3874	case TC_ACT_SHOT:
 3875		mini_qdisc_qstats_cpu_drop(miniq);
 3876		*ret = NET_XMIT_DROP;
 3877		kfree_skb(skb);
 3878		return NULL;
 3879	case TC_ACT_STOLEN:
 3880	case TC_ACT_QUEUED:
 3881	case TC_ACT_TRAP:
 3882		*ret = NET_XMIT_SUCCESS;
 3883		consume_skb(skb);
 3884		return NULL;
 3885	case TC_ACT_REDIRECT:
 3886		/* No need to push/pop skb's mac_header here on egress! */
 3887		skb_do_redirect(skb);
 3888		*ret = NET_XMIT_SUCCESS;
 3889		return NULL;
 3890	default:
 3891		break;
 3892	}
 3893
 3894	return skb;
 3895}
 3896#endif /* CONFIG_NET_EGRESS */
 3897
 3898#ifdef CONFIG_XPS
 3899static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 3900			       struct xps_dev_maps *dev_maps, unsigned int tci)
 3901{
 3902	struct xps_map *map;
 3903	int queue_index = -1;
 3904
 3905	if (dev->num_tc) {
 3906		tci *= dev->num_tc;
 3907		tci += netdev_get_prio_tc_map(dev, skb->priority);
 3908	}
 3909
 3910	map = rcu_dereference(dev_maps->attr_map[tci]);
 3911	if (map) {
 3912		if (map->len == 1)
 3913			queue_index = map->queues[0];
 3914		else
 3915			queue_index = map->queues[reciprocal_scale(
 3916						skb_get_hash(skb), map->len)];
 3917		if (unlikely(queue_index >= dev->real_num_tx_queues))
 3918			queue_index = -1;
 3919	}
 3920	return queue_index;
 3921}
 3922#endif
 3923
 3924static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 3925			 struct sk_buff *skb)
 3926{
 3927#ifdef CONFIG_XPS
 3928	struct xps_dev_maps *dev_maps;
 3929	struct sock *sk = skb->sk;
 3930	int queue_index = -1;
 3931
 3932	if (!static_key_false(&xps_needed))
 3933		return -1;
 3934
 3935	rcu_read_lock();
 3936	if (!static_key_false(&xps_rxqs_needed))
 3937		goto get_cpus_map;
 3938
 3939	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 3940	if (dev_maps) {
 3941		int tci = sk_rx_queue_get(sk);
 3942
 3943		if (tci >= 0 && tci < dev->num_rx_queues)
 3944			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3945							  tci);
 3946	}
 3947
 3948get_cpus_map:
 3949	if (queue_index < 0) {
 3950		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 3951		if (dev_maps) {
 3952			unsigned int tci = skb->sender_cpu - 1;
 3953
 3954			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3955							  tci);
 3956		}
 3957	}
 3958	rcu_read_unlock();
 3959
 3960	return queue_index;
 3961#else
 3962	return -1;
 3963#endif
 3964}
 3965
 3966u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 3967		     struct net_device *sb_dev)
 3968{
 3969	return 0;
 3970}
 3971EXPORT_SYMBOL(dev_pick_tx_zero);
 3972
 3973u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 3974		       struct net_device *sb_dev)
 3975{
 3976	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 3977}
 3978EXPORT_SYMBOL(dev_pick_tx_cpu_id);
 3979
 3980u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 3981		     struct net_device *sb_dev)
 3982{
 3983	struct sock *sk = skb->sk;
 3984	int queue_index = sk_tx_queue_get(sk);
 3985
 3986	sb_dev = sb_dev ? : dev;
 3987
 3988	if (queue_index < 0 || skb->ooo_okay ||
 3989	    queue_index >= dev->real_num_tx_queues) {
 3990		int new_index = get_xps_queue(dev, sb_dev, skb);
 3991
 3992		if (new_index < 0)
 3993			new_index = skb_tx_hash(dev, sb_dev, skb);
 3994
 3995		if (queue_index != new_index && sk &&
 3996		    sk_fullsock(sk) &&
 3997		    rcu_access_pointer(sk->sk_dst_cache))
 3998			sk_tx_queue_set(sk, new_index);
 3999
 4000		queue_index = new_index;
 4001	}
 4002
 4003	return queue_index;
 4004}
 4005EXPORT_SYMBOL(netdev_pick_tx);
 4006
 4007struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 4008					 struct sk_buff *skb,
 4009					 struct net_device *sb_dev)
 4010{
 4011	int queue_index = 0;
 4012
 4013#ifdef CONFIG_XPS
 4014	u32 sender_cpu = skb->sender_cpu - 1;
 4015
 4016	if (sender_cpu >= (u32)NR_CPUS)
 4017		skb->sender_cpu = raw_smp_processor_id() + 1;
 4018#endif
 4019
 4020	if (dev->real_num_tx_queues != 1) {
 4021		const struct net_device_ops *ops = dev->netdev_ops;
 4022
 4023		if (ops->ndo_select_queue)
 4024			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 4025		else
 4026			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 4027
 4028		queue_index = netdev_cap_txqueue(dev, queue_index);
 4029	}
 4030
 4031	skb_set_queue_mapping(skb, queue_index);
 4032	return netdev_get_tx_queue(dev, queue_index);
 4033}
 4034
 4035/**
 4036 *	__dev_queue_xmit - transmit a buffer
 4037 *	@skb: buffer to transmit
 4038 *	@sb_dev: suboordinate device used for L2 forwarding offload
 4039 *
 4040 *	Queue a buffer for transmission to a network device. The caller must
 4041 *	have set the device and priority and built the buffer before calling
 4042 *	this function. The function can be called from an interrupt.
 4043 *
 4044 *	A negative errno code is returned on a failure. A success does not
 4045 *	guarantee the frame will be transmitted as it may be dropped due
 4046 *	to congestion or traffic shaping.
 4047 *
 4048 * -----------------------------------------------------------------------------------
 4049 *      I notice this method can also return errors from the queue disciplines,
 4050 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 4051 *      be positive.
 4052 *
 4053 *      Regardless of the return value, the skb is consumed, so it is currently
 4054 *      difficult to retry a send to this method.  (You can bump the ref count
 4055 *      before sending to hold a reference for retry if you are careful.)
 4056 *
 4057 *      When calling this method, interrupts MUST be enabled.  This is because
 4058 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 4059 *          --BLG
 4060 */
 4061static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 4062{
 4063	struct net_device *dev = skb->dev;
 4064	struct netdev_queue *txq;
 4065	struct Qdisc *q;
 4066	int rc = -ENOMEM;
 4067	bool again = false;
 4068
 4069	skb_reset_mac_header(skb);
 4070
 4071	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
 4072		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
 4073
 4074	/* Disable soft irqs for various locks below. Also
 4075	 * stops preemption for RCU.
 4076	 */
 4077	rcu_read_lock_bh();
 4078
 4079	skb_update_prio(skb);
 4080
 4081	qdisc_pkt_len_init(skb);
 4082#ifdef CONFIG_NET_CLS_ACT
 4083	skb->tc_at_ingress = 0;
 4084# ifdef CONFIG_NET_EGRESS
 4085	if (static_branch_unlikely(&egress_needed_key)) {
 4086		skb = sch_handle_egress(skb, &rc, dev);
 4087		if (!skb)
 4088			goto out;
 4089	}
 4090# endif
 4091#endif
 4092	/* If device/qdisc don't need skb->dst, release it right now while
 4093	 * its hot in this cpu cache.
 4094	 */
 4095	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
 4096		skb_dst_drop(skb);
 4097	else
 4098		skb_dst_force(skb);
 4099
 4100	txq = netdev_core_pick_tx(dev, skb, sb_dev);
 4101	q = rcu_dereference_bh(txq->qdisc);
 4102
 4103	trace_net_dev_queue(skb);
 4104	if (q->enqueue) {
 4105		rc = __dev_xmit_skb(skb, q, dev, txq);
 4106		goto out;
 4107	}
 4108
 4109	/* The device has no queue. Common case for software devices:
 4110	 * loopback, all the sorts of tunnels...
 4111
 4112	 * Really, it is unlikely that netif_tx_lock protection is necessary
 4113	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
 4114	 * counters.)
 4115	 * However, it is possible, that they rely on protection
 4116	 * made by us here.
 4117
 4118	 * Check this and shot the lock. It is not prone from deadlocks.
 4119	 *Either shot noqueue qdisc, it is even simpler 8)
 4120	 */
 4121	if (dev->flags & IFF_UP) {
 4122		int cpu = smp_processor_id(); /* ok because BHs are off */
 4123
 4124		if (txq->xmit_lock_owner != cpu) {
 4125			if (dev_xmit_recursion())
 4126				goto recursion_alert;
 4127
 4128			skb = validate_xmit_skb(skb, dev, &again);
 4129			if (!skb)
 4130				goto out;
 4131
 4132			HARD_TX_LOCK(dev, txq, cpu);
 4133
 4134			if (!netif_xmit_stopped(txq)) {
 4135				dev_xmit_recursion_inc();
 4136				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
 4137				dev_xmit_recursion_dec();
 4138				if (dev_xmit_complete(rc)) {
 4139					HARD_TX_UNLOCK(dev, txq);
 4140					goto out;
 4141				}
 4142			}
 4143			HARD_TX_UNLOCK(dev, txq);
 4144			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
 4145					     dev->name);
 4146		} else {
 4147			/* Recursion is detected! It is possible,
 4148			 * unfortunately
 4149			 */
 4150recursion_alert:
 4151			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
 4152					     dev->name);
 4153		}
 4154	}
 4155
 4156	rc = -ENETDOWN;
 4157	rcu_read_unlock_bh();
 4158
 4159	atomic_long_inc(&dev->tx_dropped);
 4160	kfree_skb_list(skb);
 4161	return rc;
 4162out:
 4163	rcu_read_unlock_bh();
 4164	return rc;
 4165}
 4166
 4167int dev_queue_xmit(struct sk_buff *skb)
 4168{
 4169	return __dev_queue_xmit(skb, NULL);
 4170}
 4171EXPORT_SYMBOL(dev_queue_xmit);
 4172
 4173int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 4174{
 4175	return __dev_queue_xmit(skb, sb_dev);
 4176}
 4177EXPORT_SYMBOL(dev_queue_xmit_accel);
 4178
 4179int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 4180{
 4181	struct net_device *dev = skb->dev;
 4182	struct sk_buff *orig_skb = skb;
 4183	struct netdev_queue *txq;
 4184	int ret = NETDEV_TX_BUSY;
 4185	bool again = false;
 4186
 4187	if (unlikely(!netif_running(dev) ||
 4188		     !netif_carrier_ok(dev)))
 4189		goto drop;
 4190
 4191	skb = validate_xmit_skb_list(skb, dev, &again);
 4192	if (skb != orig_skb)
 4193		goto drop;
 4194
 4195	skb_set_queue_mapping(skb, queue_id);
 4196	txq = skb_get_tx_queue(dev, skb);
 4197
 4198	local_bh_disable();
 4199
 4200	dev_xmit_recursion_inc();
 4201	HARD_TX_LOCK(dev, txq, smp_processor_id());
 4202	if (!netif_xmit_frozen_or_drv_stopped(txq))
 4203		ret = netdev_start_xmit(skb, dev, txq, false);
 4204	HARD_TX_UNLOCK(dev, txq);
 4205	dev_xmit_recursion_dec();
 4206
 4207	local_bh_enable();
 4208
 4209	if (!dev_xmit_complete(ret))
 4210		kfree_skb(skb);
 4211
 4212	return ret;
 4213drop:
 4214	atomic_long_inc(&dev->tx_dropped);
 4215	kfree_skb_list(skb);
 4216	return NET_XMIT_DROP;
 4217}
 4218EXPORT_SYMBOL(dev_direct_xmit);
 4219
 4220/*************************************************************************
 4221 *			Receiver routines
 4222 *************************************************************************/
 4223
 4224int netdev_max_backlog __read_mostly = 1000;
 4225EXPORT_SYMBOL(netdev_max_backlog);
 4226
 4227int netdev_tstamp_prequeue __read_mostly = 1;
 4228int netdev_budget __read_mostly = 300;
 4229/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
 4230unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
 4231int weight_p __read_mostly = 64;           /* old backlog weight */
 4232int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 4233int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 4234int dev_rx_weight __read_mostly = 64;
 4235int dev_tx_weight __read_mostly = 64;
 4236/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
 4237int gro_normal_batch __read_mostly = 8;
 4238
 4239/* Called with irq disabled */
 4240static inline void ____napi_schedule(struct softnet_data *sd,
 4241				     struct napi_struct *napi)
 4242{
 4243	list_add_tail(&napi->poll_list, &sd->poll_list);
 4244	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4245}
 4246
 4247#ifdef CONFIG_RPS
 4248
 4249/* One global table that all flow-based protocols share. */
 4250struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 4251EXPORT_SYMBOL(rps_sock_flow_table);
 4252u32 rps_cpu_mask __read_mostly;
 4253EXPORT_SYMBOL(rps_cpu_mask);
 4254
 4255struct static_key_false rps_needed __read_mostly;
 4256EXPORT_SYMBOL(rps_needed);
 4257struct static_key_false rfs_needed __read_mostly;
 4258EXPORT_SYMBOL(rfs_needed);
 4259
 4260static struct rps_dev_flow *
 4261set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4262	    struct rps_dev_flow *rflow, u16 next_cpu)
 4263{
 4264	if (next_cpu < nr_cpu_ids) {
 4265#ifdef CONFIG_RFS_ACCEL
 4266		struct netdev_rx_queue *rxqueue;
 4267		struct rps_dev_flow_table *flow_table;
 4268		struct rps_dev_flow *old_rflow;
 4269		u32 flow_id;
 4270		u16 rxq_index;
 4271		int rc;
 4272
 4273		/* Should we steer this flow to a different hardware queue? */
 4274		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
 4275		    !(dev->features & NETIF_F_NTUPLE))
 4276			goto out;
 4277		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
 4278		if (rxq_index == skb_get_rx_queue(skb))
 4279			goto out;
 4280
 4281		rxqueue = dev->_rx + rxq_index;
 4282		flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4283		if (!flow_table)
 4284			goto out;
 4285		flow_id = skb_get_hash(skb) & flow_table->mask;
 4286		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
 4287							rxq_index, flow_id);
 4288		if (rc < 0)
 4289			goto out;
 4290		old_rflow = rflow;
 4291		rflow = &flow_table->flows[flow_id];
 4292		rflow->filter = rc;
 4293		if (old_rflow->filter == rflow->filter)
 4294			old_rflow->filter = RPS_NO_FILTER;
 4295	out:
 4296#endif
 4297		rflow->last_qtail =
 4298			per_cpu(softnet_data, next_cpu).input_queue_head;
 4299	}
 4300
 4301	rflow->cpu = next_cpu;
 4302	return rflow;
 4303}
 4304
 4305/*
 4306 * get_rps_cpu is called from netif_receive_skb and returns the target
 4307 * CPU from the RPS map of the receiving queue for a given skb.
 4308 * rcu_read_lock must be held on entry.
 4309 */
 4310static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4311		       struct rps_dev_flow **rflowp)
 4312{
 4313	const struct rps_sock_flow_table *sock_flow_table;
 4314	struct netdev_rx_queue *rxqueue = dev->_rx;
 4315	struct rps_dev_flow_table *flow_table;
 4316	struct rps_map *map;
 4317	int cpu = -1;
 4318	u32 tcpu;
 4319	u32 hash;
 4320
 4321	if (skb_rx_queue_recorded(skb)) {
 4322		u16 index = skb_get_rx_queue(skb);
 4323
 4324		if (unlikely(index >= dev->real_num_rx_queues)) {
 4325			WARN_ONCE(dev->real_num_rx_queues > 1,
 4326				  "%s received packet on queue %u, but number "
 4327				  "of RX queues is %u\n",
 4328				  dev->name, index, dev->real_num_rx_queues);
 4329			goto done;
 4330		}
 4331		rxqueue += index;
 4332	}
 4333
 4334	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
 4335
 4336	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4337	map = rcu_dereference(rxqueue->rps_map);
 4338	if (!flow_table && !map)
 4339		goto done;
 4340
 4341	skb_reset_network_header(skb);
 4342	hash = skb_get_hash(skb);
 4343	if (!hash)
 4344		goto done;
 4345
 4346	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 4347	if (flow_table && sock_flow_table) {
 4348		struct rps_dev_flow *rflow;
 4349		u32 next_cpu;
 4350		u32 ident;
 4351
 4352		/* First check into global flow table if there is a match */
 4353		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
 4354		if ((ident ^ hash) & ~rps_cpu_mask)
 4355			goto try_rps;
 4356
 4357		next_cpu = ident & rps_cpu_mask;
 4358
 4359		/* OK, now we know there is a match,
 4360		 * we can look at the local (per receive queue) flow table
 4361		 */
 4362		rflow = &flow_table->flows[hash & flow_table->mask];
 4363		tcpu = rflow->cpu;
 4364
 4365		/*
 4366		 * If the desired CPU (where last recvmsg was done) is
 4367		 * different from current CPU (one in the rx-queue flow
 4368		 * table entry), switch if one of the following holds:
 4369		 *   - Current CPU is unset (>= nr_cpu_ids).
 4370		 *   - Current CPU is offline.
 4371		 *   - The current CPU's queue tail has advanced beyond the
 4372		 *     last packet that was enqueued using this table entry.
 4373		 *     This guarantees that all previous packets for the flow
 4374		 *     have been dequeued, thus preserving in order delivery.
 4375		 */
 4376		if (unlikely(tcpu != next_cpu) &&
 4377		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 4378		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
 4379		      rflow->last_qtail)) >= 0)) {
 4380			tcpu = next_cpu;
 4381			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 4382		}
 4383
 4384		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
 4385			*rflowp = rflow;
 4386			cpu = tcpu;
 4387			goto done;
 4388		}
 4389	}
 4390
 4391try_rps:
 4392
 4393	if (map) {
 4394		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 4395		if (cpu_online(tcpu)) {
 4396			cpu = tcpu;
 4397			goto done;
 4398		}
 4399	}
 4400
 4401done:
 4402	return cpu;
 4403}
 4404
 4405#ifdef CONFIG_RFS_ACCEL
 4406
 4407/**
 4408 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4409 * @dev: Device on which the filter was set
 4410 * @rxq_index: RX queue index
 4411 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4412 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4413 *
 4414 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4415 * this function for each installed filter and remove the filters for
 4416 * which it returns %true.
 4417 */
 4418bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4419			 u32 flow_id, u16 filter_id)
 4420{
 4421	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4422	struct rps_dev_flow_table *flow_table;
 4423	struct rps_dev_flow *rflow;
 4424	bool expire = true;
 4425	unsigned int cpu;
 4426
 4427	rcu_read_lock();
 4428	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4429	if (flow_table && flow_id <= flow_table->mask) {
 4430		rflow = &flow_table->flows[flow_id];
 4431		cpu = READ_ONCE(rflow->cpu);
 4432		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 4433		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 4434			   rflow->last_qtail) <
 4435		     (int)(10 * flow_table->mask)))
 4436			expire = false;
 4437	}
 4438	rcu_read_unlock();
 4439	return expire;
 4440}
 4441EXPORT_SYMBOL(rps_may_expire_flow);
 4442
 4443#endif /* CONFIG_RFS_ACCEL */
 4444
 4445/* Called from hardirq (IPI) context */
 4446static void rps_trigger_softirq(void *data)
 4447{
 4448	struct softnet_data *sd = data;
 4449
 4450	____napi_schedule(sd, &sd->backlog);
 4451	sd->received_rps++;
 4452}
 4453
 4454#endif /* CONFIG_RPS */
 4455
 4456/*
 4457 * Check if this softnet_data structure is another cpu one
 4458 * If yes, queue it to our IPI list and return 1
 4459 * If no, return 0
 4460 */
 4461static int rps_ipi_queued(struct softnet_data *sd)
 4462{
 4463#ifdef CONFIG_RPS
 4464	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 4465
 4466	if (sd != mysd) {
 4467		sd->rps_ipi_next = mysd->rps_ipi_list;
 4468		mysd->rps_ipi_list = sd;
 4469
 4470		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4471		return 1;
 4472	}
 4473#endif /* CONFIG_RPS */
 4474	return 0;
 4475}
 4476
 4477#ifdef CONFIG_NET_FLOW_LIMIT
 4478int netdev_flow_limit_table_len __read_mostly = (1 << 12);
 4479#endif
 4480
 4481static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4482{
 4483#ifdef CONFIG_NET_FLOW_LIMIT
 4484	struct sd_flow_limit *fl;
 4485	struct softnet_data *sd;
 4486	unsigned int old_flow, new_flow;
 4487
 4488	if (qlen < (netdev_max_backlog >> 1))
 4489		return false;
 4490
 4491	sd = this_cpu_ptr(&softnet_data);
 4492
 4493	rcu_read_lock();
 4494	fl = rcu_dereference(sd->flow_limit);
 4495	if (fl) {
 4496		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4497		old_flow = fl->history[fl->history_head];
 4498		fl->history[fl->history_head] = new_flow;
 4499
 4500		fl->history_head++;
 4501		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4502
 4503		if (likely(fl->buckets[old_flow]))
 4504			fl->buckets[old_flow]--;
 4505
 4506		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4507			fl->count++;
 4508			rcu_read_unlock();
 4509			return true;
 4510		}
 4511	}
 4512	rcu_read_unlock();
 4513#endif
 4514	return false;
 4515}
 4516
 4517/*
 4518 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 4519 * queue (may be a remote CPU queue).
 4520 */
 4521static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 4522			      unsigned int *qtail)
 4523{
 4524	struct softnet_data *sd;
 4525	unsigned long flags;
 4526	unsigned int qlen;
 4527
 4528	sd = &per_cpu(softnet_data, cpu);
 4529
 4530	local_irq_save(flags);
 4531
 4532	rps_lock(sd);
 4533	if (!netif_running(skb->dev))
 4534		goto drop;
 4535	qlen = skb_queue_len(&sd->input_pkt_queue);
 4536	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 4537		if (qlen) {
 4538enqueue:
 4539			__skb_queue_tail(&sd->input_pkt_queue, skb);
 4540			input_queue_tail_incr_save(sd, qtail);
 4541			rps_unlock(sd);
 4542			local_irq_restore(flags);
 4543			return NET_RX_SUCCESS;
 4544		}
 4545
 4546		/* Schedule NAPI for backlog device
 4547		 * We can use non atomic operation since we own the queue lock
 4548		 */
 4549		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
 4550			if (!rps_ipi_queued(sd))
 4551				____napi_schedule(sd, &sd->backlog);
 4552		}
 4553		goto enqueue;
 4554	}
 4555
 4556drop:
 4557	sd->dropped++;
 4558	rps_unlock(sd);
 4559
 4560	local_irq_restore(flags);
 4561
 4562	atomic_long_inc(&skb->dev->rx_dropped);
 4563	kfree_skb(skb);
 4564	return NET_RX_DROP;
 4565}
 4566
 4567static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4568{
 4569	struct net_device *dev = skb->dev;
 4570	struct netdev_rx_queue *rxqueue;
 4571
 4572	rxqueue = dev->_rx;
 4573
 4574	if (skb_rx_queue_recorded(skb)) {
 4575		u16 index = skb_get_rx_queue(skb);
 4576
 4577		if (unlikely(index >= dev->real_num_rx_queues)) {
 4578			WARN_ONCE(dev->real_num_rx_queues > 1,
 4579				  "%s received packet on queue %u, but number "
 4580				  "of RX queues is %u\n",
 4581				  dev->name, index, dev->real_num_rx_queues);
 4582
 4583			return rxqueue; /* Return first rxqueue */
 4584		}
 4585		rxqueue += index;
 4586	}
 4587	return rxqueue;
 4588}
 4589
 4590static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 4591				     struct xdp_buff *xdp,
 4592				     struct bpf_prog *xdp_prog)
 4593{
 4594	struct netdev_rx_queue *rxqueue;
 4595	void *orig_data, *orig_data_end;
 4596	u32 metalen, act = XDP_DROP;
 4597	__be16 orig_eth_type;
 4598	struct ethhdr *eth;
 4599	bool orig_bcast;
 4600	int hlen, off;
 4601	u32 mac_len;
 4602
 4603	/* Reinjected packets coming from act_mirred or similar should
 4604	 * not get XDP generic processing.
 4605	 */
 4606	if (skb_is_redirected(skb))
 4607		return XDP_PASS;
 4608
 4609	/* XDP packets must be linear and must have sufficient headroom
 4610	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
 4611	 * native XDP provides, thus we need to do it here as well.
 4612	 */
 4613	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 4614	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 4615		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 4616		int troom = skb->tail + skb->data_len - skb->end;
 4617
 4618		/* In case we have to go down the path and also linearize,
 4619		 * then lets do the pskb_expand_head() work just once here.
 4620		 */
 4621		if (pskb_expand_head(skb,
 4622				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 4623				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
 4624			goto do_drop;
 4625		if (skb_linearize(skb))
 4626			goto do_drop;
 4627	}
 4628
 4629	/* The XDP program wants to see the packet starting at the MAC
 4630	 * header.
 4631	 */
 4632	mac_len = skb->data - skb_mac_header(skb);
 4633	hlen = skb_headlen(skb) + mac_len;
 4634	xdp->data = skb->data - mac_len;
 4635	xdp->data_meta = xdp->data;
 4636	xdp->data_end = xdp->data + hlen;
 4637	xdp->data_hard_start = skb->data - skb_headroom(skb);
 4638
 4639	/* SKB "head" area always have tailroom for skb_shared_info */
 4640	xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
 4641	xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 4642
 4643	orig_data_end = xdp->data_end;
 4644	orig_data = xdp->data;
 4645	eth = (struct ethhdr *)xdp->data;
 4646	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
 4647	orig_eth_type = eth->h_proto;
 4648
 4649	rxqueue = netif_get_rxqueue(skb);
 4650	xdp->rxq = &rxqueue->xdp_rxq;
 4651
 4652	act = bpf_prog_run_xdp(xdp_prog, xdp);
 4653
 4654	/* check if bpf_xdp_adjust_head was used */
 4655	off = xdp->data - orig_data;
 4656	if (off) {
 4657		if (off > 0)
 4658			__skb_pull(skb, off);
 4659		else if (off < 0)
 4660			__skb_push(skb, -off);
 4661
 4662		skb->mac_header += off;
 4663		skb_reset_network_header(skb);
 4664	}
 4665
 4666	/* check if bpf_xdp_adjust_tail was used */
 4667	off = xdp->data_end - orig_data_end;
 4668	if (off != 0) {
 4669		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
 4670		skb->len += off; /* positive on grow, negative on shrink */
 4671	}
 4672
 4673	/* check if XDP changed eth hdr such SKB needs update */
 4674	eth = (struct ethhdr *)xdp->data;
 4675	if ((orig_eth_type != eth->h_proto) ||
 4676	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
 4677		__skb_push(skb, ETH_HLEN);
 4678		skb->protocol = eth_type_trans(skb, skb->dev);
 4679	}
 4680
 4681	switch (act) {
 4682	case XDP_REDIRECT:
 4683	case XDP_TX:
 4684		__skb_push(skb, mac_len);
 4685		break;
 4686	case XDP_PASS:
 4687		metalen = xdp->data - xdp->data_meta;
 4688		if (metalen)
 4689			skb_metadata_set(skb, metalen);
 4690		break;
 4691	default:
 4692		bpf_warn_invalid_xdp_action(act);
 4693		fallthrough;
 4694	case XDP_ABORTED:
 4695		trace_xdp_exception(skb->dev, xdp_prog, act);
 4696		fallthrough;
 4697	case XDP_DROP:
 4698	do_drop:
 4699		kfree_skb(skb);
 4700		break;
 4701	}
 4702
 4703	return act;
 4704}
 4705
 4706/* When doing generic XDP we have to bypass the qdisc layer and the
 4707 * network taps in order to match in-driver-XDP behavior.
 4708 */
 4709void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 4710{
 4711	struct net_device *dev = skb->dev;
 4712	struct netdev_queue *txq;
 4713	bool free_skb = true;
 4714	int cpu, rc;
 4715
 4716	txq = netdev_core_pick_tx(dev, skb, NULL);
 4717	cpu = smp_processor_id();
 4718	HARD_TX_LOCK(dev, txq, cpu);
 4719	if (!netif_xmit_stopped(txq)) {
 4720		rc = netdev_start_xmit(skb, dev, txq, 0);
 4721		if (dev_xmit_complete(rc))
 4722			free_skb = false;
 4723	}
 4724	HARD_TX_UNLOCK(dev, txq);
 4725	if (free_skb) {
 4726		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 4727		kfree_skb(skb);
 4728	}
 4729}
 4730
 4731static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 4732
 4733int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 4734{
 4735	if (xdp_prog) {
 4736		struct xdp_buff xdp;
 4737		u32 act;
 4738		int err;
 4739
 4740		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 4741		if (act != XDP_PASS) {
 4742			switch (act) {
 4743			case XDP_REDIRECT:
 4744				err = xdp_do_generic_redirect(skb->dev, skb,
 4745							      &xdp, xdp_prog);
 4746				if (err)
 4747					goto out_redir;
 4748				break;
 4749			case XDP_TX:
 4750				generic_xdp_tx(skb, xdp_prog);
 4751				break;
 4752			}
 4753			return XDP_DROP;
 4754		}
 4755	}
 4756	return XDP_PASS;
 4757out_redir:
 4758	kfree_skb(skb);
 4759	return XDP_DROP;
 4760}
 4761EXPORT_SYMBOL_GPL(do_xdp_generic);
 4762
 4763static int netif_rx_internal(struct sk_buff *skb)
 4764{
 4765	int ret;
 4766
 4767	net_timestamp_check(netdev_tstamp_prequeue, skb);
 4768
 4769	trace_netif_rx(skb);
 4770
 4771#ifdef CONFIG_RPS
 4772	if (static_branch_unlikely(&rps_needed)) {
 4773		struct rps_dev_flow voidflow, *rflow = &voidflow;
 4774		int cpu;
 4775
 4776		preempt_disable();
 4777		rcu_read_lock();
 4778
 4779		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 4780		if (cpu < 0)
 4781			cpu = smp_processor_id();
 4782
 4783		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 4784
 4785		rcu_read_unlock();
 4786		preempt_enable();
 4787	} else
 4788#endif
 4789	{
 4790		unsigned int qtail;
 4791
 4792		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
 4793		put_cpu();
 4794	}
 4795	return ret;
 4796}
 4797
 4798/**
 4799 *	netif_rx	-	post buffer to the network code
 4800 *	@skb: buffer to post
 4801 *
 4802 *	This function receives a packet from a device driver and queues it for
 4803 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 4804 *	may be dropped during processing for congestion control or by the
 4805 *	protocol layers.
 4806 *
 4807 *	return values:
 4808 *	NET_RX_SUCCESS	(no congestion)
 4809 *	NET_RX_DROP     (packet was dropped)
 4810 *
 4811 */
 4812
 4813int netif_rx(struct sk_buff *skb)
 4814{
 4815	int ret;
 4816
 4817	trace_netif_rx_entry(skb);
 4818
 4819	ret = netif_rx_internal(skb);
 4820	trace_netif_rx_exit(ret);
 4821
 4822	return ret;
 4823}
 4824EXPORT_SYMBOL(netif_rx);
 4825
 4826int netif_rx_ni(struct sk_buff *skb)
 4827{
 4828	int err;
 4829
 4830	trace_netif_rx_ni_entry(skb);
 4831
 4832	preempt_disable();
 4833	err = netif_rx_internal(skb);
 4834	if (local_softirq_pending())
 4835		do_softirq();
 4836	preempt_enable();
 4837	trace_netif_rx_ni_exit(err);
 4838
 4839	return err;
 4840}
 4841EXPORT_SYMBOL(netif_rx_ni);
 4842
 4843static __latent_entropy void net_tx_action(struct softirq_action *h)
 4844{
 4845	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 4846
 4847	if (sd->completion_queue) {
 4848		struct sk_buff *clist;
 4849
 4850		local_irq_disable();
 4851		clist = sd->completion_queue;
 4852		sd->completion_queue = NULL;
 4853		local_irq_enable();
 4854
 4855		while (clist) {
 4856			struct sk_buff *skb = clist;
 4857
 4858			clist = clist->next;
 4859
 4860			WARN_ON(refcount_read(&skb->users));
 4861			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
 4862				trace_consume_skb(skb);
 4863			else
 4864				trace_kfree_skb(skb, net_tx_action);
 4865
 4866			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 4867				__kfree_skb(skb);
 4868			else
 4869				__kfree_skb_defer(skb);
 4870		}
 4871
 4872		__kfree_skb_flush();
 4873	}
 4874
 4875	if (sd->output_queue) {
 4876		struct Qdisc *head;
 4877
 4878		local_irq_disable();
 4879		head = sd->output_queue;
 4880		sd->output_queue = NULL;
 4881		sd->output_queue_tailp = &sd->output_queue;
 4882		local_irq_enable();
 4883
 4884		while (head) {
 4885			struct Qdisc *q = head;
 4886			spinlock_t *root_lock = NULL;
 4887
 4888			head = head->next_sched;
 4889
 4890			if (!(q->flags & TCQ_F_NOLOCK)) {
 4891				root_lock = qdisc_lock(q);
 4892				spin_lock(root_lock);
 4893			}
 4894			/* We need to make sure head->next_sched is read
 4895			 * before clearing __QDISC_STATE_SCHED
 4896			 */
 4897			smp_mb__before_atomic();
 4898			clear_bit(__QDISC_STATE_SCHED, &q->state);
 4899			qdisc_run(q);
 4900			if (root_lock)
 4901				spin_unlock(root_lock);
 4902		}
 4903	}
 4904
 4905	xfrm_dev_backlog(sd);
 4906}
 4907
 4908#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
 4909/* This hook is defined here for ATM LANE */
 4910int (*br_fdb_test_addr_hook)(struct net_device *dev,
 4911			     unsigned char *addr) __read_mostly;
 4912EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 4913#endif
 4914
 4915static inline struct sk_buff *
 4916sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 4917		   struct net_device *orig_dev)
 4918{
 4919#ifdef CONFIG_NET_CLS_ACT
 4920	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
 4921	struct tcf_result cl_res;
 4922
 4923	/* If there's at least one ingress present somewhere (so
 4924	 * we get here via enabled static key), remaining devices
 4925	 * that are not configured with an ingress qdisc will bail
 4926	 * out here.
 4927	 */
 4928	if (!miniq)
 4929		return skb;
 4930
 4931	if (*pt_prev) {
 4932		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 4933		*pt_prev = NULL;
 4934	}
 4935
 4936	qdisc_skb_cb(skb)->pkt_len = skb->len;
 4937	skb->tc_at_ingress = 1;
 4938	mini_qdisc_bstats_cpu_update(miniq, skb);
 4939
 4940	switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
 4941				     &cl_res, false)) {
 4942	case TC_ACT_OK:
 4943	case TC_ACT_RECLASSIFY:
 4944		skb->tc_index = TC_H_MIN(cl_res.classid);
 4945		break;
 4946	case TC_ACT_SHOT:
 4947		mini_qdisc_qstats_cpu_drop(miniq);
 4948		kfree_skb(skb);
 4949		return NULL;
 4950	case TC_ACT_STOLEN:
 4951	case TC_ACT_QUEUED:
 4952	case TC_ACT_TRAP:
 4953		consume_skb(skb);
 4954		return NULL;
 4955	case TC_ACT_REDIRECT:
 4956		/* skb_mac_header check was done by cls/act_bpf, so
 4957		 * we can safely push the L2 header back before
 4958		 * redirecting to another netdev
 4959		 */
 4960		__skb_push(skb, skb->mac_len);
 4961		skb_do_redirect(skb);
 4962		return NULL;
 4963	case TC_ACT_CONSUMED:
 4964		return NULL;
 4965	default:
 4966		break;
 4967	}
 4968#endif /* CONFIG_NET_CLS_ACT */
 4969	return skb;
 4970}
 4971
 4972/**
 4973 *	netdev_is_rx_handler_busy - check if receive handler is registered
 4974 *	@dev: device to check
 4975 *
 4976 *	Check if a receive handler is already registered for a given device.
 4977 *	Return true if there one.
 4978 *
 4979 *	The caller must hold the rtnl_mutex.
 4980 */
 4981bool netdev_is_rx_handler_busy(struct net_device *dev)
 4982{
 4983	ASSERT_RTNL();
 4984	return dev && rtnl_dereference(dev->rx_handler);
 4985}
 4986EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 4987
 4988/**
 4989 *	netdev_rx_handler_register - register receive handler
 4990 *	@dev: device to register a handler for
 4991 *	@rx_handler: receive handler to register
 4992 *	@rx_handler_data: data pointer that is used by rx handler
 4993 *
 4994 *	Register a receive handler for a device. This handler will then be
 4995 *	called from __netif_receive_skb. A negative errno code is returned
 4996 *	on a failure.
 4997 *
 4998 *	The caller must hold the rtnl_mutex.
 4999 *
 5000 *	For a general description of rx_handler, see enum rx_handler_result.
 5001 */
 5002int netdev_rx_handler_register(struct net_device *dev,
 5003			       rx_handler_func_t *rx_handler,
 5004			       void *rx_handler_data)
 5005{
 5006	if (netdev_is_rx_handler_busy(dev))
 5007		return -EBUSY;
 5008
 5009	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 5010		return -EINVAL;
 5011
 5012	/* Note: rx_handler_data must be set before rx_handler */
 5013	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 5014	rcu_assign_pointer(dev->rx_handler, rx_handler);
 5015
 5016	return 0;
 5017}
 5018EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
 5019
 5020/**
 5021 *	netdev_rx_handler_unregister - unregister receive handler
 5022 *	@dev: device to unregister a handler from
 5023 *
 5024 *	Unregister a receive handler from a device.
 5025 *
 5026 *	The caller must hold the rtnl_mutex.
 5027 */
 5028void netdev_rx_handler_unregister(struct net_device *dev)
 5029{
 5030
 5031	ASSERT_RTNL();
 5032	RCU_INIT_POINTER(dev->rx_handler, NULL);
 5033	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
 5034	 * section has a guarantee to see a non NULL rx_handler_data
 5035	 * as well.
 5036	 */
 5037	synchronize_net();
 5038	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 5039}
 5040EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 5041
 5042/*
 5043 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 5044 * the special handling of PFMEMALLOC skbs.
 5045 */
 5046static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 5047{
 5048	switch (skb->protocol) {
 5049	case htons(ETH_P_ARP):
 5050	case htons(ETH_P_IP):
 5051	case htons(ETH_P_IPV6):
 5052	case htons(ETH_P_8021Q):
 5053	case htons(ETH_P_8021AD):
 5054		return true;
 5055	default:
 5056		return false;
 5057	}
 5058}
 5059
 5060static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 5061			     int *ret, struct net_device *orig_dev)
 5062{
 5063	if (nf_hook_ingress_active(skb)) {
 5064		int ingress_retval;
 5065
 5066		if (*pt_prev) {
 5067			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 5068			*pt_prev = NULL;
 5069		}
 5070
 5071		rcu_read_lock();
 5072		ingress_retval = nf_hook_ingress(skb);
 5073		rcu_read_unlock();
 5074		return ingress_retval;
 5075	}
 5076	return 0;
 5077}
 5078
 5079static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 5080				    struct packet_type **ppt_prev)
 5081{
 5082	struct packet_type *ptype, *pt_prev;
 5083	rx_handler_func_t *rx_handler;
 5084	struct sk_buff *skb = *pskb;
 5085	struct net_device *orig_dev;
 5086	bool deliver_exact = false;
 5087	int ret = NET_RX_DROP;
 5088	__be16 type;
 5089
 5090	net_timestamp_check(!netdev_tstamp_prequeue, skb);
 5091
 5092	trace_netif_receive_skb(skb);
 5093
 5094	orig_dev = skb->dev;
 5095
 5096	skb_reset_network_header(skb);
 5097	if (!skb_transport_header_was_set(skb))
 5098		skb_reset_transport_header(skb);
 5099	skb_reset_mac_len(skb);
 5100
 5101	pt_prev = NULL;
 5102
 5103another_round:
 5104	skb->skb_iif = skb->dev->ifindex;
 5105
 5106	__this_cpu_inc(softnet_data.processed);
 5107
 5108	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 5109		int ret2;
 5110
 5111		preempt_disable();
 5112		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 5113		preempt_enable();
 5114
 5115		if (ret2 != XDP_PASS) {
 5116			ret = NET_RX_DROP;
 5117			goto out;
 5118		}
 5119		skb_reset_mac_len(skb);
 5120	}
 5121
 5122	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5123	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5124		skb = skb_vlan_untag(skb);
 5125		if (unlikely(!skb))
 5126			goto out;
 5127	}
 5128
 5129	if (skb_skip_tc_classify(skb))
 5130		goto skip_classify;
 5131
 5132	if (pfmemalloc)
 5133		goto skip_taps;
 5134
 5135	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 5136		if (pt_prev)
 5137			ret = deliver_skb(skb, pt_prev, orig_dev);
 5138		pt_prev = ptype;
 5139	}
 5140
 5141	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
 5142		if (pt_prev)
 5143			ret = deliver_skb(skb, pt_prev, orig_dev);
 5144		pt_prev = ptype;
 5145	}
 5146
 5147skip_taps:
 5148#ifdef CONFIG_NET_INGRESS
 5149	if (static_branch_unlikely(&ingress_needed_key)) {
 5150		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
 5151		if (!skb)
 5152			goto out;
 5153
 5154		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
 5155			goto out;
 5156	}
 5157#endif
 5158	skb_reset_redirect(skb);
 5159skip_classify:
 5160	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 5161		goto drop;
 5162
 5163	if (skb_vlan_tag_present(skb)) {
 5164		if (pt_prev) {
 5165			ret = deliver_skb(skb, pt_prev, orig_dev);
 5166			pt_prev = NULL;
 5167		}
 5168		if (vlan_do_receive(&skb))
 5169			goto another_round;
 5170		else if (unlikely(!skb))
 5171			goto out;
 5172	}
 5173
 5174	rx_handler = rcu_dereference(skb->dev->rx_handler);
 5175	if (rx_handler) {
 5176		if (pt_prev) {
 5177			ret = deliver_skb(skb, pt_prev, orig_dev);
 5178			pt_prev = NULL;
 5179		}
 5180		switch (rx_handler(&skb)) {
 5181		case RX_HANDLER_CONSUMED:
 5182			ret = NET_RX_SUCCESS;
 5183			goto out;
 5184		case RX_HANDLER_ANOTHER:
 5185			goto another_round;
 5186		case RX_HANDLER_EXACT:
 5187			deliver_exact = true;
 5188		case RX_HANDLER_PASS:
 5189			break;
 5190		default:
 5191			BUG();
 5192		}
 5193	}
 5194
 5195	if (unlikely(skb_vlan_tag_present(skb))) {
 5196check_vlan_id:
 5197		if (skb_vlan_tag_get_id(skb)) {
 5198			/* Vlan id is non 0 and vlan_do_receive() above couldn't
 5199			 * find vlan device.
 5200			 */
 5201			skb->pkt_type = PACKET_OTHERHOST;
 5202		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5203			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5204			/* Outer header is 802.1P with vlan 0, inner header is
 5205			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 5206			 * not find vlan dev for vlan id 0.
 5207			 */
 5208			__vlan_hwaccel_clear_tag(skb);
 5209			skb = skb_vlan_untag(skb);
 5210			if (unlikely(!skb))
 5211				goto out;
 5212			if (vlan_do_receive(&skb))
 5213				/* After stripping off 802.1P header with vlan 0
 5214				 * vlan dev is found for inner header.
 5215				 */
 5216				goto another_round;
 5217			else if (unlikely(!skb))
 5218				goto out;
 5219			else
 5220				/* We have stripped outer 802.1P vlan 0 header.
 5221				 * But could not find vlan dev.
 5222				 * check again for vlan id to set OTHERHOST.
 5223				 */
 5224				goto check_vlan_id;
 5225		}
 5226		/* Note: we might in the future use prio bits
 5227		 * and set skb->priority like in vlan_do_receive()
 5228		 * For the time being, just ignore Priority Code Point
 5229		 */
 5230		__vlan_hwaccel_clear_tag(skb);
 5231	}
 5232
 5233	type = skb->protocol;
 5234
 5235	/* deliver only exact match when indicated */
 5236	if (likely(!deliver_exact)) {
 5237		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5238				       &ptype_base[ntohs(type) &
 5239						   PTYPE_HASH_MASK]);
 5240	}
 5241
 5242	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5243			       &orig_dev->ptype_specific);
 5244
 5245	if (unlikely(skb->dev != orig_dev)) {
 5246		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5247				       &skb->dev->ptype_specific);
 5248	}
 5249
 5250	if (pt_prev) {
 5251		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 5252			goto drop;
 5253		*ppt_prev = pt_prev;
 5254	} else {
 5255drop:
 5256		if (!deliver_exact)
 5257			atomic_long_inc(&skb->dev->rx_dropped);
 5258		else
 5259			atomic_long_inc(&skb->dev->rx_nohandler);
 5260		kfree_skb(skb);
 5261		/* Jamal, now you will not able to escape explaining
 5262		 * me how you were going to use this. :-)
 5263		 */
 5264		ret = NET_RX_DROP;
 5265	}
 5266
 5267out:
 5268	/* The invariant here is that if *ppt_prev is not NULL
 5269	 * then skb should also be non-NULL.
 5270	 *
 5271	 * Apparently *ppt_prev assignment above holds this invariant due to
 5272	 * skb dereferencing near it.
 5273	 */
 5274	*pskb = skb;
 5275	return ret;
 5276}
 5277
 5278static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 5279{
 5280	struct net_device *orig_dev = skb->dev;
 5281	struct packet_type *pt_prev = NULL;
 5282	int ret;
 5283
 5284	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5285	if (pt_prev)
 5286		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 5287					 skb->dev, pt_prev, orig_dev);
 5288	return ret;
 5289}
 5290
 5291/**
 5292 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 5293 *	@skb: buffer to process
 5294 *
 5295 *	More direct receive version of netif_receive_skb().  It should
 5296 *	only be used by callers that have a need to skip RPS and Generic XDP.
 5297 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 5298 *
 5299 *	This function may only be called from softirq context and interrupts
 5300 *	should be enabled.
 5301 *
 5302 *	Return values (usually ignored):
 5303 *	NET_RX_SUCCESS: no congestion
 5304 *	NET_RX_DROP: packet was dropped
 5305 */
 5306int netif_receive_skb_core(struct sk_buff *skb)
 5307{
 5308	int ret;
 5309
 5310	rcu_read_lock();
 5311	ret = __netif_receive_skb_one_core(skb, false);
 5312	rcu_read_unlock();
 5313
 5314	return ret;
 5315}
 5316EXPORT_SYMBOL(netif_receive_skb_core);
 5317
 5318static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 5319						  struct packet_type *pt_prev,
 5320						  struct net_device *orig_dev)
 5321{
 5322	struct sk_buff *skb, *next;
 5323
 5324	if (!pt_prev)
 5325		return;
 5326	if (list_empty(head))
 5327		return;
 5328	if (pt_prev->list_func != NULL)
 5329		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 5330				   ip_list_rcv, head, pt_prev, orig_dev);
 5331	else
 5332		list_for_each_entry_safe(skb, next, head, list) {
 5333			skb_list_del_init(skb);
 5334			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 5335		}
 5336}
 5337
 5338static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 5339{
 5340	/* Fast-path assumptions:
 5341	 * - There is no RX handler.
 5342	 * - Only one packet_type matches.
 5343	 * If either of these fails, we will end up doing some per-packet
 5344	 * processing in-line, then handling the 'last ptype' for the whole
 5345	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 5346	 * because the 'last ptype' must be constant across the sublist, and all
 5347	 * other ptypes are handled per-packet.
 5348	 */
 5349	/* Current (common) ptype of sublist */
 5350	struct packet_type *pt_curr = NULL;
 5351	/* Current (common) orig_dev of sublist */
 5352	struct net_device *od_curr = NULL;
 5353	struct list_head sublist;
 5354	struct sk_buff *skb, *next;
 5355
 5356	INIT_LIST_HEAD(&sublist);
 5357	list_for_each_entry_safe(skb, next, head, list) {
 5358		struct net_device *orig_dev = skb->dev;
 5359		struct packet_type *pt_prev = NULL;
 5360
 5361		skb_list_del_init(skb);
 5362		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5363		if (!pt_prev)
 5364			continue;
 5365		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5366			/* dispatch old sublist */
 5367			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5368			/* start new sublist */
 5369			INIT_LIST_HEAD(&sublist);
 5370			pt_curr = pt_prev;
 5371			od_curr = orig_dev;
 5372		}
 5373		list_add_tail(&skb->list, &sublist);
 5374	}
 5375
 5376	/* dispatch final sublist */
 5377	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5378}
 5379
 5380static int __netif_receive_skb(struct sk_buff *skb)
 5381{
 5382	int ret;
 5383
 5384	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5385		unsigned int noreclaim_flag;
 5386
 5387		/*
 5388		 * PFMEMALLOC skbs are special, they should
 5389		 * - be delivered to SOCK_MEMALLOC sockets only
 5390		 * - stay away from userspace
 5391		 * - have bounded memory usage
 5392		 *
 5393		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5394		 * context down to all allocation sites.
 5395		 */
 5396		noreclaim_flag = memalloc_noreclaim_save();
 5397		ret = __netif_receive_skb_one_core(skb, true);
 5398		memalloc_noreclaim_restore(noreclaim_flag);
 5399	} else
 5400		ret = __netif_receive_skb_one_core(skb, false);
 5401
 5402	return ret;
 5403}
 5404
 5405static void __netif_receive_skb_list(struct list_head *head)
 5406{
 5407	unsigned long noreclaim_flag = 0;
 5408	struct sk_buff *skb, *next;
 5409	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
 5410
 5411	list_for_each_entry_safe(skb, next, head, list) {
 5412		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
 5413			struct list_head sublist;
 5414
 5415			/* Handle the previous sublist */
 5416			list_cut_before(&sublist, head, &skb->list);
 5417			if (!list_empty(&sublist))
 5418				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 5419			pfmemalloc = !pfmemalloc;
 5420			/* See comments in __netif_receive_skb */
 5421			if (pfmemalloc)
 5422				noreclaim_flag = memalloc_noreclaim_save();
 5423			else
 5424				memalloc_noreclaim_restore(noreclaim_flag);
 5425		}
 5426	}
 5427	/* Handle the remaining sublist */
 5428	if (!list_empty(head))
 5429		__netif_receive_skb_list_core(head, pfmemalloc);
 5430	/* Restore pflags */
 5431	if (pfmemalloc)
 5432		memalloc_noreclaim_restore(noreclaim_flag);
 5433}
 5434
 5435static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5436{
 5437	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5438	struct bpf_prog *new = xdp->prog;
 5439	int ret = 0;
 5440
 5441	if (new) {
 5442		u32 i;
 5443
 5444		/* generic XDP does not work with DEVMAPs that can
 5445		 * have a bpf_prog installed on an entry
 5446		 */
 5447		for (i = 0; i < new->aux->used_map_cnt; i++) {
 5448			if (dev_map_can_have_prog(new->aux->used_maps[i]))
 5449				return -EINVAL;
 5450			if (cpu_map_prog_allowed(new->aux->used_maps[i]))
 5451				return -EINVAL;
 5452		}
 5453	}
 5454
 5455	switch (xdp->command) {
 5456	case XDP_SETUP_PROG:
 5457		rcu_assign_pointer(dev->xdp_prog, new);
 5458		if (old)
 5459			bpf_prog_put(old);
 5460
 5461		if (old && !new) {
 5462			static_branch_dec(&generic_xdp_needed_key);
 5463		} else if (new && !old) {
 5464			static_branch_inc(&generic_xdp_needed_key);
 5465			dev_disable_lro(dev);
 5466			dev_disable_gro_hw(dev);
 5467		}
 5468		break;
 5469
 5470	default:
 5471		ret = -EINVAL;
 5472		break;
 5473	}
 5474
 5475	return ret;
 5476}
 5477
 5478static int netif_receive_skb_internal(struct sk_buff *skb)
 5479{
 5480	int ret;
 5481
 5482	net_timestamp_check(netdev_tstamp_prequeue, skb);
 5483
 5484	if (skb_defer_rx_timestamp(skb))
 5485		return NET_RX_SUCCESS;
 5486
 5487	rcu_read_lock();
 5488#ifdef CONFIG_RPS
 5489	if (static_branch_unlikely(&rps_needed)) {
 5490		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5491		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5492
 5493		if (cpu >= 0) {
 5494			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5495			rcu_read_unlock();
 5496			return ret;
 5497		}
 5498	}
 5499#endif
 5500	ret = __netif_receive_skb(skb);
 5501	rcu_read_unlock();
 5502	return ret;
 5503}
 5504
 5505static void netif_receive_skb_list_internal(struct list_head *head)
 5506{
 5507	struct sk_buff *skb, *next;
 5508	struct list_head sublist;
 5509
 5510	INIT_LIST_HEAD(&sublist);
 5511	list_for_each_entry_safe(skb, next, head, list) {
 5512		net_timestamp_check(netdev_tstamp_prequeue, skb);
 5513		skb_list_del_init(skb);
 5514		if (!skb_defer_rx_timestamp(skb))
 5515			list_add_tail(&skb->list, &sublist);
 5516	}
 5517	list_splice_init(&sublist, head);
 5518
 5519	rcu_read_lock();
 5520#ifdef CONFIG_RPS
 5521	if (static_branch_unlikely(&rps_needed)) {
 5522		list_for_each_entry_safe(skb, next, head, list) {
 5523			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5524			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5525
 5526			if (cpu >= 0) {
 5527				/* Will be handled, remove from list */
 5528				skb_list_del_init(skb);
 5529				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5530			}
 5531		}
 5532	}
 5533#endif
 5534	__netif_receive_skb_list(head);
 5535	rcu_read_unlock();
 5536}
 5537
 5538/**
 5539 *	netif_receive_skb - process receive buffer from network
 5540 *	@skb: buffer to process
 5541 *
 5542 *	netif_receive_skb() is the main receive data processing function.
 5543 *	It always succeeds. The buffer may be dropped during processing
 5544 *	for congestion control or by the protocol layers.
 5545 *
 5546 *	This function may only be called from softirq context and interrupts
 5547 *	should be enabled.
 5548 *
 5549 *	Return values (usually ignored):
 5550 *	NET_RX_SUCCESS: no congestion
 5551 *	NET_RX_DROP: packet was dropped
 5552 */
 5553int netif_receive_skb(struct sk_buff *skb)
 5554{
 5555	int ret;
 5556
 5557	trace_netif_receive_skb_entry(skb);
 5558
 5559	ret = netif_receive_skb_internal(skb);
 5560	trace_netif_receive_skb_exit(ret);
 5561
 5562	return ret;
 5563}
 5564EXPORT_SYMBOL(netif_receive_skb);
 5565
 5566/**
 5567 *	netif_receive_skb_list - process many receive buffers from network
 5568 *	@head: list of skbs to process.
 5569 *
 5570 *	Since return value of netif_receive_skb() is normally ignored, and
 5571 *	wouldn't be meaningful for a list, this function returns void.
 5572 *
 5573 *	This function may only be called from softirq context and interrupts
 5574 *	should be enabled.
 5575 */
 5576void netif_receive_skb_list(struct list_head *head)
 5577{
 5578	struct sk_buff *skb;
 5579
 5580	if (list_empty(head))
 5581		return;
 5582	if (trace_netif_receive_skb_list_entry_enabled()) {
 5583		list_for_each_entry(skb, head, list)
 5584			trace_netif_receive_skb_list_entry(skb);
 5585	}
 5586	netif_receive_skb_list_internal(head);
 5587	trace_netif_receive_skb_list_exit(0);
 5588}
 5589EXPORT_SYMBOL(netif_receive_skb_list);
 5590
 5591static DEFINE_PER_CPU(struct work_struct, flush_works);
 5592
 5593/* Network device is going away, flush any packets still pending */
 5594static void flush_backlog(struct work_struct *work)
 5595{
 5596	struct sk_buff *skb, *tmp;
 5597	struct softnet_data *sd;
 5598
 5599	local_bh_disable();
 5600	sd = this_cpu_ptr(&softnet_data);
 5601
 5602	local_irq_disable();
 5603	rps_lock(sd);
 5604	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 5605		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5606			__skb_unlink(skb, &sd->input_pkt_queue);
 5607			dev_kfree_skb_irq(skb);
 5608			input_queue_head_incr(sd);
 5609		}
 5610	}
 5611	rps_unlock(sd);
 5612	local_irq_enable();
 5613
 5614	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 5615		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5616			__skb_unlink(skb, &sd->process_queue);
 5617			kfree_skb(skb);
 5618			input_queue_head_incr(sd);
 5619		}
 5620	}
 5621	local_bh_enable();
 5622}
 5623
 5624static void flush_all_backlogs(void)
 5625{
 5626	unsigned int cpu;
 5627
 5628	get_online_cpus();
 5629
 5630	for_each_online_cpu(cpu)
 5631		queue_work_on(cpu, system_highpri_wq,
 5632			      per_cpu_ptr(&flush_works, cpu));
 5633
 5634	for_each_online_cpu(cpu)
 5635		flush_work(per_cpu_ptr(&flush_works, cpu));
 5636
 5637	put_online_cpus();
 5638}
 5639
 5640/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
 5641static void gro_normal_list(struct napi_struct *napi)
 5642{
 5643	if (!napi->rx_count)
 5644		return;
 5645	netif_receive_skb_list_internal(&napi->rx_list);
 5646	INIT_LIST_HEAD(&napi->rx_list);
 5647	napi->rx_count = 0;
 5648}
 5649
 5650/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
 5651 * pass the whole batch up to the stack.
 5652 */
 5653static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
 5654{
 5655	list_add_tail(&skb->list, &napi->rx_list);
 5656	if (++napi->rx_count >= gro_normal_batch)
 5657		gro_normal_list(napi);
 5658}
 5659
 5660INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
 5661INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
 5662static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
 5663{
 5664	struct packet_offload *ptype;
 5665	__be16 type = skb->protocol;
 5666	struct list_head *head = &offload_base;
 5667	int err = -ENOENT;
 5668
 5669	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
 5670
 5671	if (NAPI_GRO_CB(skb)->count == 1) {
 5672		skb_shinfo(skb)->gso_size = 0;
 5673		goto out;
 5674	}
 5675
 5676	rcu_read_lock();
 5677	list_for_each_entry_rcu(ptype, head, list) {
 5678		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5679			continue;
 5680
 5681		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
 5682					 ipv6_gro_complete, inet_gro_complete,
 5683					 skb, 0);
 5684		break;
 5685	}
 5686	rcu_read_unlock();
 5687
 5688	if (err) {
 5689		WARN_ON(&ptype->list == head);
 5690		kfree_skb(skb);
 5691		return NET_RX_SUCCESS;
 5692	}
 5693
 5694out:
 5695	gro_normal_one(napi, skb);
 5696	return NET_RX_SUCCESS;
 5697}
 5698
 5699static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 5700				   bool flush_old)
 5701{
 5702	struct list_head *head = &napi->gro_hash[index].list;
 5703	struct sk_buff *skb, *p;
 5704
 5705	list_for_each_entry_safe_reverse(skb, p, head, list) {
 5706		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 5707			return;
 5708		skb_list_del_init(skb);
 5709		napi_gro_complete(napi, skb);
 5710		napi->gro_hash[index].count--;
 5711	}
 5712
 5713	if (!napi->gro_hash[index].count)
 5714		__clear_bit(index, &napi->gro_bitmask);
 5715}
 5716
 5717/* napi->gro_hash[].list contains packets ordered by age.
 5718 * youngest packets at the head of it.
 5719 * Complete skbs in reverse order to reduce latencies.
 5720 */
 5721void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 5722{
 5723	unsigned long bitmask = napi->gro_bitmask;
 5724	unsigned int i, base = ~0U;
 5725
 5726	while ((i = ffs(bitmask)) != 0) {
 5727		bitmask >>= i;
 5728		base += i;
 5729		__napi_gro_flush_chain(napi, base, flush_old);
 5730	}
 5731}
 5732EXPORT_SYMBOL(napi_gro_flush);
 5733
 5734static struct list_head *gro_list_prepare(struct napi_struct *napi,
 5735					  struct sk_buff *skb)
 5736{
 5737	unsigned int maclen = skb->dev->hard_header_len;
 5738	u32 hash = skb_get_hash_raw(skb);
 5739	struct list_head *head;
 5740	struct sk_buff *p;
 5741
 5742	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
 5743	list_for_each_entry(p, head, list) {
 5744		unsigned long diffs;
 5745
 5746		NAPI_GRO_CB(p)->flush = 0;
 5747
 5748		if (hash != skb_get_hash_raw(p)) {
 5749			NAPI_GRO_CB(p)->same_flow = 0;
 5750			continue;
 5751		}
 5752
 5753		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 5754		diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
 5755		if (skb_vlan_tag_present(p))
 5756			diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
 5757		diffs |= skb_metadata_dst_cmp(p, skb);
 5758		diffs |= skb_metadata_differs(p, skb);
 5759		if (maclen == ETH_HLEN)
 5760			diffs |= compare_ether_header(skb_mac_header(p),
 5761						      skb_mac_header(skb));
 5762		else if (!diffs)
 5763			diffs = memcmp(skb_mac_header(p),
 5764				       skb_mac_header(skb),
 5765				       maclen);
 5766		NAPI_GRO_CB(p)->same_flow = !diffs;
 5767	}
 5768
 5769	return head;
 5770}
 5771
 5772static void skb_gro_reset_offset(struct sk_buff *skb)
 5773{
 5774	const struct skb_shared_info *pinfo = skb_shinfo(skb);
 5775	const skb_frag_t *frag0 = &pinfo->frags[0];
 5776
 5777	NAPI_GRO_CB(skb)->data_offset = 0;
 5778	NAPI_GRO_CB(skb)->frag0 = NULL;
 5779	NAPI_GRO_CB(skb)->frag0_len = 0;
 5780
 5781	if (!skb_headlen(skb) && pinfo->nr_frags &&
 5782	    !PageHighMem(skb_frag_page(frag0))) {
 5783		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
 5784		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
 5785						    skb_frag_size(frag0),
 5786						    skb->end - skb->tail);
 5787	}
 5788}
 5789
 5790static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 5791{
 5792	struct skb_shared_info *pinfo = skb_shinfo(skb);
 5793
 5794	BUG_ON(skb->end - skb->tail < grow);
 5795
 5796	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
 5797
 5798	skb->data_len -= grow;
 5799	skb->tail += grow;
 5800
 5801	skb_frag_off_add(&pinfo->frags[0], grow);
 5802	skb_frag_size_sub(&pinfo->frags[0], grow);
 5803
 5804	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
 5805		skb_frag_unref(skb, 0);
 5806		memmove(pinfo->frags, pinfo->frags + 1,
 5807			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
 5808	}
 5809}
 5810
 5811static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
 5812{
 5813	struct sk_buff *oldest;
 5814
 5815	oldest = list_last_entry(head, struct sk_buff, list);
 5816
 5817	/* We are called with head length >= MAX_GRO_SKBS, so this is
 5818	 * impossible.
 5819	 */
 5820	if (WARN_ON_ONCE(!oldest))
 5821		return;
 5822
 5823	/* Do not adjust napi->gro_hash[].count, caller is adding a new
 5824	 * SKB to the chain.
 5825	 */
 5826	skb_list_del_init(oldest);
 5827	napi_gro_complete(napi, oldest);
 5828}
 5829
 5830INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
 5831							   struct sk_buff *));
 5832INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
 5833							   struct sk_buff *));
 5834static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 5835{
 5836	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 5837	struct list_head *head = &offload_base;
 5838	struct packet_offload *ptype;
 5839	__be16 type = skb->protocol;
 5840	struct list_head *gro_head;
 5841	struct sk_buff *pp = NULL;
 5842	enum gro_result ret;
 5843	int same_flow;
 5844	int grow;
 5845
 5846	if (netif_elide_gro(skb->dev))
 5847		goto normal;
 5848
 5849	gro_head = gro_list_prepare(napi, skb);
 5850
 5851	rcu_read_lock();
 5852	list_for_each_entry_rcu(ptype, head, list) {
 5853		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5854			continue;
 5855
 5856		skb_set_network_header(skb, skb_gro_offset(skb));
 5857		skb_reset_mac_len(skb);
 5858		NAPI_GRO_CB(skb)->same_flow = 0;
 5859		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
 5860		NAPI_GRO_CB(skb)->free = 0;
 5861		NAPI_GRO_CB(skb)->encap_mark = 0;
 5862		NAPI_GRO_CB(skb)->recursion_counter = 0;
 5863		NAPI_GRO_CB(skb)->is_fou = 0;
 5864		NAPI_GRO_CB(skb)->is_atomic = 1;
 5865		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 5866
 5867		/* Setup for GRO checksum validation */
 5868		switch (skb->ip_summed) {
 5869		case CHECKSUM_COMPLETE:
 5870			NAPI_GRO_CB(skb)->csum = skb->csum;
 5871			NAPI_GRO_CB(skb)->csum_valid = 1;
 5872			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5873			break;
 5874		case CHECKSUM_UNNECESSARY:
 5875			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
 5876			NAPI_GRO_CB(skb)->csum_valid = 0;
 5877			break;
 5878		default:
 5879			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5880			NAPI_GRO_CB(skb)->csum_valid = 0;
 5881		}
 5882
 5883		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
 5884					ipv6_gro_receive, inet_gro_receive,
 5885					gro_head, skb);
 5886		break;
 5887	}
 5888	rcu_read_unlock();
 5889
 5890	if (&ptype->list == head)
 5891		goto normal;
 5892
 5893	if (PTR_ERR(pp) == -EINPROGRESS) {
 5894		ret = GRO_CONSUMED;
 5895		goto ok;
 5896	}
 5897
 5898	same_flow = NAPI_GRO_CB(skb)->same_flow;
 5899	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 5900
 5901	if (pp) {
 5902		skb_list_del_init(pp);
 5903		napi_gro_complete(napi, pp);
 5904		napi->gro_hash[hash].count--;
 5905	}
 5906
 5907	if (same_flow)
 5908		goto ok;
 5909
 5910	if (NAPI_GRO_CB(skb)->flush)
 5911		goto normal;
 5912
 5913	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 5914		gro_flush_oldest(napi, gro_head);
 5915	} else {
 5916		napi->gro_hash[hash].count++;
 5917	}
 5918	NAPI_GRO_CB(skb)->count = 1;
 5919	NAPI_GRO_CB(skb)->age = jiffies;
 5920	NAPI_GRO_CB(skb)->last = skb;
 5921	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 5922	list_add(&skb->list, gro_head);
 5923	ret = GRO_HELD;
 5924
 5925pull:
 5926	grow = skb_gro_offset(skb) - skb_headlen(skb);
 5927	if (grow > 0)
 5928		gro_pull_from_frag0(skb, grow);
 5929ok:
 5930	if (napi->gro_hash[hash].count) {
 5931		if (!test_bit(hash, &napi->gro_bitmask))
 5932			__set_bit(hash, &napi->gro_bitmask);
 5933	} else if (test_bit(hash, &napi->gro_bitmask)) {
 5934		__clear_bit(hash, &napi->gro_bitmask);
 5935	}
 5936
 5937	return ret;
 5938
 5939normal:
 5940	ret = GRO_NORMAL;
 5941	goto pull;
 5942}
 5943
 5944struct packet_offload *gro_find_receive_by_type(__be16 type)
 5945{
 5946	struct list_head *offload_head = &offload_base;
 5947	struct packet_offload *ptype;
 5948
 5949	list_for_each_entry_rcu(ptype, offload_head, list) {
 5950		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5951			continue;
 5952		return ptype;
 5953	}
 5954	return NULL;
 5955}
 5956EXPORT_SYMBOL(gro_find_receive_by_type);
 5957
 5958struct packet_offload *gro_find_complete_by_type(__be16 type)
 5959{
 5960	struct list_head *offload_head = &offload_base;
 5961	struct packet_offload *ptype;
 5962
 5963	list_for_each_entry_rcu(ptype, offload_head, list) {
 5964		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5965			continue;
 5966		return ptype;
 5967	}
 5968	return NULL;
 5969}
 5970EXPORT_SYMBOL(gro_find_complete_by_type);
 5971
 5972static void napi_skb_free_stolen_head(struct sk_buff *skb)
 5973{
 5974	skb_dst_drop(skb);
 5975	skb_ext_put(skb);
 5976	kmem_cache_free(skbuff_head_cache, skb);
 5977}
 5978
 5979static gro_result_t napi_skb_finish(struct napi_struct *napi,
 5980				    struct sk_buff *skb,
 5981				    gro_result_t ret)
 5982{
 5983	switch (ret) {
 5984	case GRO_NORMAL:
 5985		gro_normal_one(napi, skb);
 5986		break;
 5987
 5988	case GRO_DROP:
 5989		kfree_skb(skb);
 5990		break;
 5991
 5992	case GRO_MERGED_FREE:
 5993		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 5994			napi_skb_free_stolen_head(skb);
 5995		else
 5996			__kfree_skb(skb);
 5997		break;
 5998
 5999	case GRO_HELD:
 6000	case GRO_MERGED:
 6001	case GRO_CONSUMED:
 6002		break;
 6003	}
 6004
 6005	return ret;
 6006}
 6007
 6008gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 6009{
 6010	gro_result_t ret;
 6011
 6012	skb_mark_napi_id(skb, napi);
 6013	trace_napi_gro_receive_entry(skb);
 6014
 6015	skb_gro_reset_offset(skb);
 6016
 6017	ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
 6018	trace_napi_gro_receive_exit(ret);
 6019
 6020	return ret;
 6021}
 6022EXPORT_SYMBOL(napi_gro_receive);
 6023
 6024static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 6025{
 6026	if (unlikely(skb->pfmemalloc)) {
 6027		consume_skb(skb);
 6028		return;
 6029	}
 6030	__skb_pull(skb, skb_headlen(skb));
 6031	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
 6032	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
 6033	__vlan_hwaccel_clear_tag(skb);
 6034	skb->dev = napi->dev;
 6035	skb->skb_iif = 0;
 6036
 6037	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
 6038	skb->pkt_type = PACKET_HOST;
 6039
 6040	skb->encapsulation = 0;
 6041	skb_shinfo(skb)->gso_type = 0;
 6042	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 6043	skb_ext_reset(skb);
 6044
 6045	napi->skb = skb;
 6046}
 6047
 6048struct sk_buff *napi_get_frags(struct napi_struct *napi)
 6049{
 6050	struct sk_buff *skb = napi->skb;
 6051
 6052	if (!skb) {
 6053		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
 6054		if (skb) {
 6055			napi->skb = skb;
 6056			skb_mark_napi_id(skb, napi);
 6057		}
 6058	}
 6059	return skb;
 6060}
 6061EXPORT_SYMBOL(napi_get_frags);
 6062
 6063static gro_result_t napi_frags_finish(struct napi_struct *napi,
 6064				      struct sk_buff *skb,
 6065				      gro_result_t ret)
 6066{
 6067	switch (ret) {
 6068	case GRO_NORMAL:
 6069	case GRO_HELD:
 6070		__skb_push(skb, ETH_HLEN);
 6071		skb->protocol = eth_type_trans(skb, skb->dev);
 6072		if (ret == GRO_NORMAL)
 6073			gro_normal_one(napi, skb);
 6074		break;
 6075
 6076	case GRO_DROP:
 6077		napi_reuse_skb(napi, skb);
 6078		break;
 6079
 6080	case GRO_MERGED_FREE:
 6081		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 6082			napi_skb_free_stolen_head(skb);
 6083		else
 6084			napi_reuse_skb(napi, skb);
 6085		break;
 6086
 6087	case GRO_MERGED:
 6088	case GRO_CONSUMED:
 6089		break;
 6090	}
 6091
 6092	return ret;
 6093}
 6094
 6095/* Upper GRO stack assumes network header starts at gro_offset=0
 6096 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 6097 * We copy ethernet header into skb->data to have a common layout.
 6098 */
 6099static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 6100{
 6101	struct sk_buff *skb = napi->skb;
 6102	const struct ethhdr *eth;
 6103	unsigned int hlen = sizeof(*eth);
 6104
 6105	napi->skb = NULL;
 6106
 6107	skb_reset_mac_header(skb);
 6108	skb_gro_reset_offset(skb);
 6109
 6110	if (unlikely(skb_gro_header_hard(skb, hlen))) {
 6111		eth = skb_gro_header_slow(skb, hlen, 0);
 6112		if (unlikely(!eth)) {
 6113			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
 6114					     __func__, napi->dev->name);
 6115			napi_reuse_skb(napi, skb);
 6116			return NULL;
 6117		}
 6118	} else {
 6119		eth = (const struct ethhdr *)skb->data;
 6120		gro_pull_from_frag0(skb, hlen);
 6121		NAPI_GRO_CB(skb)->frag0 += hlen;
 6122		NAPI_GRO_CB(skb)->frag0_len -= hlen;
 6123	}
 6124	__skb_pull(skb, hlen);
 6125
 6126	/*
 6127	 * This works because the only protocols we care about don't require
 6128	 * special handling.
 6129	 * We'll fix it up properly in napi_frags_finish()
 6130	 */
 6131	skb->protocol = eth->h_proto;
 6132
 6133	return skb;
 6134}
 6135
 6136gro_result_t napi_gro_frags(struct napi_struct *napi)
 6137{
 6138	gro_result_t ret;
 6139	struct sk_buff *skb = napi_frags_skb(napi);
 6140
 6141	if (!skb)
 6142		return GRO_DROP;
 6143
 6144	trace_napi_gro_frags_entry(skb);
 6145
 6146	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
 6147	trace_napi_gro_frags_exit(ret);
 6148
 6149	return ret;
 6150}
 6151EXPORT_SYMBOL(napi_gro_frags);
 6152
 6153/* Compute the checksum from gro_offset and return the folded value
 6154 * after adding in any pseudo checksum.
 6155 */
 6156__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
 6157{
 6158	__wsum wsum;
 6159	__sum16 sum;
 6160
 6161	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
 6162
 6163	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
 6164	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
 6165	/* See comments in __skb_checksum_complete(). */
 6166	if (likely(!sum)) {
 6167		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
 6168		    !skb->csum_complete_sw)
 6169			netdev_rx_csum_fault(skb->dev, skb);
 6170	}
 6171
 6172	NAPI_GRO_CB(skb)->csum = wsum;
 6173	NAPI_GRO_CB(skb)->csum_valid = 1;
 6174
 6175	return sum;
 6176}
 6177EXPORT_SYMBOL(__skb_gro_checksum_complete);
 6178
 6179static void net_rps_send_ipi(struct softnet_data *remsd)
 6180{
 6181#ifdef CONFIG_RPS
 6182	while (remsd) {
 6183		struct softnet_data *next = remsd->rps_ipi_next;
 6184
 6185		if (cpu_online(remsd->cpu))
 6186			smp_call_function_single_async(remsd->cpu, &remsd->csd);
 6187		remsd = next;
 6188	}
 6189#endif
 6190}
 6191
 6192/*
 6193 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 6194 * Note: called with local irq disabled, but exits with local irq enabled.
 6195 */
 6196static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 6197{
 6198#ifdef CONFIG_RPS
 6199	struct softnet_data *remsd = sd->rps_ipi_list;
 6200
 6201	if (remsd) {
 6202		sd->rps_ipi_list = NULL;
 6203
 6204		local_irq_enable();
 6205
 6206		/* Send pending IPI's to kick RPS processing on remote cpus. */
 6207		net_rps_send_ipi(remsd);
 6208	} else
 6209#endif
 6210		local_irq_enable();
 6211}
 6212
 6213static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 6214{
 6215#ifdef CONFIG_RPS
 6216	return sd->rps_ipi_list != NULL;
 6217#else
 6218	return false;
 6219#endif
 6220}
 6221
 6222static int process_backlog(struct napi_struct *napi, int quota)
 6223{
 6224	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 6225	bool again = true;
 6226	int work = 0;
 6227
 6228	/* Check if we have pending ipi, its better to send them now,
 6229	 * not waiting net_rx_action() end.
 6230	 */
 6231	if (sd_has_rps_ipi_waiting(sd)) {
 6232		local_irq_disable();
 6233		net_rps_action_and_irq_enable(sd);
 6234	}
 6235
 6236	napi->weight = dev_rx_weight;
 6237	while (again) {
 6238		struct sk_buff *skb;
 6239
 6240		while ((skb = __skb_dequeue(&sd->process_queue))) {
 6241			rcu_read_lock();
 6242			__netif_receive_skb(skb);
 6243			rcu_read_unlock();
 6244			input_queue_head_incr(sd);
 6245			if (++work >= quota)
 6246				return work;
 6247
 6248		}
 6249
 6250		local_irq_disable();
 6251		rps_lock(sd);
 6252		if (skb_queue_empty(&sd->input_pkt_queue)) {
 6253			/*
 6254			 * Inline a custom version of __napi_complete().
 6255			 * only current cpu owns and manipulates this napi,
 6256			 * and NAPI_STATE_SCHED is the only possible flag set
 6257			 * on backlog.
 6258			 * We can use a plain write instead of clear_bit(),
 6259			 * and we dont need an smp_mb() memory barrier.
 6260			 */
 6261			napi->state = 0;
 6262			again = false;
 6263		} else {
 6264			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 6265						   &sd->process_queue);
 6266		}
 6267		rps_unlock(sd);
 6268		local_irq_enable();
 6269	}
 6270
 6271	return work;
 6272}
 6273
 6274/**
 6275 * __napi_schedule - schedule for receive
 6276 * @n: entry to schedule
 6277 *
 6278 * The entry's receive function will be scheduled to run.
 6279 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 6280 */
 6281void __napi_schedule(struct napi_struct *n)
 6282{
 6283	unsigned long flags;
 6284
 6285	local_irq_save(flags);
 6286	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6287	local_irq_restore(flags);
 6288}
 6289EXPORT_SYMBOL(__napi_schedule);
 6290
 6291/**
 6292 *	napi_schedule_prep - check if napi can be scheduled
 6293 *	@n: napi context
 6294 *
 6295 * Test if NAPI routine is already running, and if not mark
 6296 * it as running.  This is used as a condition variable
 6297 * insure only one NAPI poll instance runs.  We also make
 6298 * sure there is no pending NAPI disable.
 6299 */
 6300bool napi_schedule_prep(struct napi_struct *n)
 6301{
 6302	unsigned long val, new;
 6303
 6304	do {
 6305		val = READ_ONCE(n->state);
 6306		if (unlikely(val & NAPIF_STATE_DISABLE))
 6307			return false;
 6308		new = val | NAPIF_STATE_SCHED;
 6309
 6310		/* Sets STATE_MISSED bit if STATE_SCHED was already set
 6311		 * This was suggested by Alexander Duyck, as compiler
 6312		 * emits better code than :
 6313		 * if (val & NAPIF_STATE_SCHED)
 6314		 *     new |= NAPIF_STATE_MISSED;
 6315		 */
 6316		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
 6317						   NAPIF_STATE_MISSED;
 6318	} while (cmpxchg(&n->state, val, new) != val);
 6319
 6320	return !(val & NAPIF_STATE_SCHED);
 6321}
 6322EXPORT_SYMBOL(napi_schedule_prep);
 6323
 6324/**
 6325 * __napi_schedule_irqoff - schedule for receive
 6326 * @n: entry to schedule
 6327 *
 6328 * Variant of __napi_schedule() assuming hard irqs are masked
 6329 */
 6330void __napi_schedule_irqoff(struct napi_struct *n)
 6331{
 6332	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6333}
 6334EXPORT_SYMBOL(__napi_schedule_irqoff);
 6335
 6336bool napi_complete_done(struct napi_struct *n, int work_done)
 6337{
 6338	unsigned long flags, val, new, timeout = 0;
 6339	bool ret = true;
 6340
 6341	/*
 6342	 * 1) Don't let napi dequeue from the cpu poll list
 6343	 *    just in case its running on a different cpu.
 6344	 * 2) If we are busy polling, do nothing here, we have
 6345	 *    the guarantee we will be called later.
 6346	 */
 6347	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
 6348				 NAPIF_STATE_IN_BUSY_POLL)))
 6349		return false;
 6350
 6351	if (work_done) {
 6352		if (n->gro_bitmask)
 6353			timeout = READ_ONCE(n->dev->gro_flush_timeout);
 6354		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
 6355	}
 6356	if (n->defer_hard_irqs_count > 0) {
 6357		n->defer_hard_irqs_count--;
 6358		timeout = READ_ONCE(n->dev->gro_flush_timeout);
 6359		if (timeout)
 6360			ret = false;
 6361	}
 6362	if (n->gro_bitmask) {
 6363		/* When the NAPI instance uses a timeout and keeps postponing
 6364		 * it, we need to bound somehow the time packets are kept in
 6365		 * the GRO layer
 6366		 */
 6367		napi_gro_flush(n, !!timeout);
 6368	}
 6369
 6370	gro_normal_list(n);
 6371
 6372	if (unlikely(!list_empty(&n->poll_list))) {
 6373		/* If n->poll_list is not empty, we need to mask irqs */
 6374		local_irq_save(flags);
 6375		list_del_init(&n->poll_list);
 6376		local_irq_restore(flags);
 6377	}
 6378
 6379	do {
 6380		val = READ_ONCE(n->state);
 6381
 6382		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
 6383
 6384		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
 6385
 6386		/* If STATE_MISSED was set, leave STATE_SCHED set,
 6387		 * because we will call napi->poll() one more time.
 6388		 * This C code was suggested by Alexander Duyck to help gcc.
 6389		 */
 6390		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
 6391						    NAPIF_STATE_SCHED;
 6392	} while (cmpxchg(&n->state, val, new) != val);
 6393
 6394	if (unlikely(val & NAPIF_STATE_MISSED)) {
 6395		__napi_schedule(n);
 6396		return false;
 6397	}
 6398
 6399	if (timeout)
 6400		hrtimer_start(&n->timer, ns_to_ktime(timeout),
 6401			      HRTIMER_MODE_REL_PINNED);
 6402	return ret;
 6403}
 6404EXPORT_SYMBOL(napi_complete_done);
 6405
 6406/* must be called under rcu_read_lock(), as we dont take a reference */
 6407static struct napi_struct *napi_by_id(unsigned int napi_id)
 6408{
 6409	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 6410	struct napi_struct *napi;
 6411
 6412	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
 6413		if (napi->napi_id == napi_id)
 6414			return napi;
 6415
 6416	return NULL;
 6417}
 6418
 6419#if defined(CONFIG_NET_RX_BUSY_POLL)
 6420
 6421#define BUSY_POLL_BUDGET 8
 6422
 6423static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 6424{
 6425	int rc;
 6426
 6427	/* Busy polling means there is a high chance device driver hard irq
 6428	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
 6429	 * set in napi_schedule_prep().
 6430	 * Since we are about to call napi->poll() once more, we can safely
 6431	 * clear NAPI_STATE_MISSED.
 6432	 *
 6433	 * Note: x86 could use a single "lock and ..." instruction
 6434	 * to perform these two clear_bit()
 6435	 */
 6436	clear_bit(NAPI_STATE_MISSED, &napi->state);
 6437	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 6438
 6439	local_bh_disable();
 6440
 6441	/* All we really want here is to re-enable device interrupts.
 6442	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
 6443	 */
 6444	rc = napi->poll(napi, BUSY_POLL_BUDGET);
 6445	/* We can't gro_normal_list() here, because napi->poll() might have
 6446	 * rearmed the napi (napi_complete_done()) in which case it could
 6447	 * already be running on another CPU.
 6448	 */
 6449	trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
 6450	netpoll_poll_unlock(have_poll_lock);
 6451	if (rc == BUSY_POLL_BUDGET) {
 6452		/* As the whole budget was spent, we still own the napi so can
 6453		 * safely handle the rx_list.
 6454		 */
 6455		gro_normal_list(napi);
 6456		__napi_schedule(napi);
 6457	}
 6458	local_bh_enable();
 6459}
 6460
 6461void napi_busy_loop(unsigned int napi_id,
 6462		    bool (*loop_end)(void *, unsigned long),
 6463		    void *loop_end_arg)
 6464{
 6465	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 6466	int (*napi_poll)(struct napi_struct *napi, int budget);
 6467	void *have_poll_lock = NULL;
 6468	struct napi_struct *napi;
 6469
 6470restart:
 6471	napi_poll = NULL;
 6472
 6473	rcu_read_lock();
 6474
 6475	napi = napi_by_id(napi_id);
 6476	if (!napi)
 6477		goto out;
 6478
 6479	preempt_disable();
 6480	for (;;) {
 6481		int work = 0;
 6482
 6483		local_bh_disable();
 6484		if (!napi_poll) {
 6485			unsigned long val = READ_ONCE(napi->state);
 6486
 6487			/* If multiple threads are competing for this napi,
 6488			 * we avoid dirtying napi->state as much as we can.
 6489			 */
 6490			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
 6491				   NAPIF_STATE_IN_BUSY_POLL))
 6492				goto count;
 6493			if (cmpxchg(&napi->state, val,
 6494				    val | NAPIF_STATE_IN_BUSY_POLL |
 6495					  NAPIF_STATE_SCHED) != val)
 6496				goto count;
 6497			have_poll_lock = netpoll_poll_lock(napi);
 6498			napi_poll = napi->poll;
 6499		}
 6500		work = napi_poll(napi, BUSY_POLL_BUDGET);
 6501		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
 6502		gro_normal_list(napi);
 6503count:
 6504		if (work > 0)
 6505			__NET_ADD_STATS(dev_net(napi->dev),
 6506					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 6507		local_bh_enable();
 6508
 6509		if (!loop_end || loop_end(loop_end_arg, start_time))
 6510			break;
 6511
 6512		if (unlikely(need_resched())) {
 6513			if (napi_poll)
 6514				busy_poll_stop(napi, have_poll_lock);
 6515			preempt_enable();
 6516			rcu_read_unlock();
 6517			cond_resched();
 6518			if (loop_end(loop_end_arg, start_time))
 6519				return;
 6520			goto restart;
 6521		}
 6522		cpu_relax();
 6523	}
 6524	if (napi_poll)
 6525		busy_poll_stop(napi, have_poll_lock);
 6526	preempt_enable();
 6527out:
 6528	rcu_read_unlock();
 6529}
 6530EXPORT_SYMBOL(napi_busy_loop);
 6531
 6532#endif /* CONFIG_NET_RX_BUSY_POLL */
 6533
 6534static void napi_hash_add(struct napi_struct *napi)
 6535{
 6536	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
 6537	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
 6538		return;
 6539
 6540	spin_lock(&napi_hash_lock);
 6541
 6542	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6543	do {
 6544		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6545			napi_gen_id = MIN_NAPI_ID;
 6546	} while (napi_by_id(napi_gen_id));
 6547	napi->napi_id = napi_gen_id;
 6548
 6549	hlist_add_head_rcu(&napi->napi_hash_node,
 6550			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6551
 6552	spin_unlock(&napi_hash_lock);
 6553}
 6554
 6555/* Warning : caller is responsible to make sure rcu grace period
 6556 * is respected before freeing memory containing @napi
 6557 */
 6558bool napi_hash_del(struct napi_struct *napi)
 6559{
 6560	bool rcu_sync_needed = false;
 6561
 6562	spin_lock(&napi_hash_lock);
 6563
 6564	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
 6565		rcu_sync_needed = true;
 6566		hlist_del_rcu(&napi->napi_hash_node);
 6567	}
 6568	spin_unlock(&napi_hash_lock);
 6569	return rcu_sync_needed;
 6570}
 6571EXPORT_SYMBOL_GPL(napi_hash_del);
 6572
 6573static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6574{
 6575	struct napi_struct *napi;
 6576
 6577	napi = container_of(timer, struct napi_struct, timer);
 6578
 6579	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6580	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6581	 */
 6582	if (!napi_disable_pending(napi) &&
 6583	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 6584		__napi_schedule_irqoff(napi);
 6585
 6586	return HRTIMER_NORESTART;
 6587}
 6588
 6589static void init_gro_hash(struct napi_struct *napi)
 6590{
 6591	int i;
 6592
 6593	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6594		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6595		napi->gro_hash[i].count = 0;
 6596	}
 6597	napi->gro_bitmask = 0;
 6598}
 6599
 6600void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 6601		    int (*poll)(struct napi_struct *, int), int weight)
 6602{
 6603	INIT_LIST_HEAD(&napi->poll_list);
 6604	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 6605	napi->timer.function = napi_watchdog;
 6606	init_gro_hash(napi);
 6607	napi->skb = NULL;
 6608	INIT_LIST_HEAD(&napi->rx_list);
 6609	napi->rx_count = 0;
 6610	napi->poll = poll;
 6611	if (weight > NAPI_POLL_WEIGHT)
 6612		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 6613				weight);
 6614	napi->weight = weight;
 6615	napi->dev = dev;
 6616#ifdef CONFIG_NETPOLL
 6617	napi->poll_owner = -1;
 6618#endif
 6619	set_bit(NAPI_STATE_SCHED, &napi->state);
 6620	set_bit(NAPI_STATE_NPSVC, &napi->state);
 6621	list_add_rcu(&napi->dev_list, &dev->napi_list);
 6622	napi_hash_add(napi);
 6623}
 6624EXPORT_SYMBOL(netif_napi_add);
 6625
 6626void napi_disable(struct napi_struct *n)
 6627{
 6628	might_sleep();
 6629	set_bit(NAPI_STATE_DISABLE, &n->state);
 6630
 6631	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
 6632		msleep(1);
 6633	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
 6634		msleep(1);
 6635
 6636	hrtimer_cancel(&n->timer);
 6637
 6638	clear_bit(NAPI_STATE_DISABLE, &n->state);
 6639}
 6640EXPORT_SYMBOL(napi_disable);
 6641
 6642static void flush_gro_hash(struct napi_struct *napi)
 6643{
 6644	int i;
 6645
 6646	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6647		struct sk_buff *skb, *n;
 6648
 6649		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6650			kfree_skb(skb);
 6651		napi->gro_hash[i].count = 0;
 6652	}
 6653}
 6654
 6655/* Must be called in process context */
 6656void netif_napi_del(struct napi_struct *napi)
 6657{
 6658	might_sleep();
 6659	if (napi_hash_del(napi))
 6660		synchronize_net();
 6661	list_del_init(&napi->dev_list);
 6662	napi_free_frags(napi);
 6663
 6664	flush_gro_hash(napi);
 6665	napi->gro_bitmask = 0;
 6666}
 6667EXPORT_SYMBOL(netif_napi_del);
 6668
 6669static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 6670{
 6671	void *have;
 6672	int work, weight;
 6673
 6674	list_del_init(&n->poll_list);
 6675
 6676	have = netpoll_poll_lock(n);
 6677
 6678	weight = n->weight;
 6679
 6680	/* This NAPI_STATE_SCHED test is for avoiding a race
 6681	 * with netpoll's poll_napi().  Only the entity which
 6682	 * obtains the lock and sees NAPI_STATE_SCHED set will
 6683	 * actually make the ->poll() call.  Therefore we avoid
 6684	 * accidentally calling ->poll() when NAPI is not scheduled.
 6685	 */
 6686	work = 0;
 6687	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 6688		work = n->poll(n, weight);
 6689		trace_napi_poll(n, work, weight);
 6690	}
 6691
 6692	if (unlikely(work > weight))
 6693		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
 6694			    n->poll, work, weight);
 6695
 6696	if (likely(work < weight))
 6697		goto out_unlock;
 6698
 6699	/* Drivers must not modify the NAPI state if they
 6700	 * consume the entire weight.  In such cases this code
 6701	 * still "owns" the NAPI instance and therefore can
 6702	 * move the instance around on the list at-will.
 6703	 */
 6704	if (unlikely(napi_disable_pending(n))) {
 6705		napi_complete(n);
 6706		goto out_unlock;
 6707	}
 6708
 6709	if (n->gro_bitmask) {
 6710		/* flush too old packets
 6711		 * If HZ < 1000, flush all packets.
 6712		 */
 6713		napi_gro_flush(n, HZ >= 1000);
 6714	}
 6715
 6716	gro_normal_list(n);
 6717
 6718	/* Some drivers may have called napi_schedule
 6719	 * prior to exhausting their budget.
 6720	 */
 6721	if (unlikely(!list_empty(&n->poll_list))) {
 6722		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
 6723			     n->dev ? n->dev->name : "backlog");
 6724		goto out_unlock;
 6725	}
 6726
 6727	list_add_tail(&n->poll_list, repoll);
 6728
 6729out_unlock:
 6730	netpoll_poll_unlock(have);
 6731
 6732	return work;
 6733}
 6734
 6735static __latent_entropy void net_rx_action(struct softirq_action *h)
 6736{
 6737	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 6738	unsigned long time_limit = jiffies +
 6739		usecs_to_jiffies(netdev_budget_usecs);
 6740	int budget = netdev_budget;
 6741	LIST_HEAD(list);
 6742	LIST_HEAD(repoll);
 6743
 6744	local_irq_disable();
 6745	list_splice_init(&sd->poll_list, &list);
 6746	local_irq_enable();
 6747
 6748	for (;;) {
 6749		struct napi_struct *n;
 6750
 6751		if (list_empty(&list)) {
 6752			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 6753				goto out;
 6754			break;
 6755		}
 6756
 6757		n = list_first_entry(&list, struct napi_struct, poll_list);
 6758		budget -= napi_poll(n, &repoll);
 6759
 6760		/* If softirq window is exhausted then punt.
 6761		 * Allow this to run for 2 jiffies since which will allow
 6762		 * an average latency of 1.5/HZ.
 6763		 */
 6764		if (unlikely(budget <= 0 ||
 6765			     time_after_eq(jiffies, time_limit))) {
 6766			sd->time_squeeze++;
 6767			break;
 6768		}
 6769	}
 6770
 6771	local_irq_disable();
 6772
 6773	list_splice_tail_init(&sd->poll_list, &list);
 6774	list_splice_tail(&repoll, &list);
 6775	list_splice(&list, &sd->poll_list);
 6776	if (!list_empty(&sd->poll_list))
 6777		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 6778
 6779	net_rps_action_and_irq_enable(sd);
 6780out:
 6781	__kfree_skb_flush();
 6782}
 6783
 6784struct netdev_adjacent {
 6785	struct net_device *dev;
 6786
 6787	/* upper master flag, there can only be one master device per list */
 6788	bool master;
 6789
 6790	/* lookup ignore flag */
 6791	bool ignore;
 6792
 6793	/* counter for the number of times this device was added to us */
 6794	u16 ref_nr;
 6795
 6796	/* private field for the users */
 6797	void *private;
 6798
 6799	struct list_head list;
 6800	struct rcu_head rcu;
 6801};
 6802
 6803static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 6804						 struct list_head *adj_list)
 6805{
 6806	struct netdev_adjacent *adj;
 6807
 6808	list_for_each_entry(adj, adj_list, list) {
 6809		if (adj->dev == adj_dev)
 6810			return adj;
 6811	}
 6812	return NULL;
 6813}
 6814
 6815static int ____netdev_has_upper_dev(struct net_device *upper_dev,
 6816				    struct netdev_nested_priv *priv)
 6817{
 6818	struct net_device *dev = (struct net_device *)priv->data;
 6819
 6820	return upper_dev == dev;
 6821}
 6822
 6823/**
 6824 * netdev_has_upper_dev - Check if device is linked to an upper device
 6825 * @dev: device
 6826 * @upper_dev: upper device to check
 6827 *
 6828 * Find out if a device is linked to specified upper device and return true
 6829 * in case it is. Note that this checks only immediate upper device,
 6830 * not through a complete stack of devices. The caller must hold the RTNL lock.
 6831 */
 6832bool netdev_has_upper_dev(struct net_device *dev,
 6833			  struct net_device *upper_dev)
 6834{
 6835	struct netdev_nested_priv priv = {
 6836		.data = (void *)upper_dev,
 6837	};
 6838
 6839	ASSERT_RTNL();
 6840
 6841	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6842					     &priv);
 6843}
 6844EXPORT_SYMBOL(netdev_has_upper_dev);
 6845
 6846/**
 6847 * netdev_has_upper_dev_all - Check if device is linked to an upper device
 6848 * @dev: device
 6849 * @upper_dev: upper device to check
 6850 *
 6851 * Find out if a device is linked to specified upper device and return true
 6852 * in case it is. Note that this checks the entire upper device chain.
 6853 * The caller must hold rcu lock.
 6854 */
 6855
 6856bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 6857				  struct net_device *upper_dev)
 6858{
 6859	struct netdev_nested_priv priv = {
 6860		.data = (void *)upper_dev,
 6861	};
 6862
 6863	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6864					       &priv);
 6865}
 6866EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 6867
 6868/**
 6869 * netdev_has_any_upper_dev - Check if device is linked to some device
 6870 * @dev: device
 6871 *
 6872 * Find out if a device is linked to an upper device and return true in case
 6873 * it is. The caller must hold the RTNL lock.
 6874 */
 6875bool netdev_has_any_upper_dev(struct net_device *dev)
 6876{
 6877	ASSERT_RTNL();
 6878
 6879	return !list_empty(&dev->adj_list.upper);
 6880}
 6881EXPORT_SYMBOL(netdev_has_any_upper_dev);
 6882
 6883/**
 6884 * netdev_master_upper_dev_get - Get master upper device
 6885 * @dev: device
 6886 *
 6887 * Find a master upper device and return pointer to it or NULL in case
 6888 * it's not there. The caller must hold the RTNL lock.
 6889 */
 6890struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 6891{
 6892	struct netdev_adjacent *upper;
 6893
 6894	ASSERT_RTNL();
 6895
 6896	if (list_empty(&dev->adj_list.upper))
 6897		return NULL;
 6898
 6899	upper = list_first_entry(&dev->adj_list.upper,
 6900				 struct netdev_adjacent, list);
 6901	if (likely(upper->master))
 6902		return upper->dev;
 6903	return NULL;
 6904}
 6905EXPORT_SYMBOL(netdev_master_upper_dev_get);
 6906
 6907static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 6908{
 6909	struct netdev_adjacent *upper;
 6910
 6911	ASSERT_RTNL();
 6912
 6913	if (list_empty(&dev->adj_list.upper))
 6914		return NULL;
 6915
 6916	upper = list_first_entry(&dev->adj_list.upper,
 6917				 struct netdev_adjacent, list);
 6918	if (likely(upper->master) && !upper->ignore)
 6919		return upper->dev;
 6920	return NULL;
 6921}
 6922
 6923/**
 6924 * netdev_has_any_lower_dev - Check if device is linked to some device
 6925 * @dev: device
 6926 *
 6927 * Find out if a device is linked to a lower device and return true in case
 6928 * it is. The caller must hold the RTNL lock.
 6929 */
 6930static bool netdev_has_any_lower_dev(struct net_device *dev)
 6931{
 6932	ASSERT_RTNL();
 6933
 6934	return !list_empty(&dev->adj_list.lower);
 6935}
 6936
 6937void *netdev_adjacent_get_private(struct list_head *adj_list)
 6938{
 6939	struct netdev_adjacent *adj;
 6940
 6941	adj = list_entry(adj_list, struct netdev_adjacent, list);
 6942
 6943	return adj->private;
 6944}
 6945EXPORT_SYMBOL(netdev_adjacent_get_private);
 6946
 6947/**
 6948 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 6949 * @dev: device
 6950 * @iter: list_head ** of the current position
 6951 *
 6952 * Gets the next device from the dev's upper list, starting from iter
 6953 * position. The caller must hold RCU read lock.
 6954 */
 6955struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 6956						 struct list_head **iter)
 6957{
 6958	struct netdev_adjacent *upper;
 6959
 6960	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6961
 6962	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6963
 6964	if (&upper->list == &dev->adj_list.upper)
 6965		return NULL;
 6966
 6967	*iter = &upper->list;
 6968
 6969	return upper->dev;
 6970}
 6971EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 6972
 6973static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 6974						  struct list_head **iter,
 6975						  bool *ignore)
 6976{
 6977	struct netdev_adjacent *upper;
 6978
 6979	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 6980
 6981	if (&upper->list == &dev->adj_list.upper)
 6982		return NULL;
 6983
 6984	*iter = &upper->list;
 6985	*ignore = upper->ignore;
 6986
 6987	return upper->dev;
 6988}
 6989
 6990static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 6991						    struct list_head **iter)
 6992{
 6993	struct netdev_adjacent *upper;
 6994
 6995	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6996
 6997	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6998
 6999	if (&upper->list == &dev->adj_list.upper)
 7000		return NULL;
 7001
 7002	*iter = &upper->list;
 7003
 7004	return upper->dev;
 7005}
 7006
 7007static int __netdev_walk_all_upper_dev(struct net_device *dev,
 7008				       int (*fn)(struct net_device *dev,
 7009					 struct netdev_nested_priv *priv),
 7010				       struct netdev_nested_priv *priv)
 7011{
 7012	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7013	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7014	int ret, cur = 0;
 7015	bool ignore;
 7016
 7017	now = dev;
 7018	iter = &dev->adj_list.upper;
 7019
 7020	while (1) {
 7021		if (now != dev) {
 7022			ret = fn(now, priv);
 7023			if (ret)
 7024				return ret;
 7025		}
 7026
 7027		next = NULL;
 7028		while (1) {
 7029			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 7030			if (!udev)
 7031				break;
 7032			if (ignore)
 7033				continue;
 7034
 7035			next = udev;
 7036			niter = &udev->adj_list.upper;
 7037			dev_stack[cur] = now;
 7038			iter_stack[cur++] = iter;
 7039			break;
 7040		}
 7041
 7042		if (!next) {
 7043			if (!cur)
 7044				return 0;
 7045			next = dev_stack[--cur];
 7046			niter = iter_stack[cur];
 7047		}
 7048
 7049		now = next;
 7050		iter = niter;
 7051	}
 7052
 7053	return 0;
 7054}
 7055
 7056int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 7057				  int (*fn)(struct net_device *dev,
 7058					    struct netdev_nested_priv *priv),
 7059				  struct netdev_nested_priv *priv)
 7060{
 7061	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7062	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7063	int ret, cur = 0;
 7064
 7065	now = dev;
 7066	iter = &dev->adj_list.upper;
 7067
 7068	while (1) {
 7069		if (now != dev) {
 7070			ret = fn(now, priv);
 7071			if (ret)
 7072				return ret;
 7073		}
 7074
 7075		next = NULL;
 7076		while (1) {
 7077			udev = netdev_next_upper_dev_rcu(now, &iter);
 7078			if (!udev)
 7079				break;
 7080
 7081			next = udev;
 7082			niter = &udev->adj_list.upper;
 7083			dev_stack[cur] = now;
 7084			iter_stack[cur++] = iter;
 7085			break;
 7086		}
 7087
 7088		if (!next) {
 7089			if (!cur)
 7090				return 0;
 7091			next = dev_stack[--cur];
 7092			niter = iter_stack[cur];
 7093		}
 7094
 7095		now = next;
 7096		iter = niter;
 7097	}
 7098
 7099	return 0;
 7100}
 7101EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 7102
 7103static bool __netdev_has_upper_dev(struct net_device *dev,
 7104				   struct net_device *upper_dev)
 7105{
 7106	struct netdev_nested_priv priv = {
 7107		.flags = 0,
 7108		.data = (void *)upper_dev,
 7109	};
 7110
 7111	ASSERT_RTNL();
 7112
 7113	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 7114					   &priv);
 7115}
 7116
 7117/**
 7118 * netdev_lower_get_next_private - Get the next ->private from the
 7119 *				   lower neighbour list
 7120 * @dev: device
 7121 * @iter: list_head ** of the current position
 7122 *
 7123 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7124 * list, starting from iter position. The caller must hold either hold the
 7125 * RTNL lock or its own locking that guarantees that the neighbour lower
 7126 * list will remain unchanged.
 7127 */
 7128void *netdev_lower_get_next_private(struct net_device *dev,
 7129				    struct list_head **iter)
 7130{
 7131	struct netdev_adjacent *lower;
 7132
 7133	lower = list_entry(*iter, struct netdev_adjacent, list);
 7134
 7135	if (&lower->list == &dev->adj_list.lower)
 7136		return NULL;
 7137
 7138	*iter = lower->list.next;
 7139
 7140	return lower->private;
 7141}
 7142EXPORT_SYMBOL(netdev_lower_get_next_private);
 7143
 7144/**
 7145 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 7146 *				       lower neighbour list, RCU
 7147 *				       variant
 7148 * @dev: device
 7149 * @iter: list_head ** of the current position
 7150 *
 7151 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7152 * list, starting from iter position. The caller must hold RCU read lock.
 7153 */
 7154void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 7155					struct list_head **iter)
 7156{
 7157	struct netdev_adjacent *lower;
 7158
 7159	WARN_ON_ONCE(!rcu_read_lock_held());
 7160
 7161	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7162
 7163	if (&lower->list == &dev->adj_list.lower)
 7164		return NULL;
 7165
 7166	*iter = &lower->list;
 7167
 7168	return lower->private;
 7169}
 7170EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 7171
 7172/**
 7173 * netdev_lower_get_next - Get the next device from the lower neighbour
 7174 *                         list
 7175 * @dev: device
 7176 * @iter: list_head ** of the current position
 7177 *
 7178 * Gets the next netdev_adjacent from the dev's lower neighbour
 7179 * list, starting from iter position. The caller must hold RTNL lock or
 7180 * its own locking that guarantees that the neighbour lower
 7181 * list will remain unchanged.
 7182 */
 7183void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 7184{
 7185	struct netdev_adjacent *lower;
 7186
 7187	lower = list_entry(*iter, struct netdev_adjacent, list);
 7188
 7189	if (&lower->list == &dev->adj_list.lower)
 7190		return NULL;
 7191
 7192	*iter = lower->list.next;
 7193
 7194	return lower->dev;
 7195}
 7196EXPORT_SYMBOL(netdev_lower_get_next);
 7197
 7198static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 7199						struct list_head **iter)
 7200{
 7201	struct netdev_adjacent *lower;
 7202
 7203	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7204
 7205	if (&lower->list == &dev->adj_list.lower)
 7206		return NULL;
 7207
 7208	*iter = &lower->list;
 7209
 7210	return lower->dev;
 7211}
 7212
 7213static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 7214						  struct list_head **iter,
 7215						  bool *ignore)
 7216{
 7217	struct netdev_adjacent *lower;
 7218
 7219	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7220
 7221	if (&lower->list == &dev->adj_list.lower)
 7222		return NULL;
 7223
 7224	*iter = &lower->list;
 7225	*ignore = lower->ignore;
 7226
 7227	return lower->dev;
 7228}
 7229
 7230int netdev_walk_all_lower_dev(struct net_device *dev,
 7231			      int (*fn)(struct net_device *dev,
 7232					struct netdev_nested_priv *priv),
 7233			      struct netdev_nested_priv *priv)
 7234{
 7235	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7236	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7237	int ret, cur = 0;
 7238
 7239	now = dev;
 7240	iter = &dev->adj_list.lower;
 7241
 7242	while (1) {
 7243		if (now != dev) {
 7244			ret = fn(now, priv);
 7245			if (ret)
 7246				return ret;
 7247		}
 7248
 7249		next = NULL;
 7250		while (1) {
 7251			ldev = netdev_next_lower_dev(now, &iter);
 7252			if (!ldev)
 7253				break;
 7254
 7255			next = ldev;
 7256			niter = &ldev->adj_list.lower;
 7257			dev_stack[cur] = now;
 7258			iter_stack[cur++] = iter;
 7259			break;
 7260		}
 7261
 7262		if (!next) {
 7263			if (!cur)
 7264				return 0;
 7265			next = dev_stack[--cur];
 7266			niter = iter_stack[cur];
 7267		}
 7268
 7269		now = next;
 7270		iter = niter;
 7271	}
 7272
 7273	return 0;
 7274}
 7275EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 7276
 7277static int __netdev_walk_all_lower_dev(struct net_device *dev,
 7278				       int (*fn)(struct net_device *dev,
 7279					 struct netdev_nested_priv *priv),
 7280				       struct netdev_nested_priv *priv)
 7281{
 7282	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7283	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7284	int ret, cur = 0;
 7285	bool ignore;
 7286
 7287	now = dev;
 7288	iter = &dev->adj_list.lower;
 7289
 7290	while (1) {
 7291		if (now != dev) {
 7292			ret = fn(now, priv);
 7293			if (ret)
 7294				return ret;
 7295		}
 7296
 7297		next = NULL;
 7298		while (1) {
 7299			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 7300			if (!ldev)
 7301				break;
 7302			if (ignore)
 7303				continue;
 7304
 7305			next = ldev;
 7306			niter = &ldev->adj_list.lower;
 7307			dev_stack[cur] = now;
 7308			iter_stack[cur++] = iter;
 7309			break;
 7310		}
 7311
 7312		if (!next) {
 7313			if (!cur)
 7314				return 0;
 7315			next = dev_stack[--cur];
 7316			niter = iter_stack[cur];
 7317		}
 7318
 7319		now = next;
 7320		iter = niter;
 7321	}
 7322
 7323	return 0;
 7324}
 7325
 7326struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 7327					     struct list_head **iter)
 7328{
 7329	struct netdev_adjacent *lower;
 7330
 7331	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7332	if (&lower->list == &dev->adj_list.lower)
 7333		return NULL;
 7334
 7335	*iter = &lower->list;
 7336
 7337	return lower->dev;
 7338}
 7339EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
 7340
 7341static u8 __netdev_upper_depth(struct net_device *dev)
 7342{
 7343	struct net_device *udev;
 7344	struct list_head *iter;
 7345	u8 max_depth = 0;
 7346	bool ignore;
 7347
 7348	for (iter = &dev->adj_list.upper,
 7349	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 7350	     udev;
 7351	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 7352		if (ignore)
 7353			continue;
 7354		if (max_depth < udev->upper_level)
 7355			max_depth = udev->upper_level;
 7356	}
 7357
 7358	return max_depth;
 7359}
 7360
 7361static u8 __netdev_lower_depth(struct net_device *dev)
 7362{
 7363	struct net_device *ldev;
 7364	struct list_head *iter;
 7365	u8 max_depth = 0;
 7366	bool ignore;
 7367
 7368	for (iter = &dev->adj_list.lower,
 7369	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 7370	     ldev;
 7371	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 7372		if (ignore)
 7373			continue;
 7374		if (max_depth < ldev->lower_level)
 7375			max_depth = ldev->lower_level;
 7376	}
 7377
 7378	return max_depth;
 7379}
 7380
 7381static int __netdev_update_upper_level(struct net_device *dev,
 7382				       struct netdev_nested_priv *__unused)
 7383{
 7384	dev->upper_level = __netdev_upper_depth(dev) + 1;
 7385	return 0;
 7386}
 7387
 7388static int __netdev_update_lower_level(struct net_device *dev,
 7389				       struct netdev_nested_priv *priv)
 7390{
 7391	dev->lower_level = __netdev_lower_depth(dev) + 1;
 7392
 7393#ifdef CONFIG_LOCKDEP
 7394	if (!priv)
 7395		return 0;
 7396
 7397	if (priv->flags & NESTED_SYNC_IMM)
 7398		dev->nested_level = dev->lower_level - 1;
 7399	if (priv->flags & NESTED_SYNC_TODO)
 7400		net_unlink_todo(dev);
 7401#endif
 7402	return 0;
 7403}
 7404
 7405int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7406				  int (*fn)(struct net_device *dev,
 7407					    struct netdev_nested_priv *priv),
 7408				  struct netdev_nested_priv *priv)
 7409{
 7410	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7411	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7412	int ret, cur = 0;
 7413
 7414	now = dev;
 7415	iter = &dev->adj_list.lower;
 7416
 7417	while (1) {
 7418		if (now != dev) {
 7419			ret = fn(now, priv);
 7420			if (ret)
 7421				return ret;
 7422		}
 7423
 7424		next = NULL;
 7425		while (1) {
 7426			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7427			if (!ldev)
 7428				break;
 7429
 7430			next = ldev;
 7431			niter = &ldev->adj_list.lower;
 7432			dev_stack[cur] = now;
 7433			iter_stack[cur++] = iter;
 7434			break;
 7435		}
 7436
 7437		if (!next) {
 7438			if (!cur)
 7439				return 0;
 7440			next = dev_stack[--cur];
 7441			niter = iter_stack[cur];
 7442		}
 7443
 7444		now = next;
 7445		iter = niter;
 7446	}
 7447
 7448	return 0;
 7449}
 7450EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7451
 7452/**
 7453 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7454 *				       lower neighbour list, RCU
 7455 *				       variant
 7456 * @dev: device
 7457 *
 7458 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7459 * list. The caller must hold RCU read lock.
 7460 */
 7461void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7462{
 7463	struct netdev_adjacent *lower;
 7464
 7465	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7466			struct netdev_adjacent, list);
 7467	if (lower)
 7468		return lower->private;
 7469	return NULL;
 7470}
 7471EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7472
 7473/**
 7474 * netdev_master_upper_dev_get_rcu - Get master upper device
 7475 * @dev: device
 7476 *
 7477 * Find a master upper device and return pointer to it or NULL in case
 7478 * it's not there. The caller must hold the RCU read lock.
 7479 */
 7480struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7481{
 7482	struct netdev_adjacent *upper;
 7483
 7484	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7485				       struct netdev_adjacent, list);
 7486	if (upper && likely(upper->master))
 7487		return upper->dev;
 7488	return NULL;
 7489}
 7490EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7491
 7492static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7493			      struct net_device *adj_dev,
 7494			      struct list_head *dev_list)
 7495{
 7496	char linkname[IFNAMSIZ+7];
 7497
 7498	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7499		"upper_%s" : "lower_%s", adj_dev->name);
 7500	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7501				 linkname);
 7502}
 7503static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7504			       char *name,
 7505			       struct list_head *dev_list)
 7506{
 7507	char linkname[IFNAMSIZ+7];
 7508
 7509	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7510		"upper_%s" : "lower_%s", name);
 7511	sysfs_remove_link(&(dev->dev.kobj), linkname);
 7512}
 7513
 7514static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7515						 struct net_device *adj_dev,
 7516						 struct list_head *dev_list)
 7517{
 7518	return (dev_list == &dev->adj_list.upper ||
 7519		dev_list == &dev->adj_list.lower) &&
 7520		net_eq(dev_net(dev), dev_net(adj_dev));
 7521}
 7522
 7523static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7524					struct net_device *adj_dev,
 7525					struct list_head *dev_list,
 7526					void *private, bool master)
 7527{
 7528	struct netdev_adjacent *adj;
 7529	int ret;
 7530
 7531	adj = __netdev_find_adj(adj_dev, dev_list);
 7532
 7533	if (adj) {
 7534		adj->ref_nr += 1;
 7535		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7536			 dev->name, adj_dev->name, adj->ref_nr);
 7537
 7538		return 0;
 7539	}
 7540
 7541	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7542	if (!adj)
 7543		return -ENOMEM;
 7544
 7545	adj->dev = adj_dev;
 7546	adj->master = master;
 7547	adj->ref_nr = 1;
 7548	adj->private = private;
 7549	adj->ignore = false;
 7550	dev_hold(adj_dev);
 7551
 7552	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7553		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7554
 7555	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7556		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7557		if (ret)
 7558			goto free_adj;
 7559	}
 7560
 7561	/* Ensure that master link is always the first item in list. */
 7562	if (master) {
 7563		ret = sysfs_create_link(&(dev->dev.kobj),
 7564					&(adj_dev->dev.kobj), "master");
 7565		if (ret)
 7566			goto remove_symlinks;
 7567
 7568		list_add_rcu(&adj->list, dev_list);
 7569	} else {
 7570		list_add_tail_rcu(&adj->list, dev_list);
 7571	}
 7572
 7573	return 0;
 7574
 7575remove_symlinks:
 7576	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7577		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7578free_adj:
 7579	kfree(adj);
 7580	dev_put(adj_dev);
 7581
 7582	return ret;
 7583}
 7584
 7585static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7586					 struct net_device *adj_dev,
 7587					 u16 ref_nr,
 7588					 struct list_head *dev_list)
 7589{
 7590	struct netdev_adjacent *adj;
 7591
 7592	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7593		 dev->name, adj_dev->name, ref_nr);
 7594
 7595	adj = __netdev_find_adj(adj_dev, dev_list);
 7596
 7597	if (!adj) {
 7598		pr_err("Adjacency does not exist for device %s from %s\n",
 7599		       dev->name, adj_dev->name);
 7600		WARN_ON(1);
 7601		return;
 7602	}
 7603
 7604	if (adj->ref_nr > ref_nr) {
 7605		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7606			 dev->name, adj_dev->name, ref_nr,
 7607			 adj->ref_nr - ref_nr);
 7608		adj->ref_nr -= ref_nr;
 7609		return;
 7610	}
 7611
 7612	if (adj->master)
 7613		sysfs_remove_link(&(dev->dev.kobj), "master");
 7614
 7615	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7616		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7617
 7618	list_del_rcu(&adj->list);
 7619	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 7620		 adj_dev->name, dev->name, adj_dev->name);
 7621	dev_put(adj_dev);
 7622	kfree_rcu(adj, rcu);
 7623}
 7624
 7625static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 7626					    struct net_device *upper_dev,
 7627					    struct list_head *up_list,
 7628					    struct list_head *down_list,
 7629					    void *private, bool master)
 7630{
 7631	int ret;
 7632
 7633	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 7634					   private, master);
 7635	if (ret)
 7636		return ret;
 7637
 7638	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 7639					   private, false);
 7640	if (ret) {
 7641		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 7642		return ret;
 7643	}
 7644
 7645	return 0;
 7646}
 7647
 7648static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 7649					       struct net_device *upper_dev,
 7650					       u16 ref_nr,
 7651					       struct list_head *up_list,
 7652					       struct list_head *down_list)
 7653{
 7654	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
 7655	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 7656}
 7657
 7658static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 7659						struct net_device *upper_dev,
 7660						void *private, bool master)
 7661{
 7662	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 7663						&dev->adj_list.upper,
 7664						&upper_dev->adj_list.lower,
 7665						private, master);
 7666}
 7667
 7668static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 7669						   struct net_device *upper_dev)
 7670{
 7671	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 7672					   &dev->adj_list.upper,
 7673					   &upper_dev->adj_list.lower);
 7674}
 7675
 7676static int __netdev_upper_dev_link(struct net_device *dev,
 7677				   struct net_device *upper_dev, bool master,
 7678				   void *upper_priv, void *upper_info,
 7679				   struct netdev_nested_priv *priv,
 7680				   struct netlink_ext_ack *extack)
 7681{
 7682	struct netdev_notifier_changeupper_info changeupper_info = {
 7683		.info = {
 7684			.dev = dev,
 7685			.extack = extack,
 7686		},
 7687		.upper_dev = upper_dev,
 7688		.master = master,
 7689		.linking = true,
 7690		.upper_info = upper_info,
 7691	};
 7692	struct net_device *master_dev;
 7693	int ret = 0;
 7694
 7695	ASSERT_RTNL();
 7696
 7697	if (dev == upper_dev)
 7698		return -EBUSY;
 7699
 7700	/* To prevent loops, check if dev is not upper device to upper_dev. */
 7701	if (__netdev_has_upper_dev(upper_dev, dev))
 7702		return -EBUSY;
 7703
 7704	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 7705		return -EMLINK;
 7706
 7707	if (!master) {
 7708		if (__netdev_has_upper_dev(dev, upper_dev))
 7709			return -EEXIST;
 7710	} else {
 7711		master_dev = __netdev_master_upper_dev_get(dev);
 7712		if (master_dev)
 7713			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 7714	}
 7715
 7716	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7717					    &changeupper_info.info);
 7718	ret = notifier_to_errno(ret);
 7719	if (ret)
 7720		return ret;
 7721
 7722	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 7723						   master);
 7724	if (ret)
 7725		return ret;
 7726
 7727	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7728					    &changeupper_info.info);
 7729	ret = notifier_to_errno(ret);
 7730	if (ret)
 7731		goto rollback;
 7732
 7733	__netdev_update_upper_level(dev, NULL);
 7734	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7735
 7736	__netdev_update_lower_level(upper_dev, priv);
 7737	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7738				    priv);
 7739
 7740	return 0;
 7741
 7742rollback:
 7743	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7744
 7745	return ret;
 7746}
 7747
 7748/**
 7749 * netdev_upper_dev_link - Add a link to the upper device
 7750 * @dev: device
 7751 * @upper_dev: new upper device
 7752 * @extack: netlink extended ack
 7753 *
 7754 * Adds a link to device which is upper to this one. The caller must hold
 7755 * the RTNL lock. On a failure a negative errno code is returned.
 7756 * On success the reference counts are adjusted and the function
 7757 * returns zero.
 7758 */
 7759int netdev_upper_dev_link(struct net_device *dev,
 7760			  struct net_device *upper_dev,
 7761			  struct netlink_ext_ack *extack)
 7762{
 7763	struct netdev_nested_priv priv = {
 7764		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7765		.data = NULL,
 7766	};
 7767
 7768	return __netdev_upper_dev_link(dev, upper_dev, false,
 7769				       NULL, NULL, &priv, extack);
 7770}
 7771EXPORT_SYMBOL(netdev_upper_dev_link);
 7772
 7773/**
 7774 * netdev_master_upper_dev_link - Add a master link to the upper device
 7775 * @dev: device
 7776 * @upper_dev: new upper device
 7777 * @upper_priv: upper device private
 7778 * @upper_info: upper info to be passed down via notifier
 7779 * @extack: netlink extended ack
 7780 *
 7781 * Adds a link to device which is upper to this one. In this case, only
 7782 * one master upper device can be linked, although other non-master devices
 7783 * might be linked as well. The caller must hold the RTNL lock.
 7784 * On a failure a negative errno code is returned. On success the reference
 7785 * counts are adjusted and the function returns zero.
 7786 */
 7787int netdev_master_upper_dev_link(struct net_device *dev,
 7788				 struct net_device *upper_dev,
 7789				 void *upper_priv, void *upper_info,
 7790				 struct netlink_ext_ack *extack)
 7791{
 7792	struct netdev_nested_priv priv = {
 7793		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7794		.data = NULL,
 7795	};
 7796
 7797	return __netdev_upper_dev_link(dev, upper_dev, true,
 7798				       upper_priv, upper_info, &priv, extack);
 7799}
 7800EXPORT_SYMBOL(netdev_master_upper_dev_link);
 7801
 7802static void __netdev_upper_dev_unlink(struct net_device *dev,
 7803				      struct net_device *upper_dev,
 7804				      struct netdev_nested_priv *priv)
 7805{
 7806	struct netdev_notifier_changeupper_info changeupper_info = {
 7807		.info = {
 7808			.dev = dev,
 7809		},
 7810		.upper_dev = upper_dev,
 7811		.linking = false,
 7812	};
 7813
 7814	ASSERT_RTNL();
 7815
 7816	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 7817
 7818	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7819				      &changeupper_info.info);
 7820
 7821	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7822
 7823	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7824				      &changeupper_info.info);
 7825
 7826	__netdev_update_upper_level(dev, NULL);
 7827	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7828
 7829	__netdev_update_lower_level(upper_dev, priv);
 7830	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7831				    priv);
 7832}
 7833
 7834/**
 7835 * netdev_upper_dev_unlink - Removes a link to upper device
 7836 * @dev: device
 7837 * @upper_dev: new upper device
 7838 *
 7839 * Removes a link to device which is upper to this one. The caller must hold
 7840 * the RTNL lock.
 7841 */
 7842void netdev_upper_dev_unlink(struct net_device *dev,
 7843			     struct net_device *upper_dev)
 7844{
 7845	struct netdev_nested_priv priv = {
 7846		.flags = NESTED_SYNC_TODO,
 7847		.data = NULL,
 7848	};
 7849
 7850	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
 7851}
 7852EXPORT_SYMBOL(netdev_upper_dev_unlink);
 7853
 7854static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 7855				      struct net_device *lower_dev,
 7856				      bool val)
 7857{
 7858	struct netdev_adjacent *adj;
 7859
 7860	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 7861	if (adj)
 7862		adj->ignore = val;
 7863
 7864	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 7865	if (adj)
 7866		adj->ignore = val;
 7867}
 7868
 7869static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
 7870					struct net_device *lower_dev)
 7871{
 7872	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
 7873}
 7874
 7875static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
 7876				       struct net_device *lower_dev)
 7877{
 7878	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
 7879}
 7880
 7881int netdev_adjacent_change_prepare(struct net_device *old_dev,
 7882				   struct net_device *new_dev,
 7883				   struct net_device *dev,
 7884				   struct netlink_ext_ack *extack)
 7885{
 7886	struct netdev_nested_priv priv = {
 7887		.flags = 0,
 7888		.data = NULL,
 7889	};
 7890	int err;
 7891
 7892	if (!new_dev)
 7893		return 0;
 7894
 7895	if (old_dev && new_dev != old_dev)
 7896		netdev_adjacent_dev_disable(dev, old_dev);
 7897	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
 7898				      extack);
 7899	if (err) {
 7900		if (old_dev && new_dev != old_dev)
 7901			netdev_adjacent_dev_enable(dev, old_dev);
 7902		return err;
 7903	}
 7904
 7905	return 0;
 7906}
 7907EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 7908
 7909void netdev_adjacent_change_commit(struct net_device *old_dev,
 7910				   struct net_device *new_dev,
 7911				   struct net_device *dev)
 7912{
 7913	struct netdev_nested_priv priv = {
 7914		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7915		.data = NULL,
 7916	};
 7917
 7918	if (!new_dev || !old_dev)
 7919		return;
 7920
 7921	if (new_dev == old_dev)
 7922		return;
 7923
 7924	netdev_adjacent_dev_enable(dev, old_dev);
 7925	__netdev_upper_dev_unlink(old_dev, dev, &priv);
 7926}
 7927EXPORT_SYMBOL(netdev_adjacent_change_commit);
 7928
 7929void netdev_adjacent_change_abort(struct net_device *old_dev,
 7930				  struct net_device *new_dev,
 7931				  struct net_device *dev)
 7932{
 7933	struct netdev_nested_priv priv = {
 7934		.flags = 0,
 7935		.data = NULL,
 7936	};
 7937
 7938	if (!new_dev)
 7939		return;
 7940
 7941	if (old_dev && new_dev != old_dev)
 7942		netdev_adjacent_dev_enable(dev, old_dev);
 7943
 7944	__netdev_upper_dev_unlink(new_dev, dev, &priv);
 7945}
 7946EXPORT_SYMBOL(netdev_adjacent_change_abort);
 7947
 7948/**
 7949 * netdev_bonding_info_change - Dispatch event about slave change
 7950 * @dev: device
 7951 * @bonding_info: info to dispatch
 7952 *
 7953 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 7954 * The caller must hold the RTNL lock.
 7955 */
 7956void netdev_bonding_info_change(struct net_device *dev,
 7957				struct netdev_bonding_info *bonding_info)
 7958{
 7959	struct netdev_notifier_bonding_info info = {
 7960		.info.dev = dev,
 7961	};
 7962
 7963	memcpy(&info.bonding_info, bonding_info,
 7964	       sizeof(struct netdev_bonding_info));
 7965	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 7966				      &info.info);
 7967}
 7968EXPORT_SYMBOL(netdev_bonding_info_change);
 7969
 7970/**
 7971 * netdev_get_xmit_slave - Get the xmit slave of master device
 7972 * @dev: device
 7973 * @skb: The packet
 7974 * @all_slaves: assume all the slaves are active
 7975 *
 7976 * The reference counters are not incremented so the caller must be
 7977 * careful with locks. The caller must hold RCU lock.
 7978 * %NULL is returned if no slave is found.
 7979 */
 7980
 7981struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 7982					 struct sk_buff *skb,
 7983					 bool all_slaves)
 7984{
 7985	const struct net_device_ops *ops = dev->netdev_ops;
 7986
 7987	if (!ops->ndo_get_xmit_slave)
 7988		return NULL;
 7989	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
 7990}
 7991EXPORT_SYMBOL(netdev_get_xmit_slave);
 7992
 7993static void netdev_adjacent_add_links(struct net_device *dev)
 7994{
 7995	struct netdev_adjacent *iter;
 7996
 7997	struct net *net = dev_net(dev);
 7998
 7999	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8000		if (!net_eq(net, dev_net(iter->dev)))
 8001			continue;
 8002		netdev_adjacent_sysfs_add(iter->dev, dev,
 8003					  &iter->dev->adj_list.lower);
 8004		netdev_adjacent_sysfs_add(dev, iter->dev,
 8005					  &dev->adj_list.upper);
 8006	}
 8007
 8008	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8009		if (!net_eq(net, dev_net(iter->dev)))
 8010			continue;
 8011		netdev_adjacent_sysfs_add(iter->dev, dev,
 8012					  &iter->dev->adj_list.upper);
 8013		netdev_adjacent_sysfs_add(dev, iter->dev,
 8014					  &dev->adj_list.lower);
 8015	}
 8016}
 8017
 8018static void netdev_adjacent_del_links(struct net_device *dev)
 8019{
 8020	struct netdev_adjacent *iter;
 8021
 8022	struct net *net = dev_net(dev);
 8023
 8024	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8025		if (!net_eq(net, dev_net(iter->dev)))
 8026			continue;
 8027		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8028					  &iter->dev->adj_list.lower);
 8029		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8030					  &dev->adj_list.upper);
 8031	}
 8032
 8033	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8034		if (!net_eq(net, dev_net(iter->dev)))
 8035			continue;
 8036		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8037					  &iter->dev->adj_list.upper);
 8038		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8039					  &dev->adj_list.lower);
 8040	}
 8041}
 8042
 8043void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 8044{
 8045	struct netdev_adjacent *iter;
 8046
 8047	struct net *net = dev_net(dev);
 8048
 8049	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8050		if (!net_eq(net, dev_net(iter->dev)))
 8051			continue;
 8052		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8053					  &iter->dev->adj_list.lower);
 8054		netdev_adjacent_sysfs_add(iter->dev, dev,
 8055					  &iter->dev->adj_list.lower);
 8056	}
 8057
 8058	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8059		if (!net_eq(net, dev_net(iter->dev)))
 8060			continue;
 8061		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8062					  &iter->dev->adj_list.upper);
 8063		netdev_adjacent_sysfs_add(iter->dev, dev,
 8064					  &iter->dev->adj_list.upper);
 8065	}
 8066}
 8067
 8068void *netdev_lower_dev_get_private(struct net_device *dev,
 8069				   struct net_device *lower_dev)
 8070{
 8071	struct netdev_adjacent *lower;
 8072
 8073	if (!lower_dev)
 8074		return NULL;
 8075	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 8076	if (!lower)
 8077		return NULL;
 8078
 8079	return lower->private;
 8080}
 8081EXPORT_SYMBOL(netdev_lower_dev_get_private);
 8082
 8083
 8084/**
 8085 * netdev_lower_change - Dispatch event about lower device state change
 8086 * @lower_dev: device
 8087 * @lower_state_info: state to dispatch
 8088 *
 8089 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 8090 * The caller must hold the RTNL lock.
 8091 */
 8092void netdev_lower_state_changed(struct net_device *lower_dev,
 8093				void *lower_state_info)
 8094{
 8095	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 8096		.info.dev = lower_dev,
 8097	};
 8098
 8099	ASSERT_RTNL();
 8100	changelowerstate_info.lower_state_info = lower_state_info;
 8101	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 8102				      &changelowerstate_info.info);
 8103}
 8104EXPORT_SYMBOL(netdev_lower_state_changed);
 8105
 8106static void dev_change_rx_flags(struct net_device *dev, int flags)
 8107{
 8108	const struct net_device_ops *ops = dev->netdev_ops;
 8109
 8110	if (ops->ndo_change_rx_flags)
 8111		ops->ndo_change_rx_flags(dev, flags);
 8112}
 8113
 8114static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 8115{
 8116	unsigned int old_flags = dev->flags;
 8117	kuid_t uid;
 8118	kgid_t gid;
 8119
 8120	ASSERT_RTNL();
 8121
 8122	dev->flags |= IFF_PROMISC;
 8123	dev->promiscuity += inc;
 8124	if (dev->promiscuity == 0) {
 8125		/*
 8126		 * Avoid overflow.
 8127		 * If inc causes overflow, untouch promisc and return error.
 8128		 */
 8129		if (inc < 0)
 8130			dev->flags &= ~IFF_PROMISC;
 8131		else {
 8132			dev->promiscuity -= inc;
 8133			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
 8134				dev->name);
 8135			return -EOVERFLOW;
 8136		}
 8137	}
 8138	if (dev->flags != old_flags) {
 8139		pr_info("device %s %s promiscuous mode\n",
 8140			dev->name,
 8141			dev->flags & IFF_PROMISC ? "entered" : "left");
 8142		if (audit_enabled) {
 8143			current_uid_gid(&uid, &gid);
 8144			audit_log(audit_context(), GFP_ATOMIC,
 8145				  AUDIT_ANOM_PROMISCUOUS,
 8146				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
 8147				  dev->name, (dev->flags & IFF_PROMISC),
 8148				  (old_flags & IFF_PROMISC),
 8149				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
 8150				  from_kuid(&init_user_ns, uid),
 8151				  from_kgid(&init_user_ns, gid),
 8152				  audit_get_sessionid(current));
 8153		}
 8154
 8155		dev_change_rx_flags(dev, IFF_PROMISC);
 8156	}
 8157	if (notify)
 8158		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
 8159	return 0;
 8160}
 8161
 8162/**
 8163 *	dev_set_promiscuity	- update promiscuity count on a device
 8164 *	@dev: device
 8165 *	@inc: modifier
 8166 *
 8167 *	Add or remove promiscuity from a device. While the count in the device
 8168 *	remains above zero the interface remains promiscuous. Once it hits zero
 8169 *	the device reverts back to normal filtering operation. A negative inc
 8170 *	value is used to drop promiscuity on the device.
 8171 *	Return 0 if successful or a negative errno code on error.
 8172 */
 8173int dev_set_promiscuity(struct net_device *dev, int inc)
 8174{
 8175	unsigned int old_flags = dev->flags;
 8176	int err;
 8177
 8178	err = __dev_set_promiscuity(dev, inc, true);
 8179	if (err < 0)
 8180		return err;
 8181	if (dev->flags != old_flags)
 8182		dev_set_rx_mode(dev);
 8183	return err;
 8184}
 8185EXPORT_SYMBOL(dev_set_promiscuity);
 8186
 8187static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 8188{
 8189	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 8190
 8191	ASSERT_RTNL();
 8192
 8193	dev->flags |= IFF_ALLMULTI;
 8194	dev->allmulti += inc;
 8195	if (dev->allmulti == 0) {
 8196		/*
 8197		 * Avoid overflow.
 8198		 * If inc causes overflow, untouch allmulti and return error.
 8199		 */
 8200		if (inc < 0)
 8201			dev->flags &= ~IFF_ALLMULTI;
 8202		else {
 8203			dev->allmulti -= inc;
 8204			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
 8205				dev->name);
 8206			return -EOVERFLOW;
 8207		}
 8208	}
 8209	if (dev->flags ^ old_flags) {
 8210		dev_change_rx_flags(dev, IFF_ALLMULTI);
 8211		dev_set_rx_mode(dev);
 8212		if (notify)
 8213			__dev_notify_flags(dev, old_flags,
 8214					   dev->gflags ^ old_gflags);
 8215	}
 8216	return 0;
 8217}
 8218
 8219/**
 8220 *	dev_set_allmulti	- update allmulti count on a device
 8221 *	@dev: device
 8222 *	@inc: modifier
 8223 *
 8224 *	Add or remove reception of all multicast frames to a device. While the
 8225 *	count in the device remains above zero the interface remains listening
 8226 *	to all interfaces. Once it hits zero the device reverts back to normal
 8227 *	filtering operation. A negative @inc value is used to drop the counter
 8228 *	when releasing a resource needing all multicasts.
 8229 *	Return 0 if successful or a negative errno code on error.
 8230 */
 8231
 8232int dev_set_allmulti(struct net_device *dev, int inc)
 8233{
 8234	return __dev_set_allmulti(dev, inc, true);
 8235}
 8236EXPORT_SYMBOL(dev_set_allmulti);
 8237
 8238/*
 8239 *	Upload unicast and multicast address lists to device and
 8240 *	configure RX filtering. When the device doesn't support unicast
 8241 *	filtering it is put in promiscuous mode while unicast addresses
 8242 *	are present.
 8243 */
 8244void __dev_set_rx_mode(struct net_device *dev)
 8245{
 8246	const struct net_device_ops *ops = dev->netdev_ops;
 8247
 8248	/* dev_open will call this function so the list will stay sane. */
 8249	if (!(dev->flags&IFF_UP))
 8250		return;
 8251
 8252	if (!netif_device_present(dev))
 8253		return;
 8254
 8255	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 8256		/* Unicast addresses changes may only happen under the rtnl,
 8257		 * therefore calling __dev_set_promiscuity here is safe.
 8258		 */
 8259		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 8260			__dev_set_promiscuity(dev, 1, false);
 8261			dev->uc_promisc = true;
 8262		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 8263			__dev_set_promiscuity(dev, -1, false);
 8264			dev->uc_promisc = false;
 8265		}
 8266	}
 8267
 8268	if (ops->ndo_set_rx_mode)
 8269		ops->ndo_set_rx_mode(dev);
 8270}
 8271
 8272void dev_set_rx_mode(struct net_device *dev)
 8273{
 8274	netif_addr_lock_bh(dev);
 8275	__dev_set_rx_mode(dev);
 8276	netif_addr_unlock_bh(dev);
 8277}
 8278
 8279/**
 8280 *	dev_get_flags - get flags reported to userspace
 8281 *	@dev: device
 8282 *
 8283 *	Get the combination of flag bits exported through APIs to userspace.
 8284 */
 8285unsigned int dev_get_flags(const struct net_device *dev)
 8286{
 8287	unsigned int flags;
 8288
 8289	flags = (dev->flags & ~(IFF_PROMISC |
 8290				IFF_ALLMULTI |
 8291				IFF_RUNNING |
 8292				IFF_LOWER_UP |
 8293				IFF_DORMANT)) |
 8294		(dev->gflags & (IFF_PROMISC |
 8295				IFF_ALLMULTI));
 8296
 8297	if (netif_running(dev)) {
 8298		if (netif_oper_up(dev))
 8299			flags |= IFF_RUNNING;
 8300		if (netif_carrier_ok(dev))
 8301			flags |= IFF_LOWER_UP;
 8302		if (netif_dormant(dev))
 8303			flags |= IFF_DORMANT;
 8304	}
 8305
 8306	return flags;
 8307}
 8308EXPORT_SYMBOL(dev_get_flags);
 8309
 8310int __dev_change_flags(struct net_device *dev, unsigned int flags,
 8311		       struct netlink_ext_ack *extack)
 8312{
 8313	unsigned int old_flags = dev->flags;
 8314	int ret;
 8315
 8316	ASSERT_RTNL();
 8317
 8318	/*
 8319	 *	Set the flags on our device.
 8320	 */
 8321
 8322	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
 8323			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 8324			       IFF_AUTOMEDIA)) |
 8325		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
 8326				    IFF_ALLMULTI));
 8327
 8328	/*
 8329	 *	Load in the correct multicast list now the flags have changed.
 8330	 */
 8331
 8332	if ((old_flags ^ flags) & IFF_MULTICAST)
 8333		dev_change_rx_flags(dev, IFF_MULTICAST);
 8334
 8335	dev_set_rx_mode(dev);
 8336
 8337	/*
 8338	 *	Have we downed the interface. We handle IFF_UP ourselves
 8339	 *	according to user attempts to set it, rather than blindly
 8340	 *	setting it.
 8341	 */
 8342
 8343	ret = 0;
 8344	if ((old_flags ^ flags) & IFF_UP) {
 8345		if (old_flags & IFF_UP)
 8346			__dev_close(dev);
 8347		else
 8348			ret = __dev_open(dev, extack);
 8349	}
 8350
 8351	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 8352		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 8353		unsigned int old_flags = dev->flags;
 8354
 8355		dev->gflags ^= IFF_PROMISC;
 8356
 8357		if (__dev_set_promiscuity(dev, inc, false) >= 0)
 8358			if (dev->flags != old_flags)
 8359				dev_set_rx_mode(dev);
 8360	}
 8361
 8362	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 8363	 * is important. Some (broken) drivers set IFF_PROMISC, when
 8364	 * IFF_ALLMULTI is requested not asking us and not reporting.
 8365	 */
 8366	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 8367		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
 8368
 8369		dev->gflags ^= IFF_ALLMULTI;
 8370		__dev_set_allmulti(dev, inc, false);
 8371	}
 8372
 8373	return ret;
 8374}
 8375
 8376void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 8377			unsigned int gchanges)
 8378{
 8379	unsigned int changes = dev->flags ^ old_flags;
 8380
 8381	if (gchanges)
 8382		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
 8383
 8384	if (changes & IFF_UP) {
 8385		if (dev->flags & IFF_UP)
 8386			call_netdevice_notifiers(NETDEV_UP, dev);
 8387		else
 8388			call_netdevice_notifiers(NETDEV_DOWN, dev);
 8389	}
 8390
 8391	if (dev->flags & IFF_UP &&
 8392	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 8393		struct netdev_notifier_change_info change_info = {
 8394			.info = {
 8395				.dev = dev,
 8396			},
 8397			.flags_changed = changes,
 8398		};
 8399
 8400		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 8401	}
 8402}
 8403
 8404/**
 8405 *	dev_change_flags - change device settings
 8406 *	@dev: device
 8407 *	@flags: device state flags
 8408 *	@extack: netlink extended ack
 8409 *
 8410 *	Change settings on device based state flags. The flags are
 8411 *	in the userspace exported format.
 8412 */
 8413int dev_change_flags(struct net_device *dev, unsigned int flags,
 8414		     struct netlink_ext_ack *extack)
 8415{
 8416	int ret;
 8417	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 8418
 8419	ret = __dev_change_flags(dev, flags, extack);
 8420	if (ret < 0)
 8421		return ret;
 8422
 8423	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 8424	__dev_notify_flags(dev, old_flags, changes);
 8425	return ret;
 8426}
 8427EXPORT_SYMBOL(dev_change_flags);
 8428
 8429int __dev_set_mtu(struct net_device *dev, int new_mtu)
 8430{
 8431	const struct net_device_ops *ops = dev->netdev_ops;
 8432
 8433	if (ops->ndo_change_mtu)
 8434		return ops->ndo_change_mtu(dev, new_mtu);
 8435
 8436	/* Pairs with all the lockless reads of dev->mtu in the stack */
 8437	WRITE_ONCE(dev->mtu, new_mtu);
 8438	return 0;
 8439}
 8440EXPORT_SYMBOL(__dev_set_mtu);
 8441
 8442int dev_validate_mtu(struct net_device *dev, int new_mtu,
 8443		     struct netlink_ext_ack *extack)
 8444{
 8445	/* MTU must be positive, and in range */
 8446	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 8447		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 8448		return -EINVAL;
 8449	}
 8450
 8451	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 8452		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 8453		return -EINVAL;
 8454	}
 8455	return 0;
 8456}
 8457
 8458/**
 8459 *	dev_set_mtu_ext - Change maximum transfer unit
 8460 *	@dev: device
 8461 *	@new_mtu: new transfer unit
 8462 *	@extack: netlink extended ack
 8463 *
 8464 *	Change the maximum transfer size of the network device.
 8465 */
 8466int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 8467		    struct netlink_ext_ack *extack)
 8468{
 8469	int err, orig_mtu;
 8470
 8471	if (new_mtu == dev->mtu)
 8472		return 0;
 8473
 8474	err = dev_validate_mtu(dev, new_mtu, extack);
 8475	if (err)
 8476		return err;
 8477
 8478	if (!netif_device_present(dev))
 8479		return -ENODEV;
 8480
 8481	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 8482	err = notifier_to_errno(err);
 8483	if (err)
 8484		return err;
 8485
 8486	orig_mtu = dev->mtu;
 8487	err = __dev_set_mtu(dev, new_mtu);
 8488
 8489	if (!err) {
 8490		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8491						   orig_mtu);
 8492		err = notifier_to_errno(err);
 8493		if (err) {
 8494			/* setting mtu back and notifying everyone again,
 8495			 * so that they have a chance to revert changes.
 8496			 */
 8497			__dev_set_mtu(dev, orig_mtu);
 8498			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8499						     new_mtu);
 8500		}
 8501	}
 8502	return err;
 8503}
 8504
 8505int dev_set_mtu(struct net_device *dev, int new_mtu)
 8506{
 8507	struct netlink_ext_ack extack;
 8508	int err;
 8509
 8510	memset(&extack, 0, sizeof(extack));
 8511	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 8512	if (err && extack._msg)
 8513		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 8514	return err;
 8515}
 8516EXPORT_SYMBOL(dev_set_mtu);
 8517
 8518/**
 8519 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 8520 *	@dev: device
 8521 *	@new_len: new tx queue length
 8522 */
 8523int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 8524{
 8525	unsigned int orig_len = dev->tx_queue_len;
 8526	int res;
 8527
 8528	if (new_len != (unsigned int)new_len)
 8529		return -ERANGE;
 8530
 8531	if (new_len != orig_len) {
 8532		dev->tx_queue_len = new_len;
 8533		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 8534		res = notifier_to_errno(res);
 8535		if (res)
 8536			goto err_rollback;
 8537		res = dev_qdisc_change_tx_queue_len(dev);
 8538		if (res)
 8539			goto err_rollback;
 8540	}
 8541
 8542	return 0;
 8543
 8544err_rollback:
 8545	netdev_err(dev, "refused to change device tx_queue_len\n");
 8546	dev->tx_queue_len = orig_len;
 8547	return res;
 8548}
 8549
 8550/**
 8551 *	dev_set_group - Change group this device belongs to
 8552 *	@dev: device
 8553 *	@new_group: group this device should belong to
 8554 */
 8555void dev_set_group(struct net_device *dev, int new_group)
 8556{
 8557	dev->group = new_group;
 8558}
 8559EXPORT_SYMBOL(dev_set_group);
 8560
 8561/**
 8562 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 8563 *	@dev: device
 8564 *	@addr: new address
 8565 *	@extack: netlink extended ack
 8566 */
 8567int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 8568			      struct netlink_ext_ack *extack)
 8569{
 8570	struct netdev_notifier_pre_changeaddr_info info = {
 8571		.info.dev = dev,
 8572		.info.extack = extack,
 8573		.dev_addr = addr,
 8574	};
 8575	int rc;
 8576
 8577	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 8578	return notifier_to_errno(rc);
 8579}
 8580EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 8581
 8582/**
 8583 *	dev_set_mac_address - Change Media Access Control Address
 8584 *	@dev: device
 8585 *	@sa: new address
 8586 *	@extack: netlink extended ack
 8587 *
 8588 *	Change the hardware (MAC) address of the device
 8589 */
 8590int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 8591			struct netlink_ext_ack *extack)
 8592{
 8593	const struct net_device_ops *ops = dev->netdev_ops;
 8594	int err;
 8595
 8596	if (!ops->ndo_set_mac_address)
 8597		return -EOPNOTSUPP;
 8598	if (sa->sa_family != dev->type)
 8599		return -EINVAL;
 8600	if (!netif_device_present(dev))
 8601		return -ENODEV;
 8602	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 8603	if (err)
 8604		return err;
 8605	err = ops->ndo_set_mac_address(dev, sa);
 8606	if (err)
 8607		return err;
 8608	dev->addr_assign_type = NET_ADDR_SET;
 8609	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 8610	add_device_randomness(dev->dev_addr, dev->addr_len);
 8611	return 0;
 8612}
 8613EXPORT_SYMBOL(dev_set_mac_address);
 8614
 8615/**
 8616 *	dev_change_carrier - Change device carrier
 8617 *	@dev: device
 8618 *	@new_carrier: new value
 8619 *
 8620 *	Change device carrier
 8621 */
 8622int dev_change_carrier(struct net_device *dev, bool new_carrier)
 8623{
 8624	const struct net_device_ops *ops = dev->netdev_ops;
 8625
 8626	if (!ops->ndo_change_carrier)
 8627		return -EOPNOTSUPP;
 8628	if (!netif_device_present(dev))
 8629		return -ENODEV;
 8630	return ops->ndo_change_carrier(dev, new_carrier);
 8631}
 8632EXPORT_SYMBOL(dev_change_carrier);
 8633
 8634/**
 8635 *	dev_get_phys_port_id - Get device physical port ID
 8636 *	@dev: device
 8637 *	@ppid: port ID
 8638 *
 8639 *	Get device physical port ID
 8640 */
 8641int dev_get_phys_port_id(struct net_device *dev,
 8642			 struct netdev_phys_item_id *ppid)
 8643{
 8644	const struct net_device_ops *ops = dev->netdev_ops;
 8645
 8646	if (!ops->ndo_get_phys_port_id)
 8647		return -EOPNOTSUPP;
 8648	return ops->ndo_get_phys_port_id(dev, ppid);
 8649}
 8650EXPORT_SYMBOL(dev_get_phys_port_id);
 8651
 8652/**
 8653 *	dev_get_phys_port_name - Get device physical port name
 8654 *	@dev: device
 8655 *	@name: port name
 8656 *	@len: limit of bytes to copy to name
 8657 *
 8658 *	Get device physical port name
 8659 */
 8660int dev_get_phys_port_name(struct net_device *dev,
 8661			   char *name, size_t len)
 8662{
 8663	const struct net_device_ops *ops = dev->netdev_ops;
 8664	int err;
 8665
 8666	if (ops->ndo_get_phys_port_name) {
 8667		err = ops->ndo_get_phys_port_name(dev, name, len);
 8668		if (err != -EOPNOTSUPP)
 8669			return err;
 8670	}
 8671	return devlink_compat_phys_port_name_get(dev, name, len);
 8672}
 8673EXPORT_SYMBOL(dev_get_phys_port_name);
 8674
 8675/**
 8676 *	dev_get_port_parent_id - Get the device's port parent identifier
 8677 *	@dev: network device
 8678 *	@ppid: pointer to a storage for the port's parent identifier
 8679 *	@recurse: allow/disallow recursion to lower devices
 8680 *
 8681 *	Get the devices's port parent identifier
 8682 */
 8683int dev_get_port_parent_id(struct net_device *dev,
 8684			   struct netdev_phys_item_id *ppid,
 8685			   bool recurse)
 8686{
 8687	const struct net_device_ops *ops = dev->netdev_ops;
 8688	struct netdev_phys_item_id first = { };
 8689	struct net_device *lower_dev;
 8690	struct list_head *iter;
 8691	int err;
 8692
 8693	if (ops->ndo_get_port_parent_id) {
 8694		err = ops->ndo_get_port_parent_id(dev, ppid);
 8695		if (err != -EOPNOTSUPP)
 8696			return err;
 8697	}
 8698
 8699	err = devlink_compat_switch_id_get(dev, ppid);
 8700	if (!err || err != -EOPNOTSUPP)
 8701		return err;
 8702
 8703	if (!recurse)
 8704		return -EOPNOTSUPP;
 8705
 8706	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 8707		err = dev_get_port_parent_id(lower_dev, ppid, recurse);
 8708		if (err)
 8709			break;
 8710		if (!first.id_len)
 8711			first = *ppid;
 8712		else if (memcmp(&first, ppid, sizeof(*ppid)))
 8713			return -EOPNOTSUPP;
 8714	}
 8715
 8716	return err;
 8717}
 8718EXPORT_SYMBOL(dev_get_port_parent_id);
 8719
 8720/**
 8721 *	netdev_port_same_parent_id - Indicate if two network devices have
 8722 *	the same port parent identifier
 8723 *	@a: first network device
 8724 *	@b: second network device
 8725 */
 8726bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 8727{
 8728	struct netdev_phys_item_id a_id = { };
 8729	struct netdev_phys_item_id b_id = { };
 8730
 8731	if (dev_get_port_parent_id(a, &a_id, true) ||
 8732	    dev_get_port_parent_id(b, &b_id, true))
 8733		return false;
 8734
 8735	return netdev_phys_item_id_same(&a_id, &b_id);
 8736}
 8737EXPORT_SYMBOL(netdev_port_same_parent_id);
 8738
 8739/**
 8740 *	dev_change_proto_down - update protocol port state information
 8741 *	@dev: device
 8742 *	@proto_down: new value
 8743 *
 8744 *	This info can be used by switch drivers to set the phys state of the
 8745 *	port.
 8746 */
 8747int dev_change_proto_down(struct net_device *dev, bool proto_down)
 8748{
 8749	const struct net_device_ops *ops = dev->netdev_ops;
 8750
 8751	if (!ops->ndo_change_proto_down)
 8752		return -EOPNOTSUPP;
 8753	if (!netif_device_present(dev))
 8754		return -ENODEV;
 8755	return ops->ndo_change_proto_down(dev, proto_down);
 8756}
 8757EXPORT_SYMBOL(dev_change_proto_down);
 8758
 8759/**
 8760 *	dev_change_proto_down_generic - generic implementation for
 8761 * 	ndo_change_proto_down that sets carrier according to
 8762 * 	proto_down.
 8763 *
 8764 *	@dev: device
 8765 *	@proto_down: new value
 8766 */
 8767int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
 8768{
 8769	if (proto_down)
 8770		netif_carrier_off(dev);
 8771	else
 8772		netif_carrier_on(dev);
 8773	dev->proto_down = proto_down;
 8774	return 0;
 8775}
 8776EXPORT_SYMBOL(dev_change_proto_down_generic);
 8777
 8778/**
 8779 *	dev_change_proto_down_reason - proto down reason
 8780 *
 8781 *	@dev: device
 8782 *	@mask: proto down mask
 8783 *	@value: proto down value
 8784 */
 8785void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
 8786				  u32 value)
 8787{
 8788	int b;
 8789
 8790	if (!mask) {
 8791		dev->proto_down_reason = value;
 8792	} else {
 8793		for_each_set_bit(b, &mask, 32) {
 8794			if (value & (1 << b))
 8795				dev->proto_down_reason |= BIT(b);
 8796			else
 8797				dev->proto_down_reason &= ~BIT(b);
 8798		}
 8799	}
 8800}
 8801EXPORT_SYMBOL(dev_change_proto_down_reason);
 8802
 8803struct bpf_xdp_link {
 8804	struct bpf_link link;
 8805	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
 8806	int flags;
 8807};
 8808
 8809static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
 8810{
 8811	if (flags & XDP_FLAGS_HW_MODE)
 8812		return XDP_MODE_HW;
 8813	if (flags & XDP_FLAGS_DRV_MODE)
 8814		return XDP_MODE_DRV;
 8815	if (flags & XDP_FLAGS_SKB_MODE)
 8816		return XDP_MODE_SKB;
 8817	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
 8818}
 8819
 8820static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
 8821{
 8822	switch (mode) {
 8823	case XDP_MODE_SKB:
 8824		return generic_xdp_install;
 8825	case XDP_MODE_DRV:
 8826	case XDP_MODE_HW:
 8827		return dev->netdev_ops->ndo_bpf;
 8828	default:
 8829		return NULL;
 8830	};
 8831}
 8832
 8833static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
 8834					 enum bpf_xdp_mode mode)
 8835{
 8836	return dev->xdp_state[mode].link;
 8837}
 8838
 8839static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 8840				     enum bpf_xdp_mode mode)
 8841{
 8842	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
 8843
 8844	if (link)
 8845		return link->link.prog;
 8846	return dev->xdp_state[mode].prog;
 8847}
 8848
 8849u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 8850{
 8851	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
 8852
 8853	return prog ? prog->aux->id : 0;
 8854}
 8855
 8856static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
 8857			     struct bpf_xdp_link *link)
 8858{
 8859	dev->xdp_state[mode].link = link;
 8860	dev->xdp_state[mode].prog = NULL;
 8861}
 8862
 8863static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
 8864			     struct bpf_prog *prog)
 8865{
 8866	dev->xdp_state[mode].link = NULL;
 8867	dev->xdp_state[mode].prog = prog;
 8868}
 8869
 8870static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
 8871			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
 8872			   u32 flags, struct bpf_prog *prog)
 8873{
 8874	struct netdev_bpf xdp;
 8875	int err;
 8876
 8877	memset(&xdp, 0, sizeof(xdp));
 8878	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
 8879	xdp.extack = extack;
 8880	xdp.flags = flags;
 8881	xdp.prog = prog;
 8882
 8883	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
 8884	 * "moved" into driver), so they don't increment it on their own, but
 8885	 * they do decrement refcnt when program is detached or replaced.
 8886	 * Given net_device also owns link/prog, we need to bump refcnt here
 8887	 * to prevent drivers from underflowing it.
 8888	 */
 8889	if (prog)
 8890		bpf_prog_inc(prog);
 8891	err = bpf_op(dev, &xdp);
 8892	if (err) {
 8893		if (prog)
 8894			bpf_prog_put(prog);
 8895		return err;
 8896	}
 8897
 8898	if (mode != XDP_MODE_HW)
 8899		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
 8900
 8901	return 0;
 8902}
 8903
 8904static void dev_xdp_uninstall(struct net_device *dev)
 8905{
 8906	struct bpf_xdp_link *link;
 8907	struct bpf_prog *prog;
 8908	enum bpf_xdp_mode mode;
 8909	bpf_op_t bpf_op;
 8910
 8911	ASSERT_RTNL();
 8912
 8913	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
 8914		prog = dev_xdp_prog(dev, mode);
 8915		if (!prog)
 8916			continue;
 8917
 8918		bpf_op = dev_xdp_bpf_op(dev, mode);
 8919		if (!bpf_op)
 8920			continue;
 8921
 8922		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 8923
 8924		/* auto-detach link from net device */
 8925		link = dev_xdp_link(dev, mode);
 8926		if (link)
 8927			link->dev = NULL;
 8928		else
 8929			bpf_prog_put(prog);
 8930
 8931		dev_xdp_set_link(dev, mode, NULL);
 8932	}
 8933}
 8934
 8935static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
 8936			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
 8937			  struct bpf_prog *old_prog, u32 flags)
 8938{
 8939	struct bpf_prog *cur_prog;
 8940	enum bpf_xdp_mode mode;
 8941	bpf_op_t bpf_op;
 8942	int err;
 8943
 8944	ASSERT_RTNL();
 8945
 8946	/* either link or prog attachment, never both */
 8947	if (link && (new_prog || old_prog))
 8948		return -EINVAL;
 8949	/* link supports only XDP mode flags */
 8950	if (link && (flags & ~XDP_FLAGS_MODES)) {
 8951		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
 8952		return -EINVAL;
 8953	}
 8954	/* just one XDP mode bit should be set, zero defaults to SKB mode */
 8955	if (hweight32(flags & XDP_FLAGS_MODES) > 1) {
 8956		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
 8957		return -EINVAL;
 8958	}
 8959	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
 8960	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
 8961		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
 8962		return -EINVAL;
 8963	}
 8964
 8965	mode = dev_xdp_mode(dev, flags);
 8966	/* can't replace attached link */
 8967	if (dev_xdp_link(dev, mode)) {
 8968		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
 8969		return -EBUSY;
 8970	}
 8971
 8972	cur_prog = dev_xdp_prog(dev, mode);
 8973	/* can't replace attached prog with link */
 8974	if (link && cur_prog) {
 8975		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
 8976		return -EBUSY;
 8977	}
 8978	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
 8979		NL_SET_ERR_MSG(extack, "Active program does not match expected");
 8980		return -EEXIST;
 8981	}
 8982
 8983	/* put effective new program into new_prog */
 8984	if (link)
 8985		new_prog = link->link.prog;
 8986
 8987	if (new_prog) {
 8988		bool offload = mode == XDP_MODE_HW;
 8989		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
 8990					       ? XDP_MODE_DRV : XDP_MODE_SKB;
 8991
 8992		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
 8993			NL_SET_ERR_MSG(extack, "XDP program already attached");
 8994			return -EBUSY;
 8995		}
 8996		if (!offload && dev_xdp_prog(dev, other_mode)) {
 8997			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
 8998			return -EEXIST;
 8999		}
 9000		if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
 9001			NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
 9002			return -EINVAL;
 9003		}
 9004		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
 9005			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
 9006			return -EINVAL;
 9007		}
 9008		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
 9009			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
 9010			return -EINVAL;
 9011		}
 9012	}
 9013
 9014	/* don't call drivers if the effective program didn't change */
 9015	if (new_prog != cur_prog) {
 9016		bpf_op = dev_xdp_bpf_op(dev, mode);
 9017		if (!bpf_op) {
 9018			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
 9019			return -EOPNOTSUPP;
 9020		}
 9021
 9022		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
 9023		if (err)
 9024			return err;
 9025	}
 9026
 9027	if (link)
 9028		dev_xdp_set_link(dev, mode, link);
 9029	else
 9030		dev_xdp_set_prog(dev, mode, new_prog);
 9031	if (cur_prog)
 9032		bpf_prog_put(cur_prog);
 9033
 9034	return 0;
 9035}
 9036
 9037static int dev_xdp_attach_link(struct net_device *dev,
 9038			       struct netlink_ext_ack *extack,
 9039			       struct bpf_xdp_link *link)
 9040{
 9041	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
 9042}
 9043
 9044static int dev_xdp_detach_link(struct net_device *dev,
 9045			       struct netlink_ext_ack *extack,
 9046			       struct bpf_xdp_link *link)
 9047{
 9048	enum bpf_xdp_mode mode;
 9049	bpf_op_t bpf_op;
 9050
 9051	ASSERT_RTNL();
 9052
 9053	mode = dev_xdp_mode(dev, link->flags);
 9054	if (dev_xdp_link(dev, mode) != link)
 9055		return -EINVAL;
 9056
 9057	bpf_op = dev_xdp_bpf_op(dev, mode);
 9058	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9059	dev_xdp_set_link(dev, mode, NULL);
 9060	return 0;
 9061}
 9062
 9063static void bpf_xdp_link_release(struct bpf_link *link)
 9064{
 9065	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9066
 9067	rtnl_lock();
 9068
 9069	/* if racing with net_device's tear down, xdp_link->dev might be
 9070	 * already NULL, in which case link was already auto-detached
 9071	 */
 9072	if (xdp_link->dev) {
 9073		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
 9074		xdp_link->dev = NULL;
 9075	}
 9076
 9077	rtnl_unlock();
 9078}
 9079
 9080static int bpf_xdp_link_detach(struct bpf_link *link)
 9081{
 9082	bpf_xdp_link_release(link);
 9083	return 0;
 9084}
 9085
 9086static void bpf_xdp_link_dealloc(struct bpf_link *link)
 9087{
 9088	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9089
 9090	kfree(xdp_link);
 9091}
 9092
 9093static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
 9094				     struct seq_file *seq)
 9095{
 9096	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9097	u32 ifindex = 0;
 9098
 9099	rtnl_lock();
 9100	if (xdp_link->dev)
 9101		ifindex = xdp_link->dev->ifindex;
 9102	rtnl_unlock();
 9103
 9104	seq_printf(seq, "ifindex:\t%u\n", ifindex);
 9105}
 9106
 9107static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
 9108				       struct bpf_link_info *info)
 9109{
 9110	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9111	u32 ifindex = 0;
 9112
 9113	rtnl_lock();
 9114	if (xdp_link->dev)
 9115		ifindex = xdp_link->dev->ifindex;
 9116	rtnl_unlock();
 9117
 9118	info->xdp.ifindex = ifindex;
 9119	return 0;
 9120}
 9121
 9122static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 9123			       struct bpf_prog *old_prog)
 9124{
 9125	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9126	enum bpf_xdp_mode mode;
 9127	bpf_op_t bpf_op;
 9128	int err = 0;
 9129
 9130	rtnl_lock();
 9131
 9132	/* link might have been auto-released already, so fail */
 9133	if (!xdp_link->dev) {
 9134		err = -ENOLINK;
 9135		goto out_unlock;
 9136	}
 9137
 9138	if (old_prog && link->prog != old_prog) {
 9139		err = -EPERM;
 9140		goto out_unlock;
 9141	}
 9142	old_prog = link->prog;
 9143	if (old_prog == new_prog) {
 9144		/* no-op, don't disturb drivers */
 9145		bpf_prog_put(new_prog);
 9146		goto out_unlock;
 9147	}
 9148
 9149	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
 9150	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
 9151	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
 9152			      xdp_link->flags, new_prog);
 9153	if (err)
 9154		goto out_unlock;
 9155
 9156	old_prog = xchg(&link->prog, new_prog);
 9157	bpf_prog_put(old_prog);
 9158
 9159out_unlock:
 9160	rtnl_unlock();
 9161	return err;
 9162}
 9163
 9164static const struct bpf_link_ops bpf_xdp_link_lops = {
 9165	.release = bpf_xdp_link_release,
 9166	.dealloc = bpf_xdp_link_dealloc,
 9167	.detach = bpf_xdp_link_detach,
 9168	.show_fdinfo = bpf_xdp_link_show_fdinfo,
 9169	.fill_link_info = bpf_xdp_link_fill_link_info,
 9170	.update_prog = bpf_xdp_link_update,
 9171};
 9172
 9173int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 9174{
 9175	struct net *net = current->nsproxy->net_ns;
 9176	struct bpf_link_primer link_primer;
 9177	struct bpf_xdp_link *link;
 9178	struct net_device *dev;
 9179	int err, fd;
 9180
 9181	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
 9182	if (!dev)
 9183		return -EINVAL;
 9184
 9185	link = kzalloc(sizeof(*link), GFP_USER);
 9186	if (!link) {
 9187		err = -ENOMEM;
 9188		goto out_put_dev;
 9189	}
 9190
 9191	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
 9192	link->dev = dev;
 9193	link->flags = attr->link_create.flags;
 9194
 9195	err = bpf_link_prime(&link->link, &link_primer);
 9196	if (err) {
 9197		kfree(link);
 9198		goto out_put_dev;
 9199	}
 9200
 9201	rtnl_lock();
 9202	err = dev_xdp_attach_link(dev, NULL, link);
 9203	rtnl_unlock();
 9204
 9205	if (err) {
 9206		bpf_link_cleanup(&link_primer);
 9207		goto out_put_dev;
 9208	}
 9209
 9210	fd = bpf_link_settle(&link_primer);
 9211	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
 9212	dev_put(dev);
 9213	return fd;
 9214
 9215out_put_dev:
 9216	dev_put(dev);
 9217	return err;
 9218}
 9219
 9220/**
 9221 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 9222 *	@dev: device
 9223 *	@extack: netlink extended ack
 9224 *	@fd: new program fd or negative value to clear
 9225 *	@expected_fd: old program fd that userspace expects to replace or clear
 9226 *	@flags: xdp-related flags
 9227 *
 9228 *	Set or clear a bpf program for a device
 9229 */
 9230int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 9231		      int fd, int expected_fd, u32 flags)
 9232{
 9233	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
 9234	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
 9235	int err;
 9236
 9237	ASSERT_RTNL();
 9238
 9239	if (fd >= 0) {
 9240		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
 9241						 mode != XDP_MODE_SKB);
 9242		if (IS_ERR(new_prog))
 9243			return PTR_ERR(new_prog);
 9244	}
 9245
 9246	if (expected_fd >= 0) {
 9247		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
 9248						 mode != XDP_MODE_SKB);
 9249		if (IS_ERR(old_prog)) {
 9250			err = PTR_ERR(old_prog);
 9251			old_prog = NULL;
 9252			goto err_out;
 9253		}
 9254	}
 9255
 9256	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
 9257
 9258err_out:
 9259	if (err && new_prog)
 9260		bpf_prog_put(new_prog);
 9261	if (old_prog)
 9262		bpf_prog_put(old_prog);
 9263	return err;
 9264}
 9265
 9266/**
 9267 *	dev_new_index	-	allocate an ifindex
 9268 *	@net: the applicable net namespace
 9269 *
 9270 *	Returns a suitable unique value for a new device interface
 9271 *	number.  The caller must hold the rtnl semaphore or the
 9272 *	dev_base_lock to be sure it remains unique.
 9273 */
 9274static int dev_new_index(struct net *net)
 9275{
 9276	int ifindex = net->ifindex;
 9277
 9278	for (;;) {
 9279		if (++ifindex <= 0)
 9280			ifindex = 1;
 9281		if (!__dev_get_by_index(net, ifindex))
 9282			return net->ifindex = ifindex;
 9283	}
 9284}
 9285
/* Delayed registration/unregistration */
/* devices are appended to this list by net_set_todo() */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 9289
 9290static void net_set_todo(struct net_device *dev)
 9291{
 9292	list_add_tail(&dev->todo_list, &net_todo_list);
 9293	dev_net(dev)->dev_unreg_count++;
 9294}
 9295
/* Undo the registration of every device linked on @head through its
 * unreg_list member. Must be called with RTNL held and after the boot
 * phase. Devices that were never registered are dropped from @head with a
 * warning; all others must be in state NETREG_REGISTERED.
 *
 * The work is done in passes over the whole list: mark and validate,
 * close, unlist, then per-device teardown, with synchronize_net() calls
 * between the later passes. The final pass calls dev_put() on each device.
 */
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* _safe iteration: entries may be removed from @head below */
	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head, true);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
	}
	flush_all_backlogs();

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		/* RTM_DELLINK message, built before teardown and sent after */
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);

		dev_xdp_uninstall(dev);

		/* Notify protocols, that we are about to destroy
		 * this device. They should clean all the things.
		 */
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/* same condition register_netdevice() uses for RTM_NEWLINK */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
						     GFP_KERNEL, NULL, 0);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		netdev_name_node_alt_flush(dev);
		netdev_name_node_free(dev->name_node);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));
		WARN_ON(netdev_has_any_lower_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	/* presumably pairs with the dev_hold() in register_netdevice() */
	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}
 9386
 9387static void rollback_registered(struct net_device *dev)
 9388{
 9389	LIST_HEAD(single);
 9390
 9391	list_add(&dev->unreg_list, &single);
 9392	rollback_registered_many(&single);
 9393	list_del(&single);
 9394}
 9395
 9396static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
 9397	struct net_device *upper, netdev_features_t features)
 9398{
 9399	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9400	netdev_features_t feature;
 9401	int feature_bit;
 9402
 9403	for_each_netdev_feature(upper_disables, feature_bit) {
 9404		feature = __NETIF_F_BIT(feature_bit);
 9405		if (!(upper->wanted_features & feature)
 9406		    && (features & feature)) {
 9407			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
 9408				   &feature, upper->name);
 9409			features &= ~feature;
 9410		}
 9411	}
 9412
 9413	return features;
 9414}
 9415
 9416static void netdev_sync_lower_features(struct net_device *upper,
 9417	struct net_device *lower, netdev_features_t features)
 9418{
 9419	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9420	netdev_features_t feature;
 9421	int feature_bit;
 9422
 9423	for_each_netdev_feature(upper_disables, feature_bit) {
 9424		feature = __NETIF_F_BIT(feature_bit);
 9425		if (!(features & feature) && (lower->features & feature)) {
 9426			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 9427				   &feature, lower->name);
 9428			lower->wanted_features &= ~feature;
 9429			__netdev_update_features(lower);
 9430
 9431			if (unlikely(lower->features & feature))
 9432				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 9433					    &feature, lower->name);
 9434			else
 9435				netdev_features_change(lower);
 9436		}
 9437	}
 9438}
 9439
 9440static netdev_features_t netdev_fix_features(struct net_device *dev,
 9441	netdev_features_t features)
 9442{
 9443	/* Fix illegal checksum combinations */
 9444	if ((features & NETIF_F_HW_CSUM) &&
 9445	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
 9446		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
 9447		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
 9448	}
 9449
 9450	/* TSO requires that SG is present as well. */
 9451	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
 9452		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
 9453		features &= ~NETIF_F_ALL_TSO;
 9454	}
 9455
 9456	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
 9457					!(features & NETIF_F_IP_CSUM)) {
 9458		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
 9459		features &= ~NETIF_F_TSO;
 9460		features &= ~NETIF_F_TSO_ECN;
 9461	}
 9462
 9463	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
 9464					 !(features & NETIF_F_IPV6_CSUM)) {
 9465		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
 9466		features &= ~NETIF_F_TSO6;
 9467	}
 9468
 9469	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
 9470	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
 9471		features &= ~NETIF_F_TSO_MANGLEID;
 9472
 9473	/* TSO ECN requires that TSO is present as well. */
 9474	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
 9475		features &= ~NETIF_F_TSO_ECN;
 9476
 9477	/* Software GSO depends on SG. */
 9478	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
 9479		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
 9480		features &= ~NETIF_F_GSO;
 9481	}
 9482
 9483	/* GSO partial features require GSO partial be set */
 9484	if ((features & dev->gso_partial_features) &&
 9485	    !(features & NETIF_F_GSO_PARTIAL)) {
 9486		netdev_dbg(dev,
 9487			   "Dropping partially supported GSO features since no GSO partial.\n");
 9488		features &= ~dev->gso_partial_features;
 9489	}
 9490
 9491	if (!(features & NETIF_F_RXCSUM)) {
 9492		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
 9493		 * successfully merged by hardware must also have the
 9494		 * checksum verified by hardware.  If the user does not
 9495		 * want to enable RXCSUM, logically, we should disable GRO_HW.
 9496		 */
 9497		if (features & NETIF_F_GRO_HW) {
 9498			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
 9499			features &= ~NETIF_F_GRO_HW;
 9500		}
 9501	}
 9502
 9503	/* LRO/HW-GRO features cannot be combined with RX-FCS */
 9504	if (features & NETIF_F_RXFCS) {
 9505		if (features & NETIF_F_LRO) {
 9506			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
 9507			features &= ~NETIF_F_LRO;
 9508		}
 9509
 9510		if (features & NETIF_F_GRO_HW) {
 9511			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
 9512			features &= ~NETIF_F_GRO_HW;
 9513		}
 9514	}
 9515
 9516	return features;
 9517}
 9518
/* Recompute @dev's feature set and apply it. Must be called with RTNL held.
 *
 * The wanted features are passed through the driver's ndo_fix_features (if
 * any), netdev_fix_features() and every upper device's restrictions. If the
 * result differs from dev->features it is handed to ndo_set_features (if
 * any). Lower devices are then synced and, when the driver call returned
 * 0, dev->features is updated.
 *
 * Returns -1 if ndo_set_features failed. Otherwise returns 0 when err is
 * negative at the end (which includes the "nothing changed" path, since
 * err starts at -1) and 1 otherwise. Callers treat non-zero as "send a
 * features-changed notification".
 */
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	/* stays -1 when the computed set equals dev->features */
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	if (!err) {
		/* bits that differ between the new and the current set */
		netdev_features_t diff = features ^ dev->features;

		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
			/* udp_tunnel_{get,drop}_rx_info both need
			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
			 * device, or they won't do anything.
			 * Thus we need to update dev->features
			 * *before* calling udp_tunnel_get_rx_info,
			 * but *after* calling udp_tunnel_drop_rx_info.
			 */
			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
				dev->features = features;
				udp_tunnel_get_rx_info(dev);
			} else {
				udp_tunnel_drop_rx_info(dev);
			}
		}

		/* same before/after ordering for the VLAN filter features;
		 * the get helpers' return values are OR-ed into err
		 */
		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_ctag_filter_info(dev);
			} else {
				vlan_drop_rx_ctag_filter_info(dev);
			}
		}

		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_stag_filter_info(dev);
			} else {
				vlan_drop_rx_stag_filter_info(dev);
			}
		}

		dev->features = features;
	}

	return err < 0 ? 0 : 1;
}
 9610
 9611/**
 9612 *	netdev_update_features - recalculate device features
 9613 *	@dev: the device to check
 9614 *
 9615 *	Recalculate dev->features set and send notifications if it
 9616 *	has changed. Should be called after driver or hardware dependent
 9617 *	conditions might have changed that influence the features.
 9618 */
 9619void netdev_update_features(struct net_device *dev)
 9620{
 9621	if (__netdev_update_features(dev))
 9622		netdev_features_change(dev);
 9623}
 9624EXPORT_SYMBOL(netdev_update_features);
 9625
 9626/**
 9627 *	netdev_change_features - recalculate device features
 9628 *	@dev: the device to check
 9629 *
 9630 *	Recalculate dev->features set and send notifications even
 9631 *	if they have not changed. Should be called instead of
 9632 *	netdev_update_features() if also dev->vlan_features might
 9633 *	have changed to allow the changes to be propagated to stacked
 9634 *	VLAN devices.
 9635 */
 9636void netdev_change_features(struct net_device *dev)
 9637{
 9638	__netdev_update_features(dev);
 9639	netdev_features_change(dev);
 9640}
 9641EXPORT_SYMBOL(netdev_change_features);
 9642
 9643/**
 9644 *	netif_stacked_transfer_operstate -	transfer operstate
 9645 *	@rootdev: the root or lower level device to transfer state from
 9646 *	@dev: the device to transfer operstate to
 9647 *
 9648 *	Transfer operational state from root to device. This is normally
 9649 *	called when a stacking relationship exists between the root
 9650 *	device and the device(a leaf device).
 9651 */
 9652void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 9653					struct net_device *dev)
 9654{
 9655	if (rootdev->operstate == IF_OPER_DORMANT)
 9656		netif_dormant_on(dev);
 9657	else
 9658		netif_dormant_off(dev);
 9659
 9660	if (rootdev->operstate == IF_OPER_TESTING)
 9661		netif_testing_on(dev);
 9662	else
 9663		netif_testing_off(dev);
 9664
 9665	if (netif_carrier_ok(rootdev))
 9666		netif_carrier_on(dev);
 9667	else
 9668		netif_carrier_off(dev);
 9669}
 9670EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 9671
 9672static int netif_alloc_rx_queues(struct net_device *dev)
 9673{
 9674	unsigned int i, count = dev->num_rx_queues;
 9675	struct netdev_rx_queue *rx;
 9676	size_t sz = count * sizeof(*rx);
 9677	int err = 0;
 9678
 9679	BUG_ON(count < 1);
 9680
 9681	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9682	if (!rx)
 9683		return -ENOMEM;
 9684
 9685	dev->_rx = rx;
 9686
 9687	for (i = 0; i < count; i++) {
 9688		rx[i].dev = dev;
 9689
 9690		/* XDP RX-queue setup */
 9691		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
 9692		if (err < 0)
 9693			goto err_rxq_info;
 9694	}
 9695	return 0;
 9696
 9697err_rxq_info:
 9698	/* Rollback successful reg's and free other resources */
 9699	while (i--)
 9700		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 9701	kvfree(dev->_rx);
 9702	dev->_rx = NULL;
 9703	return err;
 9704}
 9705
 9706static void netif_free_rx_queues(struct net_device *dev)
 9707{
 9708	unsigned int i, count = dev->num_rx_queues;
 9709
 9710	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
 9711	if (!dev->_rx)
 9712		return;
 9713
 9714	for (i = 0; i < count; i++)
 9715		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
 9716
 9717	kvfree(dev->_rx);
 9718}
 9719
 9720static void netdev_init_one_queue(struct net_device *dev,
 9721				  struct netdev_queue *queue, void *_unused)
 9722{
 9723	/* Initialize queue lock */
 9724	spin_lock_init(&queue->_xmit_lock);
 9725	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
 9726	queue->xmit_lock_owner = -1;
 9727	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
 9728	queue->dev = dev;
 9729#ifdef CONFIG_BQL
 9730	dql_init(&queue->dql, HZ);
 9731#endif
 9732}
 9733
 9734static void netif_free_tx_queues(struct net_device *dev)
 9735{
 9736	kvfree(dev->_tx);
 9737}
 9738
 9739static int netif_alloc_netdev_queues(struct net_device *dev)
 9740{
 9741	unsigned int count = dev->num_tx_queues;
 9742	struct netdev_queue *tx;
 9743	size_t sz = count * sizeof(*tx);
 9744
 9745	if (count < 1 || count > 0xffff)
 9746		return -EINVAL;
 9747
 9748	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9749	if (!tx)
 9750		return -ENOMEM;
 9751
 9752	dev->_tx = tx;
 9753
 9754	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 9755	spin_lock_init(&dev->tx_global_lock);
 9756
 9757	return 0;
 9758}
 9759
 9760void netif_tx_stop_all_queues(struct net_device *dev)
 9761{
 9762	unsigned int i;
 9763
 9764	for (i = 0; i < dev->num_tx_queues; i++) {
 9765		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 9766
 9767		netif_tx_stop_queue(txq);
 9768	}
 9769}
 9770EXPORT_SYMBOL(netif_tx_stop_all_queues);
 9771
/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	Failures before the device is listed unwind through the err_uninit /
 *	err_free_name labels; a failing %NETDEV_REGISTER notifier is undone
 *	with rollback_registered() instead.
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	/* netdev_features_t must have a bit for every feature */
	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
		     NETDEV_FEATURE_COUNT);
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	ret = ethtool_check_ops(dev->ethtool_ops);
	if (ret)
		return ret;

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	ret = -ENOMEM;
	dev->name_node = netdev_name_node_head_alloc(dev);
	if (!dev->name_node)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			/* positive driver return values are reported as -EIO */
			if (ret > 0)
				ret = -EIO;
			goto err_free_name;
		}
	}

	/* advertising CTAG filtering requires both VLAN vid callbacks */
	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	/* allocate an ifindex, or fail with -EBUSY if a preset one is taken */
	ret = -EBUSY;
	if (!dev->ifindex)
		dev->ifindex = dev_new_index(net);
	else if (__dev_get_by_index(net, dev->ifindex))
		goto err_uninit;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
	dev->features |= NETIF_F_SOFT_FEATURES;

	if (dev->netdev_ops->ndo_udp_tunnel_add) {
		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
	}

	dev->wanted_features = dev->features & dev->hw_features;

	/* added after wanted_features is computed, so it is not wanted by default */
	if (!(dev->flags & IFF_LOOPBACK))
		dev->hw_features |= NETIF_F_NOCACHE_COPY;

	/* If IPv4 TCP segmentation offload is supported we should also
	 * allow the device to enable segmenting the frame with the option
	 * of ignoring a static IP ID value.  This doesn't enable the
	 * feature itself but allows the user to enable it later.
	 */
	if (dev->hw_features & NETIF_F_TSO)
		dev->hw_features |= NETIF_F_TSO_MANGLEID;
	if (dev->vlan_features & NETIF_F_TSO)
		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
	if (dev->mpls_features & NETIF_F_TSO)
		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
	if (dev->hw_enc_features & NETIF_F_TSO)
		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret) {
		dev->reg_state = NETREG_UNREGISTERED;
		goto err_uninit;
	}
	dev->reg_state = NETREG_REGISTERED;

	__netdev_update_features(dev);

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		rollback_registered(dev);
		rcu_barrier();

		dev->reg_state = NETREG_UNREGISTERED;
		/* We should put the kobject that hold in
		 * netdev_unregister_kobject(), otherwise
		 * the net device cannot be freed when
		 * driver calls free_netdev(), because the
		 * kobject is being hold.
		 */
		kobject_put(&dev->dev.kobj);
		/* NOTE(review): there is no jump to "out" here, so the
		 * RTM_NEWLINK check below still runs for the rolled-back
		 * device - confirm rtmsg_ifinfo() ignores devices that are
		 * not NETREG_REGISTERED.
		 */
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

out:
	return ret;

	/* error unwinding for failures that happen after ndo_init */
err_uninit:
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	if (dev->priv_destructor)
		dev->priv_destructor(dev);
err_free_name:
	netdev_name_node_free(dev->name_node);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
 9960
 9961/**
 9962 *	init_dummy_netdev	- init a dummy network device for NAPI
 9963 *	@dev: device to init
 9964 *
 9965 *	This takes a network device structure and initialize the minimum
 9966 *	amount of fields so it can be used to schedule NAPI polls without
 9967 *	registering a full blown interface. This is to be used by drivers
 9968 *	that need to tie several hardware interfaces to a single NAPI
 9969 *	poll scheduler due to HW limitations.
 9970 */
 9971int init_dummy_netdev(struct net_device *dev)
 9972{
 9973	/* Clear everything. Note we don't initialize spinlocks
 9974	 * are they aren't supposed to be taken by any of the
 9975	 * NAPI code and this dummy netdev is supposed to be
 9976	 * only ever used for NAPI polls
 9977	 */
 9978	memset(dev, 0, sizeof(struct net_device));
 9979
 9980	/* make sure we BUG if trying to hit standard
 9981	 * register/unregister code path
 9982	 */
 9983	dev->reg_state = NETREG_DUMMY;
 9984
 9985	/* NAPI wants this */
 9986	INIT_LIST_HEAD(&dev->napi_list);
 9987
 9988	/* a dummy interface is started by default */
 9989	set_bit(__LINK_STATE_PRESENT, &dev->state);
 9990	set_bit(__LINK_STATE_START, &dev->state);
 9991
 9992	/* napi_busy_loop stats accounting wants this */
 9993	dev_net_set(dev, &init_net);
 9994
 9995	/* Note : We dont allocate pcpu_refcnt for dummy devices,
 9996	 * because users of this 'device' dont need to change
 9997	 * its refcount.
 9998	 */
 9999
10000	return 0;
10001}
10002EXPORT_SYMBOL_GPL(init_dummy_netdev);
10003
10004
10005/**
10006 *	register_netdev	- register a network device
10007 *	@dev: device to register
10008 *
10009 *	Take a completed network device structure and add it to the kernel
10010 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10011 *	chain. 0 is returned on success. A negative errno code is returned
10012 *	on a failure to set up the device, or if the name is a duplicate.
10013 *
10014 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10015 *	and expands the device name if you passed a format string to
10016 *	alloc_netdev.
10017 */
10018int register_netdev(struct net_device *dev)
10019{
10020	int err;
10021
10022	if (rtnl_lock_killable())
10023		return -EINTR;
10024	err = register_netdevice(dev);
10025	rtnl_unlock();
10026	return err;
10027}
10028EXPORT_SYMBOL(register_netdev);
10029
10030int netdev_refcnt_read(const struct net_device *dev)
10031{
10032	int i, refcnt = 0;
10033
10034	for_each_possible_cpu(i)
10035		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10036	return refcnt;
10037}
10038EXPORT_SYMBOL(netdev_refcnt_read);
10039
/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * The reference count is polled every 250 ms.  Whenever more than one
 * second has elapsed since the last broadcast, %NETDEV_UNREGISTER is
 * sent again under the rtnl lock.  Whenever more than ten seconds have
 * elapsed since the last warning and references remain, an emergency
 * message with the device name and count is logged.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			/* rcu_barrier() is called with the rtnl lock dropped */
			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		/* Rate-limit the "still waiting" message to once per 10s */
		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
10100
10101/* The sequence is:
10102 *
10103 *	rtnl_lock();
10104 *	...
10105 *	register_netdevice(x1);
10106 *	register_netdevice(x2);
10107 *	...
10108 *	unregister_netdevice(y1);
10109 *	unregister_netdevice(y2);
10110 *      ...
10111 *	rtnl_unlock();
10112 *	free_netdev(y1);
10113 *	free_netdev(y2);
10114 *
10115 * We are invoked by rtnl_unlock().
10116 * This allows us to deal with problems:
10117 * 1) We can delete sysfs objects which invoke hotplug
10118 *    without deadlocking with linkwatch via keventd.
10119 * 2) Since we run with the RTNL semaphore not held, we can sleep
10120 *    safely in order to wait for the netdev refcnt to drop to zero.
10121 *
10122 * We must not return until all unregister events added during
10123 * the interval the lock was held have been completed.
10124 */
10125void netdev_run_todo(void)
10126{
10127	struct list_head list;
10128#ifdef CONFIG_LOCKDEP
10129	struct list_head unlink_list;
10130
10131	list_replace_init(&net_unlink_list, &unlink_list);
10132
10133	while (!list_empty(&unlink_list)) {
10134		struct net_device *dev = list_first_entry(&unlink_list,
10135							  struct net_device,
10136							  unlink_list);
10137		list_del(&dev->unlink_list);
10138		dev->nested_level = dev->lower_level - 1;
10139	}
10140#endif
10141
10142	/* Snapshot list, allow later requests */
10143	list_replace_init(&net_todo_list, &list);
10144
10145	__rtnl_unlock();
10146
10147
10148	/* Wait for rcu callbacks to finish before next phase */
10149	if (!list_empty(&list))
10150		rcu_barrier();
10151
10152	while (!list_empty(&list)) {
10153		struct net_device *dev
10154			= list_first_entry(&list, struct net_device, todo_list);
10155		list_del(&dev->todo_list);
10156
10157		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10158			pr_err("network todo '%s' but state %d\n",
10159			       dev->name, dev->reg_state);
10160			dump_stack();
10161			continue;
10162		}
10163
10164		dev->reg_state = NETREG_UNREGISTERED;
10165
10166		netdev_wait_allrefs(dev);
10167
10168		/* paranoia */
10169		BUG_ON(netdev_refcnt_read(dev));
10170		BUG_ON(!list_empty(&dev->ptype_all));
10171		BUG_ON(!list_empty(&dev->ptype_specific));
10172		WARN_ON(rcu_access_pointer(dev->ip_ptr));
10173		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10174#if IS_ENABLED(CONFIG_DECNET)
10175		WARN_ON(dev->dn_ptr);
10176#endif
10177		if (dev->priv_destructor)
10178			dev->priv_destructor(dev);
10179		if (dev->needs_free_netdev)
10180			free_netdev(dev);
10181
10182		/* Report a network device has been unregistered */
10183		rtnl_lock();
10184		dev_net(dev)->dev_unreg_count--;
10185		__rtnl_unlock();
10186		wake_up(&netdev_unregistering_wq);
10187
10188		/* Free network device */
10189		kobject_put(&dev->dev.kobj);
10190	}
10191}
10192
10193/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10194 * all the same fields in the same order as net_device_stats, with only
10195 * the type differing, but rtnl_link_stats64 may have additional fields
10196 * at the end for newer counters.
10197 */
10198void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10199			     const struct net_device_stats *netdev_stats)
10200{
10201#if BITS_PER_LONG == 64
10202	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
10203	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
10204	/* zero out counters that only exist in rtnl_link_stats64 */
10205	memset((char *)stats64 + sizeof(*netdev_stats), 0,
10206	       sizeof(*stats64) - sizeof(*netdev_stats));
10207#else
10208	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
10209	const unsigned long *src = (const unsigned long *)netdev_stats;
10210	u64 *dst = (u64 *)stats64;
10211
10212	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10213	for (i = 0; i < n; i++)
10214		dst[i] = src[i];
10215	/* zero out counters that only exist in rtnl_link_stats64 */
10216	memset((char *)stats64 + n * sizeof(u64), 0,
10217	       sizeof(*stats64) - n * sizeof(u64));
10218#endif
10219}
10220EXPORT_SYMBOL(netdev_stats_to_stats64);
10221
10222/**
10223 *	dev_get_stats	- get network device statistics
10224 *	@dev: device to get statistics from
10225 *	@storage: place to store stats
10226 *
10227 *	Get network statistics from device. Return @storage.
10228 *	The device driver may provide its own method by setting
10229 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10230 *	otherwise the internal statistics structure is used.
10231 */
10232struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10233					struct rtnl_link_stats64 *storage)
10234{
10235	const struct net_device_ops *ops = dev->netdev_ops;
10236
10237	if (ops->ndo_get_stats64) {
10238		memset(storage, 0, sizeof(*storage));
10239		ops->ndo_get_stats64(dev, storage);
10240	} else if (ops->ndo_get_stats) {
10241		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10242	} else {
10243		netdev_stats_to_stats64(storage, &dev->stats);
10244	}
10245	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
10246	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
10247	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
10248	return storage;
10249}
10250EXPORT_SYMBOL(dev_get_stats);
10251
/* Return the device's ingress queue.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, attached to &noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT, whatever dev_ingress_queue() reports is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* publish only once fully initialized */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
10269
10270static const struct ethtool_ops default_ethtool_ops;
10271
10272void netdev_set_default_ethtool_ops(struct net_device *dev,
10273				    const struct ethtool_ops *ops)
10274{
10275	if (dev->ethtool_ops == &default_ethtool_ops)
10276		dev->ethtool_ops = ops;
10277}
10278EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10279
10280void netdev_freemem(struct net_device *dev)
10281{
10282	char *addr = (char *)dev - dev->padded;
10283
10284	kvfree(addr);
10285}
10286
/**
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization.  Also allocates subqueue structs
 * for each queue on the device.
 *
 * @name must be shorter than the device name field (enforced with
 * BUG_ON), and both @txqs and @rxqs must be at least 1.
 *
 * Return: the new device, or NULL if a queue count is zero or any
 * allocation fails.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	unsigned int alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!p)
		return NULL;

	/* Align the device inside the allocation and remember the offset
	 * so that netdev_freemem() can recover the original pointer.
	 */
	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->upper_level = 1;
	dev->lower_level = 1;
#ifdef CONFIG_LOCKDEP
	dev->nested_level = 0;
	INIT_LIST_HEAD(&dev->unlink_list);
#endif

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
	INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
	hash_init(dev->qdisc_hash);
#endif
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	/* Driver callback runs on top of the defaults set above */
	setup(dev);

	/* A setup() that left tx_queue_len at zero gets IFF_NO_QUEUE and
	 * the default queue length.
	 */
	if (!dev->tx_queue_len) {
		dev->priv_flags |= IFF_NO_QUEUE;
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
	}

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;

	/* Length was checked by the BUG_ON() at the top */
	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	/* Keep ethtool_ops non-NULL unless setup() supplied its own */
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_ingress_init(dev);

	return dev;

	/* From setup() onwards, failures are unwound by free_netdev() */
free_all:
	free_netdev(dev);
	return NULL;

	/* Early failures: only undo what has been allocated so far */
free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
10409
10410/**
10411 * free_netdev - free network device
10412 * @dev: device
10413 *
10414 * This function does the last stage of destroying an allocated device
10415 * interface. The reference to the device object is released. If this
10416 * is the last reference then it will be freed.Must be called in process
10417 * context.
10418 */
10419void free_netdev(struct net_device *dev)
10420{
10421	struct napi_struct *p, *n;
10422
10423	might_sleep();
10424	netif_free_tx_queues(dev);
10425	netif_free_rx_queues(dev);
10426
10427	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10428
10429	/* Flush device addresses */
10430	dev_addr_flush(dev);
10431
10432	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10433		netif_napi_del(p);
10434
10435	free_percpu(dev->pcpu_refcnt);
10436	dev->pcpu_refcnt = NULL;
10437	free_percpu(dev->xdp_bulkq);
10438	dev->xdp_bulkq = NULL;
10439
10440	/*  Compatibility with error handling in drivers */
10441	if (dev->reg_state == NETREG_UNINITIALIZED) {
10442		netdev_freemem(dev);
10443		return;
10444	}
10445
10446	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10447	dev->reg_state = NETREG_RELEASED;
10448
10449	/* will free via device release */
10450	put_device(&dev->dev);
10451}
10452EXPORT_SYMBOL(free_netdev);
10453
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used when the rtnl lock is
 *	held, the normal one otherwise.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
10469
10470/**
10471 *	unregister_netdevice_queue - remove device from the kernel
10472 *	@dev: device
10473 *	@head: list
10474 *
10475 *	This function shuts down a device interface and removes it
10476 *	from the kernel tables.
10477 *	If head not NULL, device is queued to be unregistered later.
10478 *
10479 *	Callers must hold the rtnl semaphore.  You may want
10480 *	unregister_netdev() instead of this.
10481 */
10482
10483void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10484{
10485	ASSERT_RTNL();
10486
10487	if (head) {
10488		list_move_tail(&dev->unreg_list, head);
10489	} else {
10490		rollback_registered(dev);
10491		/* Finish processing unregister after unlock */
10492		net_set_todo(dev);
10493	}
10494}
10495EXPORT_SYMBOL(unregister_netdevice_queue);
10496
10497/**
10498 *	unregister_netdevice_many - unregister many devices
10499 *	@head: list of devices
10500 *
10501 *  Note: As most callers use a stack allocated list_head,
10502 *  we force a list_del() to make sure stack wont be corrupted later.
10503 */
10504void unregister_netdevice_many(struct list_head *head)
10505{
10506	struct net_device *dev;
10507
10508	if (!list_empty(head)) {
10509		rollback_registered_many(head);
10510		list_for_each_entry(dev, head, unreg_list)
10511			net_set_todo(dev);
10512		list_del(head);
10513	}
10514}
10515EXPORT_SYMBOL(unregister_netdevice_many);
10516
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	/* Take the rtnl semaphore around the unregister call */
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
10535
/**
 *	dev_change_net_namespace - move device to a different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	-EINVAL is returned for namespace-local devices and for devices
 *	that are not in the NETREG_REGISTERED state.  -EEXIST is returned
 *	when the name is taken in @net and @pat is NULL.  Moving a device
 *	to the namespace it is already in succeeds without doing anything.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	struct net *net_old = dev_net(dev);
	int err, new_nsid, new_ifindex;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(net_old, net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		err = dev_get_valid_name(net, dev, pat);
		if (err < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 * Nothing below this point returns early.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	 * this device. They should clean all the things.
	 *
	 * Note that dev->reg_state stays at NETREG_REGISTERED.
	 * This is wanted because this way 8021q and macvlan know
	 * the device is just moving and can keep their slaves up.
	 */
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();

	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex))
		new_ifindex = dev_new_index(net);
	else
		new_ifindex = dev->ifindex;

	/* Announce the removal, passing the new nsid and ifindex along */
	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
			    new_ifindex);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	netdev_adjacent_del_links(dev);

	/* Move per-net netdevice notifiers that are following the netdevice */
	move_netdevice_notifiers_dev_net(dev, net);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);
	dev->ifindex = new_ifindex;

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
	netdev_adjacent_add_links(dev);

	/* Fixup kobjects; a failure is only warned about, not propagated */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Adapt owner in case owning user namespace of target network
	 * namespace is different from the original one.
	 * As above, a failure only triggers a warning.
	 */
	err = netdev_change_owner(dev, net_old, net);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
10668
/* CPU hotplug callback, registered for CPUHP_NET_DEV_DEAD by
 * net_dev_init().  Moves the networking work still queued on @oldcpu's
 * softnet_data over to the CPU this callback runs on: the completion
 * queue, the output queue, the NAPI poll list (except the backlog NAPI)
 * and the packets left in process_queue and input_pkt_queue.
 *
 * Always returns 0.
 */
static int dev_cpu_dead(unsigned int oldcpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu;
	struct softnet_data *sd, *oldsd, *remsd = NULL;

	/* The list splicing below is done with local interrupts off */
	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		/* leave the old queue empty, tail pointing at its head */
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception :
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

#ifdef CONFIG_RPS
	remsd = oldsd->rps_ipi_list;
	oldsd->rps_ipi_list = NULL;
#endif
	/* send out pending IPI's on offline CPU */
	net_rps_send_ipi(remsd);

	/* Process offline CPU's process_queue and input_pkt_queue by
	 * feeding every packet back through netif_rx_ni().
	 */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return 0;
}
10734
10735/**
10736 *	netdev_increment_features - increment feature set by one
10737 *	@all: current feature set
10738 *	@one: new feature set
10739 *	@mask: mask feature set
10740 *
10741 *	Computes a new feature set after adding a device with feature set
10742 *	@one to the master device with current feature set @all.  Will not
10743 *	enable anything that is off in @mask. Returns the new feature set.
10744 */
10745netdev_features_t netdev_increment_features(netdev_features_t all,
10746	netdev_features_t one, netdev_features_t mask)
10747{
10748	if (mask & NETIF_F_HW_CSUM)
10749		mask |= NETIF_F_CSUM_MASK;
10750	mask |= NETIF_F_VLAN_CHALLENGED;
10751
10752	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
10753	all &= one | ~NETIF_F_ALL_FOR_ALL;
10754
10755	/* If one device supports hw checksumming, set for all. */
10756	if (all & NETIF_F_HW_CSUM)
10757		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
10758
10759	return all;
10760}
10761EXPORT_SYMBOL(netdev_increment_features);
10762
10763static struct hlist_head * __net_init netdev_create_hash(void)
10764{
10765	int i;
10766	struct hlist_head *hash;
10767
10768	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
10769	if (hash != NULL)
10770		for (i = 0; i < NETDEV_HASHENTRIES; i++)
10771			INIT_HLIST_HEAD(&hash[i]);
10772
10773	return hash;
10774}
10775
10776/* Initialize per network namespace state */
10777static int __net_init netdev_init(struct net *net)
10778{
10779	BUILD_BUG_ON(GRO_HASH_BUCKETS >
10780		     8 * sizeof_field(struct napi_struct, gro_bitmask));
10781
10782	if (net != &init_net)
10783		INIT_LIST_HEAD(&net->dev_base_head);
10784
10785	net->dev_name_head = netdev_create_hash();
10786	if (net->dev_name_head == NULL)
10787		goto err_name;
10788
10789	net->dev_index_head = netdev_create_hash();
10790	if (net->dev_index_head == NULL)
10791		goto err_idx;
10792
10793	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
10794
10795	return 0;
10796
10797err_idx:
10798	kfree(net->dev_name_head);
10799err_name:
10800	return -ENOMEM;
10801}
10802
10803/**
10804 *	netdev_drivername - network driver for the device
10805 *	@dev: network device
10806 *
10807 *	Determine network driver for device.
10808 */
10809const char *netdev_drivername(const struct net_device *dev)
10810{
10811	const struct device_driver *driver;
10812	const struct device *parent;
10813	const char *empty = "";
10814
10815	parent = dev->dev.parent;
10816	if (!parent)
10817		return empty;
10818
10819	driver = parent->driver;
10820	if (driver && driver->name)
10821		return driver->name;
10822	return empty;
10823}
10824
10825static void __netdev_printk(const char *level, const struct net_device *dev,
10826			    struct va_format *vaf)
10827{
10828	if (dev && dev->dev.parent) {
10829		dev_printk_emit(level[1] - '0',
10830				dev->dev.parent,
10831				"%s %s %s%s: %pV",
10832				dev_driver_string(dev->dev.parent),
10833				dev_name(dev->dev.parent),
10834				netdev_name(dev), netdev_reg_state(dev),
10835				vaf);
10836	} else if (dev) {
10837		printk("%s%s%s: %pV",
10838		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
10839	} else {
10840		printk("%s(NULL net_device): %pV", level, vaf);
10841	}
10842}
10843
10844void netdev_printk(const char *level, const struct net_device *dev,
10845		   const char *format, ...)
10846{
10847	struct va_format vaf;
10848	va_list args;
10849
10850	va_start(args, format);
10851
10852	vaf.fmt = format;
10853	vaf.va = &args;
10854
10855	__netdev_printk(level, dev, &vaf);
10856
10857	va_end(args);
10858}
10859EXPORT_SYMBOL(netdev_printk);
10860
/* Generate an exported printing helper @func(dev, fmt, ...) that behaves
 * like netdev_printk() with the log level fixed to @level.  One helper
 * is instantiated below for each of KERN_EMERG through KERN_INFO.
 */
#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
10885
10886static void __net_exit netdev_exit(struct net *net)
10887{
10888	kfree(net->dev_name_head);
10889	kfree(net->dev_index_head);
10890	if (net != &init_net)
10891		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
10892}
10893
/* Per-namespace setup/teardown of the core netdev state; registered
 * with register_pernet_subsys() from net_dev_init().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
10898
10899static void __net_exit default_device_exit(struct net *net)
10900{
10901	struct net_device *dev, *aux;
10902	/*
10903	 * Push all migratable network devices back to the
10904	 * initial network namespace
10905	 */
10906	rtnl_lock();
10907	for_each_netdev_safe(net, dev, aux) {
10908		int err;
10909		char fb_name[IFNAMSIZ];
10910
10911		/* Ignore unmoveable devices (i.e. loopback) */
10912		if (dev->features & NETIF_F_NETNS_LOCAL)
10913			continue;
10914
10915		/* Leave virtual devices for the generic cleanup */
10916		if (dev->rtnl_link_ops)
10917			continue;
10918
10919		/* Push remaining network devices to init_net */
10920		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
10921		if (__dev_get_by_name(&init_net, fb_name))
10922			snprintf(fb_name, IFNAMSIZ, "dev%%d");
10923		err = dev_change_net_namespace(dev, &init_net, fb_name);
10924		if (err) {
10925			pr_emerg("%s: failed to move %s to init_net: %d\n",
10926				 __func__, dev->name, err);
10927			BUG();
10928		}
10929	}
10930	rtnl_unlock();
10931}
10932
/* Take the rtnl lock once no namespace on @net_list has devices that
 * are still unregistering (dev_unreg_count > 0).
 *
 * While any such namespace exists, the lock is dropped again and the
 * caller sleeps on netdev_unregistering_wq, which netdev_run_todo()
 * wakes after decrementing dev_unreg_count.
 */
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	/* Get on the wait queue before checking the counters */
	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		/* Nothing pending: leave the loop with the lock held */
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}
10960
10961static void __net_exit default_device_exit_batch(struct list_head *net_list)
10962{
10963	/* At exit all network devices most be removed from a network
10964	 * namespace.  Do this in the reverse order of registration.
10965	 * Do this across as many network namespaces as possible to
10966	 * improve batching efficiency.
10967	 */
10968	struct net_device *dev;
10969	struct net *net;
10970	LIST_HEAD(dev_kill_list);
10971
10972	/* To prevent network device cleanup code from dereferencing
10973	 * loopback devices or network devices that have been freed
10974	 * wait here for all pending unregistrations to complete,
10975	 * before unregistring the loopback device and allowing the
10976	 * network namespace be freed.
10977	 *
10978	 * The netdev todo list containing all network devices
10979	 * unregistrations that happen in default_device_exit_batch
10980	 * will run in the rtnl_unlock() at the end of
10981	 * default_device_exit_batch.
10982	 */
10983	rtnl_lock_unregistering(net_list);
10984	list_for_each_entry(net, net_list, exit_list) {
10985		for_each_netdev_reverse(net, dev) {
10986			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
10987				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
10988			else
10989				unregister_netdevice_queue(dev, &dev_kill_list);
10990		}
10991	}
10992	unregister_netdevice_many(&dev_kill_list);
10993	rtnl_unlock();
10994}
10995
/* Namespace exit hooks that move or unregister the devices left in a
 * dying namespace; registered with register_pernet_device() from
 * net_dev_init().
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
11000
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 *
 *       Returns 0 on success and -ENOMEM if any of the registration
 *       steps fails.  A failure to install the CPU hotplug callback
 *       only triggers a warning.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	/* Packet type and offload handler lists start out empty */
	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		INIT_WORK(flush, flush_backlog);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
		skb_queue_head_init(&sd->xfrm_backlog);
#endif
		INIT_LIST_HEAD(&sd->poll_list);
		/* empty output queue: the tail pointer refers to the head */
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		/* Per-cpu backlog NAPI context, polled by process_backlog() */
		init_gro_hash(&sd->backlog);
		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special if any other network devices
	 * is present in a network namespace the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices.  Ensuring the loopback devices
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	/* Have dev_cpu_dead() run for CPUs that go offline */
	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
				       NULL, dev_cpu_dead);
	WARN_ON(rc < 0);
	/* Reaching this point counts as success even if the above warned */
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);