Linux Audio

Check our new training course

Loading...
v3.5.6
 
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
 
  84#include <linux/mutex.h>
 
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
 
 
 
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
 
  99#include <linux/rtnetlink.h>
 100#include <linux/proc_fs.h>
 101#include <linux/seq_file.h>
 102#include <linux/stat.h>
 
 103#include <net/dst.h>
 
 
 104#include <net/pkt_sched.h>
 
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/kmod.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/wext.h>
 115#include <net/iw_handler.h>
 116#include <asm/current.h>
 117#include <linux/audit.h>
 118#include <linux/dmaengine.h>
 119#include <linux/err.h>
 120#include <linux/ctype.h>
 121#include <linux/if_arp.h>
 122#include <linux/if_vlan.h>
 123#include <linux/ip.h>
 124#include <net/ip.h>
 
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/net_tstamp.h>
 136#include <linux/static_key.h>
 137#include <net/flow_keys.h>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 138
 
 139#include "net-sysfs.h"
 140
/*
 * NOTE(review): presumably a cap used by the GRO code; its users are not
 * visible in this part of the file.
 * Instead of increasing this, you should create a hash table.
 */
#define MAX_GRO_SKBS 8

/*
 * MAX_HEADER plus 128 bytes.
 * This should be increased if a protocol with a bigger head is added.
 */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147/*
 148 *	The list of packet types we will receive (as opposed to discard)
 149 *	and the routines to invoke.
 150 *
 151 *	Why 16. Because with 16 the only overlap we get on a hash of the
 152 *	low nibble of the protocol value is RARP/SNAP/X.25.
 153 *
 154 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 155 *             sure which should go first, but I bet it won't make much
 156 *             difference if we are running VLANs.  The good news is that
 157 *             this protocol won't be in the list unless compiled in, so
 158 *             the average user (w/out VLANs) will not be adversely affected.
 159 *             --BLG
 160 *
 161 *		0800	IP
 162 *		8100    802.1Q VLAN
 163 *		0001	802.3
 164 *		0002	AX.25
 165 *		0004	802.2
 166 *		8035	RARP
 167 *		0005	SNAP
 168 *		0805	X.25
 169 *		0806	ARP
 170 *		8137	IPX
 171 *		0009	Localtalk
 172 *		86DD	IPv6
 173 */
 174
/* Number of buckets in ptype_base[]. */
#define PTYPE_HASH_SIZE	(16)
/* ptype_head() ANDs the host-order protocol number with this to pick a bucket. */
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

/* Held by dev_add_pack() and __dev_remove_pack() while they modify the lists. */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for every type other than ETH_P_ALL, bucketed by ptype_head(). */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers whose type is ETH_P_ALL. */
static struct list_head ptype_all __read_mostly;	/* Taps */
 
 
 
 
 
 
 
 
 181
 182/*
 183 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 184 * semaphore.
 185 *
 186 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 187 *
 188 * Writers must hold the rtnl semaphore while they loop through the
 189 * dev_base_head list, and hold dev_base_lock for writing when they do the
 190 * actual updates.  This allows pure readers to access the list even
 191 * while a writer is preparing to update it.
 192 *
 193 * To put it another way, dev_base_lock is held for writing only to
 194 * protect against pure readers; the rtnl semaphore provides the
 195 * protection against other writers.
 196 *
 197 * See, for example usages, register_netdevice() and
 198 * unregister_netdevice(), which must be called with the rtnl
 199 * semaphore held.
 200 */
/*
 * Taken for writing (with bottom halves disabled) by list_netdevice() and
 * unlist_netdevice() around the actual list updates.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 203
 
 
 
 
 
 
 
 
 
 
 204static inline void dev_base_seq_inc(struct net *net)
 205{
 206	while (++net->dev_base_seq == 0);
 
 207}
 208
 209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 210{
 211	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 212
 213	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 214}
 215
 216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 217{
 218	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 219}
 220
 221static inline void rps_lock(struct softnet_data *sd)
 
 222{
 223#ifdef CONFIG_RPS
 224	spin_lock(&sd->input_pkt_queue.lock);
 225#endif
 
 226}
 227
 228static inline void rps_unlock(struct softnet_data *sd)
 229{
 230#ifdef CONFIG_RPS
 231	spin_unlock(&sd->input_pkt_queue.lock);
 232#endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 233}
 234
 235/* Device list insertion */
 236static int list_netdevice(struct net_device *dev)
 237{
 238	struct net *net = dev_net(dev);
 239
 240	ASSERT_RTNL();
 241
 242	write_lock_bh(&dev_base_lock);
 243	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 244	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 245	hlist_add_head_rcu(&dev->index_hlist,
 246			   dev_index_hash(net, dev->ifindex));
 247	write_unlock_bh(&dev_base_lock);
 248
 249	dev_base_seq_inc(net);
 250
 251	return 0;
 252}
 253
 254/* Device list removal
 255 * caller must respect a RCU grace period before freeing/reusing dev
 256 */
 257static void unlist_netdevice(struct net_device *dev)
 258{
 259	ASSERT_RTNL();
 260
 261	/* Unlink dev from the device chain */
 262	write_lock_bh(&dev_base_lock);
 
 263	list_del_rcu(&dev->dev_list);
 264	hlist_del_rcu(&dev->name_hlist);
 265	hlist_del_rcu(&dev->index_hlist);
 266	write_unlock_bh(&dev_base_lock);
 
 267
 268	dev_base_seq_inc(dev_net(dev));
 269}
 270
/*
 *	Our notifier list: the head of a raw notifier chain.
 *	NOTE(review): its users lie outside this part of the file.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);
 276
/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 *
 *	One aligned struct softnet_data exists per CPU; the per-CPU symbol
 *	is exported.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 284
 285#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel arrays: entry i of
 * one describes entry i of the other, and netdev_lock_pos() returns an index
 * that is used for both (and for the two key arrays below). Keep them the
 * same length and in the same order. The last entry doubles as the default
 * for device types that are not listed.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lock class names, index-matched with netdev_lock_type[]. */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lock class key per device type, for the xmit lock and the address-list lock. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 326
 327static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 328{
 329	int i;
 330
 331	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 332		if (netdev_lock_type[i] == dev_type)
 333			return i;
 334	/* the last key is used by default */
 335	return ARRAY_SIZE(netdev_lock_type) - 1;
 336}
 337
 338static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 339						 unsigned short dev_type)
 340{
 341	int i;
 342
 343	i = netdev_lock_pos(dev_type);
 344	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 345				   netdev_lock_name[i]);
 346}
 347
 348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349{
 350	int i;
 351
 352	i = netdev_lock_pos(dev->type);
 353	lockdep_set_class_and_name(&dev->addr_list_lock,
 354				   &netdev_addr_lock_key[i],
 355				   netdev_lock_name[i]);
 356}
 357#else
/* Without CONFIG_LOCKDEP there is no lock class to set: empty stub. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}

/* Without CONFIG_LOCKDEP there is no lock class to set: empty stub. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 365#endif
 366
 367/*******************************************************************************
 
 
 
 
 368
 369		Protocol management and registration routines
 370
 371*******************************************************************************/
 372
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the check of protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets is
 *	first on the list, it cannot sense that the packet is cloned
 *	and should be copied-on-write, so it will change the packet
 *	and subsequent readers will get a broken packet.
 *							--ANK (980803)
 */
 388
 389static inline struct list_head *ptype_head(const struct packet_type *pt)
 390{
 391	if (pt->type == htons(ETH_P_ALL))
 392		return &ptype_all;
 393	else
 394		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 
 395}
 396
 397/**
 398 *	dev_add_pack - add packet handler
 399 *	@pt: packet type declaration
 400 *
 401 *	Add a protocol handler to the networking stack. The passed &packet_type
 402 *	is linked into kernel lists and may not be freed until it has been
 403 *	removed from the kernel lists.
 404 *
 405 *	This call does not sleep therefore it can not
 406 *	guarantee all CPU's that are in middle of receiving packets
 407 *	will see the new packet type (until the next received packet).
 408 */
 409
 410void dev_add_pack(struct packet_type *pt)
 411{
 412	struct list_head *head = ptype_head(pt);
 413
 414	spin_lock(&ptype_lock);
 415	list_add_rcu(&pt->list, head);
 416	spin_unlock(&ptype_lock);
 417}
 418EXPORT_SYMBOL(dev_add_pack);
 419
 420/**
 421 *	__dev_remove_pack	 - remove packet handler
 422 *	@pt: packet type declaration
 423 *
 424 *	Remove a protocol handler that was previously added to the kernel
 425 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 426 *	from the kernel lists and can be freed or reused once this function
 427 *	returns.
 428 *
 429 *      The packet type might still be in use by receivers
 430 *	and must not be freed until after all the CPU's have gone
 431 *	through a quiescent state.
 432 */
 433void __dev_remove_pack(struct packet_type *pt)
 434{
 435	struct list_head *head = ptype_head(pt);
 436	struct packet_type *pt1;
 437
 438	spin_lock(&ptype_lock);
 439
 440	list_for_each_entry(pt1, head, list) {
 441		if (pt == pt1) {
 442			list_del_rcu(&pt->list);
 443			goto out;
 444		}
 445	}
 446
 447	pr_warn("dev_remove_pack: %p not found\n", pt);
 448out:
 449	spin_unlock(&ptype_lock);
 450}
 451EXPORT_SYMBOL(__dev_remove_pack);
 452
 453/**
 454 *	dev_remove_pack	 - remove packet handler
 455 *	@pt: packet type declaration
 456 *
 457 *	Remove a protocol handler that was previously added to the kernel
 458 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 459 *	from the kernel lists and can be freed or reused once this function
 460 *	returns.
 461 *
 462 *	This call sleeps to guarantee that no CPU is looking at the packet
 463 *	type after return.
 464 */
 465void dev_remove_pack(struct packet_type *pt)
 466{
 467	__dev_remove_pack(pt);
 468
 469	synchronize_net();
 470}
 471EXPORT_SYMBOL(dev_remove_pack);
 472
 473/******************************************************************************
 474
 475		      Device Boot-time Settings Routines
 476
 477*******************************************************************************/
 478
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as unused by
 * netdev_boot_setup_add() and netdev_boot_setup_check().
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 
 481
 482/**
 483 *	netdev_boot_setup_add	- add new setup entry
 484 *	@name: name of the device
 485 *	@map: configured settings for the device
 486 *
 487 *	Adds new setup entry to the dev_boot_setup list.  The function
 488 *	returns 0 on error and 1 on success.  This is a generic routine to
 489 *	all netdevices.
 490 */
 491static int netdev_boot_setup_add(char *name, struct ifmap *map)
 492{
 493	struct netdev_boot_setup *s;
 494	int i;
 495
 496	s = dev_boot_setup;
 497	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 498		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 499			memset(s[i].name, 0, sizeof(s[i].name));
 500			strlcpy(s[i].name, name, IFNAMSIZ);
 501			memcpy(&s[i].map, map, sizeof(s[i].map));
 502			break;
 503		}
 504	}
 505
 506	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 507}
 
 508
 509/**
 510 *	netdev_boot_setup_check	- check boot time settings
 511 *	@dev: the netdevice
 
 512 *
 513 * 	Check boot time settings for the device.
 514 *	The found settings are set for the device to be used
 515 *	later in the device probing.
 516 *	Returns 0 if no settings found, 1 if they are.
 517 */
 518int netdev_boot_setup_check(struct net_device *dev)
 519{
 520	struct netdev_boot_setup *s = dev_boot_setup;
 521	int i;
 522
 523	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 524		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 525		    !strcmp(dev->name, s[i].name)) {
 526			dev->irq 	= s[i].map.irq;
 527			dev->base_addr 	= s[i].map.base_addr;
 528			dev->mem_start 	= s[i].map.mem_start;
 529			dev->mem_end 	= s[i].map.mem_end;
 530			return 1;
 531		}
 532	}
 533	return 0;
 534}
 535EXPORT_SYMBOL(netdev_boot_setup_check);
 536
 
 
 
 537
 538/**
 539 *	netdev_boot_base	- get address from boot time settings
 540 *	@prefix: prefix for network device
 541 *	@unit: id for network device
 542 *
 543 * 	Check boot time settings for the base address of device.
 544 *	The found settings are set for the device to be used
 545 *	later in the device probing.
 546 *	Returns 0 if no settings found.
 547 */
 548unsigned long netdev_boot_base(const char *prefix, int unit)
 549{
 550	const struct netdev_boot_setup *s = dev_boot_setup;
 551	char name[IFNAMSIZ];
 552	int i;
 553
 554	sprintf(name, "%s%d", prefix, unit);
 555
 556	/*
 557	 * If device already registered then return base of 1
 558	 * to indicate not to probe for this interface
 559	 */
 560	if (__dev_get_by_name(&init_net, name))
 561		return 1;
 562
 563	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 564		if (!strcmp(name, s[i].name))
 565			return s[i].map.base_addr;
 566	return 0;
 567}
 568
 569/*
 570 * Saves at boot time configured settings for any netdevice.
 571 */
 572int __init netdev_boot_setup(char *str)
 573{
 574	int ints[5];
 575	struct ifmap map;
 
 
 
 
 576
 577	str = get_options(str, ARRAY_SIZE(ints), ints);
 578	if (!str || !*str)
 579		return 0;
 580
 581	/* Save settings */
 582	memset(&map, 0, sizeof(map));
 583	if (ints[0] > 0)
 584		map.irq = ints[1];
 585	if (ints[0] > 1)
 586		map.base_addr = ints[2];
 587	if (ints[0] > 2)
 588		map.mem_start = ints[3];
 589	if (ints[0] > 3)
 590		map.mem_end = ints[4];
 591
 592	/* Add new entry to the list */
 593	return netdev_boot_setup_add(str, &map);
 594}
 
 595
 596__setup("netdev=", netdev_boot_setup);
 
 
 597
 598/*******************************************************************************
 
 599
 600			    Device Interface Subroutines
 
 
 
 
 601
 602*******************************************************************************/
 
 
 603
 604/**
 605 *	__dev_get_by_name	- find a device by its name
 606 *	@net: the applicable net namespace
 607 *	@name: name to find
 608 *
 609 *	Find an interface by name. Must be called under RTNL semaphore
 610 *	or @dev_base_lock. If the name is found a pointer to the device
 611 *	is returned. If the name is not found then %NULL is returned. The
 612 *	reference counters are not incremented so the caller must be
 613 *	careful with locks.
 614 */
 615
 616struct net_device *__dev_get_by_name(struct net *net, const char *name)
 617{
 618	struct hlist_node *p;
 619	struct net_device *dev;
 620	struct hlist_head *head = dev_name_hash(net, name);
 621
 622	hlist_for_each_entry(dev, p, head, name_hlist)
 623		if (!strncmp(dev->name, name, IFNAMSIZ))
 624			return dev;
 625
 626	return NULL;
 627}
 628EXPORT_SYMBOL(__dev_get_by_name);
 629
 630/**
 631 *	dev_get_by_name_rcu	- find a device by its name
 632 *	@net: the applicable net namespace
 633 *	@name: name to find
 634 *
 635 *	Find an interface by name.
 636 *	If the name is found a pointer to the device is returned.
 637 * 	If the name is not found then %NULL is returned.
 638 *	The reference counters are not incremented so the caller must be
 639 *	careful with locks. The caller must hold RCU lock.
 640 */
 641
 642struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 643{
 644	struct hlist_node *p;
 645	struct net_device *dev;
 646	struct hlist_head *head = dev_name_hash(net, name);
 647
 648	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 649		if (!strncmp(dev->name, name, IFNAMSIZ))
 650			return dev;
 651
 652	return NULL;
 
 653}
 654EXPORT_SYMBOL(dev_get_by_name_rcu);
 655
 656/**
 657 *	dev_get_by_name		- find a device by its name
 658 *	@net: the applicable net namespace
 659 *	@name: name to find
 660 *
 661 *	Find an interface by name. This can be called from any
 662 *	context and does its own locking. The returned handle has
 663 *	the usage count incremented and the caller must use dev_put() to
 664 *	release it when it is no longer needed. %NULL is returned if no
 665 *	matching device is found.
 666 */
 667
 668struct net_device *dev_get_by_name(struct net *net, const char *name)
 669{
 670	struct net_device *dev;
 671
 672	rcu_read_lock();
 673	dev = dev_get_by_name_rcu(net, name);
 674	if (dev)
 675		dev_hold(dev);
 676	rcu_read_unlock();
 677	return dev;
 678}
 679EXPORT_SYMBOL(dev_get_by_name);
 680
 681/**
 682 *	__dev_get_by_index - find a device by its ifindex
 683 *	@net: the applicable net namespace
 684 *	@ifindex: index of device
 685 *
 686 *	Search for an interface by index. Returns %NULL if the device
 687 *	is not found or a pointer to the device. The device has not
 688 *	had its reference counter increased so the caller must be careful
 689 *	about locking. The caller must hold either the RTNL semaphore
 690 *	or @dev_base_lock.
 691 */
 692
 693struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 694{
 695	struct hlist_node *p;
 696	struct net_device *dev;
 697	struct hlist_head *head = dev_index_hash(net, ifindex);
 698
 699	hlist_for_each_entry(dev, p, head, index_hlist)
 700		if (dev->ifindex == ifindex)
 701			return dev;
 702
 703	return NULL;
 704}
 705EXPORT_SYMBOL(__dev_get_by_index);
 706
 707/**
 708 *	dev_get_by_index_rcu - find a device by its ifindex
 709 *	@net: the applicable net namespace
 710 *	@ifindex: index of device
 711 *
 712 *	Search for an interface by index. Returns %NULL if the device
 713 *	is not found or a pointer to the device. The device has not
 714 *	had its reference counter increased so the caller must be careful
 715 *	about locking. The caller must hold RCU lock.
 716 */
 717
 718struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 719{
 720	struct hlist_node *p;
 721	struct net_device *dev;
 722	struct hlist_head *head = dev_index_hash(net, ifindex);
 723
 724	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 725		if (dev->ifindex == ifindex)
 726			return dev;
 727
 728	return NULL;
 729}
 730EXPORT_SYMBOL(dev_get_by_index_rcu);
 731
 732
 733/**
 734 *	dev_get_by_index - find a device by its ifindex
 735 *	@net: the applicable net namespace
 736 *	@ifindex: index of device
 737 *
 738 *	Search for an interface by index. Returns NULL if the device
 739 *	is not found or a pointer to the device. The device returned has
 740 *	had a reference added and the pointer is safe until the user calls
 741 *	dev_put to indicate they have finished with it.
 742 */
 743
 744struct net_device *dev_get_by_index(struct net *net, int ifindex)
 745{
 746	struct net_device *dev;
 747
 748	rcu_read_lock();
 749	dev = dev_get_by_index_rcu(net, ifindex);
 750	if (dev)
 751		dev_hold(dev);
 752	rcu_read_unlock();
 753	return dev;
 754}
 755EXPORT_SYMBOL(dev_get_by_index);
 756
 757/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 758 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 759 *	@net: the applicable net namespace
 760 *	@type: media type of device
 761 *	@ha: hardware address
 762 *
 763 *	Search for an interface by MAC address. Returns NULL if the device
 764 *	is not found or a pointer to the device.
 765 *	The caller must hold RCU or RTNL.
 766 *	The returned device has not had its ref count increased
 767 *	and the caller must therefore be careful about locking
 768 *
 769 */
 770
 771struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 772				       const char *ha)
 773{
 774	struct net_device *dev;
 775
 776	for_each_netdev_rcu(net, dev)
 777		if (dev->type == type &&
 778		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 779			return dev;
 780
 781	return NULL;
 782}
 783EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 784
 785struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 786{
 787	struct net_device *dev;
 788
 789	ASSERT_RTNL();
 790	for_each_netdev(net, dev)
 791		if (dev->type == type)
 792			return dev;
 793
 794	return NULL;
 795}
 796EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 797
 798struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 799{
 800	struct net_device *dev, *ret = NULL;
 801
 802	rcu_read_lock();
 803	for_each_netdev_rcu(net, dev)
 804		if (dev->type == type) {
 805			dev_hold(dev);
 806			ret = dev;
 807			break;
 808		}
 809	rcu_read_unlock();
 810	return ret;
 811}
 812EXPORT_SYMBOL(dev_getfirstbyhwtype);
 813
 814/**
 815 *	dev_get_by_flags_rcu - find any device with given flags
 816 *	@net: the applicable net namespace
 817 *	@if_flags: IFF_* values
 818 *	@mask: bitmask of bits in if_flags to check
 819 *
 820 *	Search for any interface with the given flags. Returns NULL if a device
 821 *	is not found or a pointer to the device. Must be called inside
 822 *	rcu_read_lock(), and result refcount is unchanged.
 823 */
 824
 825struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 826				    unsigned short mask)
 827{
 828	struct net_device *dev, *ret;
 829
 
 
 830	ret = NULL;
 831	for_each_netdev_rcu(net, dev) {
 832		if (((dev->flags ^ if_flags) & mask) == 0) {
 833			ret = dev;
 834			break;
 835		}
 836	}
 837	return ret;
 838}
 839EXPORT_SYMBOL(dev_get_by_flags_rcu);
 840
 841/**
 842 *	dev_valid_name - check if name is okay for network device
 843 *	@name: name string
 844 *
 845 *	Network device names need to be valid file names to
 846 *	to allow sysfs to work.  We also disallow any kind of
 847 *	whitespace.
 848 */
 849bool dev_valid_name(const char *name)
 850{
 851	if (*name == '\0')
 852		return false;
 853	if (strlen(name) >= IFNAMSIZ)
 854		return false;
 855	if (!strcmp(name, ".") || !strcmp(name, ".."))
 856		return false;
 857
 858	while (*name) {
 859		if (*name == '/' || isspace(*name))
 860			return false;
 861		name++;
 862	}
 863	return true;
 864}
 865EXPORT_SYMBOL(dev_valid_name);
 866
 867/**
 868 *	__dev_alloc_name - allocate a name for a device
 869 *	@net: network namespace to allocate the device name in
 870 *	@name: name format string
 871 *	@buf:  scratch buffer and result name string
 872 *
 873 *	Passed a format string - eg "lt%d" it will try and find a suitable
 874 *	id. It scans list of devices to build up a free map, then chooses
 875 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 876 *	while allocating the name and adding the device in order to avoid
 877 *	duplicates.
 878 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 879 *	Returns the number of the unit assigned or a negative errno code.
 880 */
 881
 882static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 883{
 884	int i = 0;
 885	const char *p;
 886	const int max_netdevices = 8*PAGE_SIZE;
 887	unsigned long *inuse;
 888	struct net_device *d;
 889
 890	p = strnchr(name, IFNAMSIZ-1, '%');
 
 
 
 891	if (p) {
 892		/*
 893		 * Verify the string as this thing may have come from
 894		 * the user.  There must be either one "%d" and no other "%"
 895		 * characters.
 896		 */
 897		if (p[1] != 'd' || strchr(p + 2, '%'))
 898			return -EINVAL;
 899
 900		/* Use one page as a bit array of possible slots */
 901		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 902		if (!inuse)
 903			return -ENOMEM;
 904
 905		for_each_netdev(net, d) {
 
 
 
 
 
 
 
 
 
 
 
 
 906			if (!sscanf(d->name, name, &i))
 907				continue;
 908			if (i < 0 || i >= max_netdevices)
 909				continue;
 910
 911			/*  avoid cases where sscanf is not exact inverse of printf */
 912			snprintf(buf, IFNAMSIZ, name, i);
 913			if (!strncmp(buf, d->name, IFNAMSIZ))
 914				set_bit(i, inuse);
 915		}
 916
 917		i = find_first_zero_bit(inuse, max_netdevices);
 918		free_page((unsigned long) inuse);
 919	}
 920
 921	if (buf != name)
 922		snprintf(buf, IFNAMSIZ, name, i);
 923	if (!__dev_get_by_name(net, buf))
 924		return i;
 925
 926	/* It is possible to run out of possible slots
 927	 * when the name is long and there isn't enough space left
 928	 * for the digits, or if all bits are used.
 929	 */
 930	return -ENFILE;
 931}
 932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 933/**
 934 *	dev_alloc_name - allocate a name for a device
 935 *	@dev: device
 936 *	@name: name format string
 937 *
 938 *	Passed a format string - eg "lt%d" it will try and find a suitable
 939 *	id. It scans list of devices to build up a free map, then chooses
 940 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 941 *	while allocating the name and adding the device in order to avoid
 942 *	duplicates.
 943 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 944 *	Returns the number of the unit assigned or a negative errno code.
 *	On success the generated name is copied into @dev->name; on failure
 *	@dev->name is not modified.
 945 */
 946
 947int dev_alloc_name(struct net_device *dev, const char *name)
 948{
 949	char buf[IFNAMSIZ];
 950	struct net *net;
 951	int ret;
 952
	/* The search is done per namespace, so the device must have one */
 953	BUG_ON(!dev_net(dev));
 954	net = dev_net(dev);
	/* Format into a scratch buffer; dev->name is only written once a
	 * usable unit number has been found.
	 */
 955	ret = __dev_alloc_name(net, name, buf);
 956	if (ret >= 0)
 957		strlcpy(dev->name, buf, IFNAMSIZ);
 958	return ret;
 959}
 960EXPORT_SYMBOL(dev_alloc_name);
 961
 962static int dev_get_valid_name(struct net_device *dev, const char *name)
 963{
	/*
	 * Validate @name and install it as the name of @dev.
	 *
	 * Returns -EINVAL if dev_valid_name() rejects @name and -EEXIST if
	 * a device with that exact name already exists in the device's
	 * namespace. A name containing '%' is treated as a format string
	 * and handed to dev_alloc_name(), whose result is returned.
	 * Otherwise the name is copied into dev->name and 0 is returned.
	 */
 964	struct net *net;
 965
 966	BUG_ON(!dev_net(dev));
 967	net = dev_net(dev);
 968
 969	if (!dev_valid_name(name))
 970		return -EINVAL;
 971
 972	if (strchr(name, '%'))
 973		return dev_alloc_name(dev, name);
 974	else if (__dev_get_by_name(net, name))
 975		return -EEXIST;
	/* Skip the copy when the caller passed dev->name itself */
 976	else if (dev->name != name)
 977		strlcpy(dev->name, name, IFNAMSIZ);
 978
 979	return 0;
 980}
 981
 982/**
 983 *	dev_change_name - change name of a device
 984 *	@dev: device
 985 *	@newname: name (or format string) must be at least IFNAMSIZ
 986 *
 987 *	Change the name of a device. A format string such as "eth%d" may
 988 *	be passed for wildcarding.
 *
 *	Asserts that RTNL is held. Returns -EBUSY if the device has IFF_UP
 *	set and 0 if @newname already matches the current name. Otherwise
 *	returns a negative errno if the name is rejected, if device_rename()
 *	fails, or if a %NETDEV_CHANGENAME notifier refuses the change (the
 *	old name is then restored); on success the non-negative result of
 *	dev_get_valid_name() is returned.
 989 */
 990int dev_change_name(struct net_device *dev, const char *newname)
 991{
 992	char oldname[IFNAMSIZ];
 993	int err = 0;
 994	int ret;
 995	struct net *net;
 996
 997	ASSERT_RTNL();
 998	BUG_ON(!dev_net(dev));
 999
1000	net = dev_net(dev);
1001	if (dev->flags & IFF_UP)
1002		return -EBUSY;
1003
1004	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1005		return 0;
1006
	/* Keep the old name so it can be restored if a later step fails */
1007	memcpy(oldname, dev->name, IFNAMSIZ);
1008
	/* On success this has already written the new name to dev->name */
1009	err = dev_get_valid_name(dev, newname);
1010	if (err < 0)
1011		return err;
1012
	/* Entered a second time, with dev->name reset to the old name, when
	 * a notifier rejects the change below.
	 */
1013rollback:
1014	ret = device_rename(&dev->dev, dev->name);
1015	if (ret) {
1016		memcpy(dev->name, oldname, IFNAMSIZ);
1017		return ret;
1018	}
1019
	/* Unhash the device, wait via synchronize_rcu(), then hash it
	 * again under the name now stored in dev->name.
	 */
1020	write_lock_bh(&dev_base_lock);
1021	hlist_del_rcu(&dev->name_hlist);
1022	write_unlock_bh(&dev_base_lock);
1023
1024	synchronize_rcu();
1025
1026	write_lock_bh(&dev_base_lock);
1027	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1028	write_unlock_bh(&dev_base_lock);
1029
1030	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1031	ret = notifier_to_errno(ret);
1032
1033	if (ret) {
1034		/* err >= 0 after dev_alloc_name() or stores the first errno */
1035		if (err >= 0) {
			/* First failure: record it and redo the rename
			 * with the old name.
			 */
1036			err = ret;
1037			memcpy(dev->name, oldname, IFNAMSIZ);
1038			goto rollback;
1039		} else {
			/* The rollback pass was rejected too; only report it */
1040			pr_err("%s: name change rollback failed: %d\n",
1041			       dev->name, ret);
1042		}
1043	}
1044
1045	return err;
1046}
1047
1048/**
1049 *	dev_set_alias - change ifalias of a device
1050 *	@dev: device
1051 *	@alias: name up to IFALIASZ
1052 *	@len: limit of bytes to copy from info
1053 *
1054 *	Set ifalias for a device,
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058	char *new_ifalias;
1059
1060	ASSERT_RTNL();
1061
1062	if (len >= IFALIASZ)
1063		return -EINVAL;
1064
1065	if (!len) {
1066		if (dev->ifalias) {
1067			kfree(dev->ifalias);
1068			dev->ifalias = NULL;
1069		}
1070		return 0;
 
1071	}
1072
1073	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1074	if (!new_ifalias)
1075		return -ENOMEM;
1076	dev->ifalias = new_ifalias;
 
 
 
1077
1078	strlcpy(dev->ifalias, alias, len+1);
1079	return len;
1080}
 
1081
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1082
1083/**
1084 *	netdev_features_change - device changes features
1085 *	@dev: device to cause notification
1086 *
1087 *	Called to indicate a device has changed features.
 *	Sends %NETDEV_FEAT_CHANGE through call_netdevice_notifiers(); the
 *	value returned by the notifier chain is discarded.
1088 */
1089void netdev_features_change(struct net_device *dev)
1090{
1091	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1092}
1093EXPORT_SYMBOL(netdev_features_change);
1094
1095/**
1096 *	netdev_state_change - device changes state
1097 *	@dev: device to cause notification
1098 *
1099 *	Called to indicate a device has changed state. This function calls
1100 *	the notifier chains for netdev_chain and sends a NEWLINK message
1101 *	to the routing socket.
1102 */
1103void netdev_state_change(struct net_device *dev)
1104{
1105	if (dev->flags & IFF_UP) {
1106		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1107		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 
 
 
 
 
1108	}
1109}
1110EXPORT_SYMBOL(netdev_state_change);
1111
/**
 *	netdev_bonding_change - send a caller-chosen event for a device
 *	@dev: device the event refers to
 *	@event: notifier event code, passed through unchanged
 *
 *	Thin wrapper around call_netdevice_notifiers(); returns whatever the
 *	notifier chain returned.
 */
1112int netdev_bonding_change(struct net_device *dev, unsigned long event)
1113{
1114	return call_netdevice_notifiers(event, dev);
1115}
1116EXPORT_SYMBOL(netdev_bonding_change);
1117
1118/**
1119 *	dev_load 	- load a network module
1120 *	@net: the applicable net namespace
1121 *	@name: name of interface
1122 *
1123 *	If a network interface is not present and the process has suitable
1124 *	privileges this function loads the module. If module loading is not
1125 *	available in this kernel then it becomes a nop.
 *
 *	With CAP_NET_ADMIN the module alias "netdev-<name>" is requested.
 *	If no module has been loaded that way and the process has
 *	CAP_SYS_MODULE, the bare @name is requested as well; a warning is
 *	logged when that second request succeeds.
1126 */
1127
1128void dev_load(struct net *net, const char *name)
1129{
1130	struct net_device *dev;
1131	int no_module;
1132
	/* Only the existence of the device matters; the pointer is not
	 * dereferenced after rcu_read_unlock().
	 */
1133	rcu_read_lock();
1134	dev = dev_get_by_name_rcu(net, name);
1135	rcu_read_unlock();
1136
	/* no_module stays non-zero while a load is still wanted; a zero
	 * return from request_module() clears it.
	 */
1137	no_module = !dev;
1138	if (no_module && capable(CAP_NET_ADMIN))
1139		no_module = request_module("netdev-%s", name);
1140	if (no_module && capable(CAP_SYS_MODULE)) {
1141		if (!request_module("%s", name))
1142			pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1143				name);
1144	}
1145}
1146EXPORT_SYMBOL(dev_load);
1147
/*
 * __dev_open - bring a device up without announcing it
 * @dev: device to open
 *
 * Asserts that RTNL is held. Returns -ENODEV if the device is not present,
 * the errno of a %NETDEV_PRE_UP notifier that refused, or the error from
 * the driver's ndo_validate_addr()/ndo_open() callbacks. On success IFF_UP
 * is set and 0 is returned. Sending %NETDEV_UP and RTM_NEWLINK is left to
 * the caller.
 */
1148static int __dev_open(struct net_device *dev)
1149{
1150	const struct net_device_ops *ops = dev->netdev_ops;
1151	int ret;
1152
1153	ASSERT_RTNL();
1154
1155	if (!netif_device_present(dev))
1156		return -ENODEV;
1157
	/* Notifiers may refuse the transition before anything is changed */
1158	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1159	ret = notifier_to_errno(ret);
1160	if (ret)
1161		return ret;
1162
	/* Set before the driver callbacks run; cleared again on failure */
1163	set_bit(__LINK_STATE_START, &dev->state);
1164
	/* Both callbacks are optional; ret is still 0 here if neither exists */
1165	if (ops->ndo_validate_addr)
1166		ret = ops->ndo_validate_addr(dev);
1167
1168	if (!ret && ops->ndo_open)
1169		ret = ops->ndo_open(dev);
1170
1171	if (ret)
1172		clear_bit(__LINK_STATE_START, &dev->state);
1173	else {
1174		dev->flags |= IFF_UP;
1175		net_dmaengine_get();
1176		dev_set_rx_mode(dev);
1177		dev_activate(dev);
1178		add_device_randomness(dev->dev_addr, dev->addr_len);
1179	}
1180
1181	return ret;
1182}
1183
1184/**
1185 *	dev_open	- prepare an interface for use.
1186 *	@dev:	device to open
 
1187 *
1188 *	Takes a device from down to up state. The device's private open
1189 *	function is invoked and then the multicast lists are loaded. Finally
1190 *	the device is moved into the up state and a %NETDEV_UP message is
1191 *	sent to the netdev notifier chain.
1192 *
1193 *	Calling this function on an active interface is a nop. On a failure
1194 *	a negative errno code is returned.
1195 */
1196int dev_open(struct net_device *dev)
1197{
1198	int ret;
1199
1200	if (dev->flags & IFF_UP)
1201		return 0;
1202
1203	ret = __dev_open(dev);
1204	if (ret < 0)
1205		return ret;
1206
1207	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1208	call_netdevice_notifiers(NETDEV_UP, dev);
1209
1210	return ret;
1211}
1212EXPORT_SYMBOL(dev_open);
1213
/*
 * __dev_close_many - take every device on @head down, without the final
 * announcements
 * @head: list of devices linked through their unreg_list member
 *
 * Asserts that RTNL is held and may sleep. In a first pass every device
 * gets %NETDEV_GOING_DOWN and has __LINK_STATE_START cleared; then all of
 * them are deactivated in one dev_deactivate_many() call; in a second pass
 * the driver's ndo_stop() is invoked (if present) and IFF_UP is cleared.
 * Always returns 0.
 */
1214static int __dev_close_many(struct list_head *head)
1215{
1216	struct net_device *dev;
1217
1218	ASSERT_RTNL();
1219	might_sleep();
1220
1221	list_for_each_entry(dev, head, unreg_list) {
1222		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1223
1224		clear_bit(__LINK_STATE_START, &dev->state);
1225
1226		/* Synchronize to scheduled poll. We cannot touch poll list, it
1227		 * can be even on different cpu. So just clear netif_running().
1228		 *
1229		 * dev->stop() will invoke napi_disable() on all of it's
1230		 * napi_struct instances on this device.
1231		 */
1232		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1233	}
1234
1235	dev_deactivate_many(head);
1236
1237	list_for_each_entry(dev, head, unreg_list) {
1238		const struct net_device_ops *ops = dev->netdev_ops;
1239
1240		/*
1241		 *	Call the device specific close. This cannot fail.
1242		 *	Only if device is UP
1243		 *
1244		 *	We allow it to be called even after a DETACH hot-plug
1245		 *	event.
1246		 */
1247		if (ops->ndo_stop)
1248			ops->ndo_stop(dev);
1249
1250		dev->flags &= ~IFF_UP;
		/* Pairs with the net_dmaengine_get() done in __dev_open() */
1251		net_dmaengine_put();
1252	}
1253
1254	return 0;
1255}
1256
/*
 * __dev_close - single-device wrapper around __dev_close_many()
 * @dev: device to take down
 *
 * Puts @dev on a temporary on-stack list through its unreg_list member,
 * runs __dev_close_many() on that list and returns its result.
 */
1257static int __dev_close(struct net_device *dev)
1258{
1259	int retval;
1260	LIST_HEAD(single);
1261
1262	list_add(&dev->unreg_list, &single);
1263	retval = __dev_close_many(&single);
	/* Unlink the on-stack list head before it goes out of scope */
1264	list_del(&single);
1265	return retval;
1266}
1267
/*
 * dev_close_many - close every up device on @head and announce it
 * @head: list of devices linked through their unreg_list member
 *
 * Devices without IFF_UP are parked on a temporary list so that only up
 * devices are closed and announced (RTM_NEWLINK plus %NETDEV_DOWN). The
 * parked devices are spliced back onto @head before returning, so the
 * caller gets all of its devices back, though possibly in a different
 * order. Always returns 0.
 */
1268static int dev_close_many(struct list_head *head)
1269{
1270	struct net_device *dev, *tmp;
1271	LIST_HEAD(tmp_list);
1272
	/* The _safe iterator is needed because entries are moved off @head */
1273	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1274		if (!(dev->flags & IFF_UP))
1275			list_move(&dev->unreg_list, &tmp_list);
1276
1277	__dev_close_many(head);
1278
1279	list_for_each_entry(dev, head, unreg_list) {
1280		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1281		call_netdevice_notifiers(NETDEV_DOWN, dev);
1282	}
1283
1284	/* rollback_registered_many needs the complete original list */
1285	list_splice(&tmp_list, head);
1286	return 0;
1287}
 
1288
1289/**
1290 *	dev_close - shutdown an interface.
1291 *	@dev: device to shutdown
1292 *
1293 *	This function moves an active device into down state. A
1294 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1295 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1296 *	chain.
1297 */
1298int dev_close(struct net_device *dev)
1299{
1300	if (dev->flags & IFF_UP) {
1301		LIST_HEAD(single);
1302
1303		list_add(&dev->unreg_list, &single);
1304		dev_close_many(&single);
1305		list_del(&single);
1306	}
1307	return 0;
1308}
1309EXPORT_SYMBOL(dev_close);
1310
1311
1312/**
1313 *	dev_disable_lro - disable Large Receive Offload on a device
1314 *	@dev: device
1315 *
1316 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1317 *	called under RTNL.  This is needed if received packets may be
1318 *	forwarded to another interface.
 *
 *	A warning is emitted if the device still reports NETIF_F_LRO in its
 *	features after the update.
1319 */
1320void dev_disable_lro(struct net_device *dev)
1321{
1322	/*
1323	 * If we're trying to disable lro on a vlan device
1324	 * use the underlying physical device instead
1325	 */
1326	if (is_vlan_dev(dev))
1327		dev = vlan_dev_real_dev(dev);
1328
	/* Drop LRO from the wanted set and have the features recomputed */
1329	dev->wanted_features &= ~NETIF_F_LRO;
1330	netdev_update_features(dev);
1331
1332	if (unlikely(dev->features & NETIF_F_LRO))
1333		netdev_WARN(dev, "failed to disable LRO!\n");
1334}
1335EXPORT_SYMBOL(dev_disable_lro);
1336
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1337
/* Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * registers the block but skips replaying events for existing devices.
 * NOTE(review): where this is cleared is not visible in this part of the
 * file - confirm before relying on it.
 */
1338static int dev_boot_phase = 1;
1339
1340/**
1341 *	register_netdevice_notifier - register a network notifier block
1342 *	@nb: notifier
1343 *
1344 *	Register a notifier to be called when network device events occur.
1345 *	The notifier passed is linked into the kernel structures and must
1346 *	not be reused until it has been unregistered. A negative errno code
1347 *	is returned on a failure.
1348 *
1349 * 	When registered all registration and up events are replayed
1350 *	to the new notifier to allow device to have a race free
1351 *	view of the network device list.
 *
 *	If the notifier rejects a replayed %NETDEV_REGISTER, the devices
 *	already replayed are unwound with synthetic down/unregister events
 *	and the notifier is removed from the chain again.
1352 */
1353
1354int register_netdevice_notifier(struct notifier_block *nb)
1355{
1356	struct net_device *dev;
1357	struct net_device *last;
1358	struct net *net;
1359	int err;
1360
1361	rtnl_lock();
1362	err = raw_notifier_chain_register(&netdev_chain, nb);
1363	if (err)
1364		goto unlock;
	/* No replay while dev_boot_phase is set */
1365	if (dev_boot_phase)
1366		goto unlock;
1367	for_each_net(net) {
1368		for_each_netdev(net, dev) {
1369			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1370			err = notifier_to_errno(err);
1371			if (err)
1372				goto rollback;
1373
1374			if (!(dev->flags & IFF_UP))
1375				continue;
1376
			/* The result of the replayed NETDEV_UP is ignored */
1377			nb->notifier_call(nb, NETDEV_UP, dev);
1378		}
1379	}
1380
1381unlock:
1382	rtnl_unlock();
1383	return err;
1384
1385rollback:
	/* Walk the devices again in the same order and undo the replay for
	 * each one visited before 'last', the device whose NETDEV_REGISTER
	 * was rejected.
	 */
1386	last = dev;
1387	for_each_net(net) {
1388		for_each_netdev(net, dev) {
1389			if (dev == last)
1390				goto outroll;
1391
1392			if (dev->flags & IFF_UP) {
1393				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1394				nb->notifier_call(nb, NETDEV_DOWN, dev);
1395			}
1396			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1397			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1398		}
1399	}
1400
1401outroll:
	/* err still holds the notifier's errno when we leave via unlock */
1402	raw_notifier_chain_unregister(&netdev_chain, nb);
1403	goto unlock;
1404}
1405EXPORT_SYMBOL(register_netdevice_notifier);
1406
1407/**
1408 *	unregister_netdevice_notifier - unregister a network notifier block
1409 *	@nb: notifier
1410 *
1411 *	Unregister a notifier previously registered by
1412 *	register_netdevice_notifier(). The notifier is unlinked from the
1413 *	kernel structures and may then be reused. A negative errno code
1414 *	is returned on a failure.
1415 *
1416 * 	After unregistering unregister and down device events are synthesized
1417 *	for all devices on the device list to the removed notifier to remove
1418 *	the need for special case cleanup code.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
1423	struct net_device *dev;
1424	struct net *net;
1425	int err;
1426
1427	rtnl_lock();
1428	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1429	if (err)
1430		goto unlock;
1431
	/* Called directly on @nb, which is no longer on the chain; the
	 * return values of these synthetic events are ignored.
	 */
1432	for_each_net(net) {
1433		for_each_netdev(net, dev) {
1434			if (dev->flags & IFF_UP) {
1435				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1436				nb->notifier_call(nb, NETDEV_DOWN, dev);
1437			}
1438			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1439			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1440		}
1441	}
1442unlock:
1443	rtnl_unlock();
1444	return err;
1445}
1446EXPORT_SYMBOL(unregister_netdevice_notifier);
1447
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1448/**
1449 *	call_netdevice_notifiers - call all network notifier blocks
1450 *      @val: value passed unmodified to notifier function
1451 *      @dev: net_device pointer passed unmodified to notifier function
1452 *
1453 *	Call all network notifier blocks.  Parameters and return value
1454 *	are as for raw_notifier_call_chain().
 *	Asserts that RTNL is held.
1455 */
1456
1457int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1458{
1459	ASSERT_RTNL();
1460	return raw_notifier_call_chain(&netdev_chain, val, dev);
1461}
1462EXPORT_SYMBOL(call_netdevice_notifiers);
1463
/* Static key that gates packet timestamping in this file: it is
 * incremented by net_enable_timestamp(), decremented by
 * net_disable_timestamp() and tested by net_timestamp_set() and
 * net_timestamp_check().
 */
1464static struct static_key netstamp_needed __read_mostly;
1465#ifdef HAVE_JUMP_LABEL
1466/* We are not allowed to call static_key_slow_dec() from irq context
1467 * If net_disable_timestamp() is called from irq context, defer the
1468 * static_key_slow_dec() calls.
 * This counter holds the number of deferred decrements; it is consumed
 * by net_enable_timestamp().
1469 */
1470static atomic_t netstamp_needed_deferred;
1471#endif
1472
/*
 * net_enable_timestamp - take one reference on the netstamp_needed key
 *
 * With HAVE_JUMP_LABEL, decrements deferred by net_disable_timestamp()
 * are settled here first: of N deferred decrements N - 1 are applied, and
 * the function returns without incrementing, so the last one is not
 * applied and this call's increment is skipped in its place.
 */
1473void net_enable_timestamp(void)
1474{
1475#ifdef HAVE_JUMP_LABEL
	/* Atomically fetch and reset the deferred count */
1476	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1477
1478	if (deferred) {
		/* Pre-decrement: the loop body runs deferred - 1 times */
1479		while (--deferred)
1480			static_key_slow_dec(&netstamp_needed);
1481		return;
1482	}
1483#endif
1484	WARN_ON(in_interrupt());
1485	static_key_slow_inc(&netstamp_needed);
1486}
1487EXPORT_SYMBOL(net_enable_timestamp);
1488
/*
 * net_disable_timestamp - drop one reference on the netstamp_needed key
 *
 * With HAVE_JUMP_LABEL, a call made while in_interrupt() is true only
 * bumps netstamp_needed_deferred; the real decrement is settled later by
 * net_enable_timestamp().
 */
1489void net_disable_timestamp(void)
1490{
1491#ifdef HAVE_JUMP_LABEL
1492	if (in_interrupt()) {
1493		atomic_inc(&netstamp_needed_deferred);
1494		return;
1495	}
1496#endif
1497	static_key_slow_dec(&netstamp_needed);
1498}
1499EXPORT_SYMBOL(net_disable_timestamp);
1500
/*
 * net_timestamp_set - reset the timestamp of @skb
 *
 * Always clears skb->tstamp; a fresh timestamp is then taken with
 * __net_timestamp() only if the netstamp_needed key is enabled.
 */
1501static inline void net_timestamp_set(struct sk_buff *skb)
1502{
1503	skb->tstamp.tv64 = 0;
1504	if (static_key_false(&netstamp_needed))
1505		__net_timestamp(skb);
1506}
1507
/*
 * net_timestamp_check - timestamp SKB if timestamping is enabled, COND
 * holds and SKB has no timestamp yet (tstamp.tv64 == 0).
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and cannot capture an 'else' that follows it. Use it as a
 * statement terminated by a semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1513
1514static int net_hwtstamp_validate(struct ifreq *ifr)
1515{
1516	struct hwtstamp_config cfg;
1517	enum hwtstamp_tx_types tx_type;
1518	enum hwtstamp_rx_filters rx_filter;
1519	int tx_type_valid = 0;
1520	int rx_filter_valid = 0;
1521
1522	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1523		return -EFAULT;
1524
1525	if (cfg.flags) /* reserved for future extensions */
1526		return -EINVAL;
1527
1528	tx_type = cfg.tx_type;
1529	rx_filter = cfg.rx_filter;
1530
1531	switch (tx_type) {
1532	case HWTSTAMP_TX_OFF:
1533	case HWTSTAMP_TX_ON:
1534	case HWTSTAMP_TX_ONESTEP_SYNC:
1535		tx_type_valid = 1;
1536		break;
1537	}
1538
1539	switch (rx_filter) {
1540	case HWTSTAMP_FILTER_NONE:
1541	case HWTSTAMP_FILTER_ALL:
1542	case HWTSTAMP_FILTER_SOME:
1543	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1544	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1545	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1546	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1547	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1548	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1549	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1550	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1551	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1552	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1553	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1554	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1555		rx_filter_valid = 1;
1556		break;
1557	}
1558
1559	if (!tx_type_valid || !rx_filter_valid)
1560		return -ERANGE;
1561
1562	return 0;
1563}
1564
1565static inline bool is_skb_forwardable(struct net_device *dev,
1566				      struct sk_buff *skb)
1567{
1568	unsigned int len;
1569
1570	if (!(dev->flags & IFF_UP))
1571		return false;
1572
1573	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1574	if (skb->len <= len)
1575		return true;
1576
1577	/* if TSO is enabled, we don't care about the length as the packet
1578	 * could be forwarded without being segmented before
1579	 */
1580	if (skb_is_gso(skb))
1581		return true;
1582
1583	return false;
1584}
 
1585
1586/**
1587 * dev_forward_skb - loopback an skb to another netif
1588 *
1589 * @dev: destination network device
1590 * @skb: buffer to forward
1591 *
1592 * return values:
1593 *	NET_RX_SUCCESS	(no congestion)
1594 *	NET_RX_DROP     (packet was dropped, but freed)
1595 *
1596 * dev_forward_skb can be used for injecting an skb from the
1597 * start_xmit function of one device into the receive queue
1598 * of another device.
1599 *
1600 * The receiving device may be in another namespace, so
1601 * we have to clear all information in the skb that could
1602 * impact namespace isolation.
1603 */
1604int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1605{
	/* For zerocopy skbs skb_copy_ubufs() is called first; if that
	 * fails the packet is counted as dropped on @dev and freed.
	 */
1606	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1607		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1608			atomic_long_inc(&dev->rx_dropped);
1609			kfree_skb(skb);
1610			return NET_RX_DROP;
1611		}
1612	}
1613
1614	skb_orphan(skb);
1615	nf_reset(skb);
1616
	/* Destination down or packet too large: count and drop */
1617	if (unlikely(!is_skb_forwardable(dev, skb))) {
1618		atomic_long_inc(&dev->rx_dropped);
1619		kfree_skb(skb);
1620		return NET_RX_DROP;
1621	}
	/* Reset per-packet state carried over from the sending side */
1622	skb->skb_iif = 0;
1623	skb->dev = dev;
1624	skb_dst_drop(skb);
1625	skb->tstamp.tv64 = 0;
1626	skb->pkt_type = PACKET_HOST;
1627	skb->protocol = eth_type_trans(skb, dev);
1628	skb->mark = 0;
1629	secpath_reset(skb);
1630	nf_reset(skb);
1631	return netif_rx(skb);
1632}
1633EXPORT_SYMBOL_GPL(dev_forward_skb);
1634
 
 
 
 
 
/*
 * deliver_skb - hand @skb to the handler of @pt_prev
 *
 * Takes an extra reference on the skb (skb->users) before calling
 * pt_prev->func(), and returns the handler's result.
 */
1635static inline int deliver_skb(struct sk_buff *skb,
1636			      struct packet_type *pt_prev,
1637			      struct net_device *orig_dev)
1638{
1639	atomic_inc(&skb->users);
1640	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1641}
1642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1643static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1644{
1645	if (ptype->af_packet_priv == NULL)
1646		return false;
1647
1648	if (ptype->id_match)
1649		return ptype->id_match(ptype, skb->sk);
1650	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1651		return true;
1652
1653	return false;
1654}
1655
 
 
 
 
 
 
 
 
 
 
 
1656/*
1657 *	Support routine. Sends outgoing frames to any network
1658 *	taps currently in use.
 *
 *	The frame is cloned once, when the first matching tap is found.
 *	Delivery to a tap is deferred until the next matching tap is seen
 *	(via deliver_skb(), which takes an extra reference); the last tap
 *	is called directly after the loop without taking a reference.
1659 */
1660
1661static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1662{
1663	struct packet_type *ptype;
1664	struct sk_buff *skb2 = NULL;
1665	struct packet_type *pt_prev = NULL;
1666
1667	rcu_read_lock();
1668	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1669		/* Never send packets back to the socket
1670		 * they originated from - MvS (miquels@drinkel.ow.org)
1671		 */
1672		if ((ptype->dev == dev || !ptype->dev) &&
1673		    (!skb_loop_sk(ptype, skb))) {
			/* Clone already prepared: deliver to the previous
			 * tap and remember this one.
			 */
1674			if (pt_prev) {
1675				deliver_skb(skb2, pt_prev, skb->dev);
1676				pt_prev = ptype;
1677				continue;
1678			}
1679
			/* First matching tap: make the clone all taps share */
1680			skb2 = skb_clone(skb, GFP_ATOMIC);
1681			if (!skb2)
1682				break;
1683
1684			net_timestamp_set(skb2);
1685
1686			/* skb->nh should be correctly
1687			   set by sender, so that the second statement is
1688			   just protection against buggy protocols.
1689			 */
1690			skb_reset_mac_header(skb2);
1691
1692			if (skb_network_header(skb2) < skb2->data ||
1693			    skb2->network_header > skb2->tail) {
1694				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1695						     ntohs(skb2->protocol),
1696						     dev->name);
1697				skb_reset_network_header(skb2);
1698			}
1699
1700			skb2->transport_header = skb2->network_header;
1701			skb2->pkt_type = PACKET_OUTGOING;
1702			pt_prev = ptype;
1703		}
1704	}
	/* pt_prev is only set after a successful clone, so skb2 is valid */
1705	if (pt_prev)
1706		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1707	rcu_read_unlock();
1708}
 
1709
1710/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1711 * @dev: Network device
1712 * @txq: number of queues available
1713 *
1714 * If real_num_tx_queues is changed the tc mappings may no longer be
1715 * valid. To resolve this verify the tc mapping remains valid and if
1716 * not NULL the mapping. With no priorities mapping to this
1717 * offset/count pair it will no longer be used. In the worst case TC0
1718 * is invalid nothing can be done so disable priority mappings. It is
1719 * expected that drivers will fix this mapping if they can before
1720 * calling netif_set_real_num_tx_queues.
1721 */
1722static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1723{
1724	int i;
1725	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1726
1727	/* If TC0 is invalidated disable TC mapping */
1728	if (tc->offset + tc->count > txq) {
1729		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1730		dev->num_tc = 0;
1731		return;
1732	}
1733
1734	/* Invalidated prio to tc mappings set to TC0 */
	/* The scan covers priorities 1..TC_BITMASK; priority 0 is skipped */
1735	for (i = 1; i < TC_BITMASK + 1; i++) {
1736		int q = netdev_get_prio_tc_map(dev, i);
1737
1738		tc = &dev->tc_to_txq[q];
1739		if (tc->offset + tc->count > txq) {
1740			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1741				i, q);
1742			netdev_set_prio_tc_map(dev, i, 0);
1743		}
1744	}
1745}
1746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1747/*
1748 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1749 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 *
 * Returns -EINVAL if @txq is 0 or larger than dev->num_tx_queues, the
 * error from netdev_queue_update_kobjects() if that fails, and 0 on
 * success. For a device that is neither registered nor unregistering only
 * the range check is done before the field is stored.
1750 */
1751int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1752{
1753	int rc;
1754
1755	if (txq < 1 || txq > dev->num_tx_queues)
1756		return -EINVAL;
1757
1758	if (dev->reg_state == NETREG_REGISTERED ||
1759	    dev->reg_state == NETREG_UNREGISTERING) {
1760		ASSERT_RTNL();
1761
1762		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1763						  txq);
1764		if (rc)
1765			return rc;
1766
		/* Re-validate the traffic-class mappings against the new count */
1767		if (dev->num_tc)
1768			netif_setup_tc(dev, txq);
1769
		/* Only needed when the number of queues shrinks */
1770		if (txq < dev->real_num_tx_queues)
1771			qdisc_reset_all_tx_gt(dev, txq);
1772	}
1773
1774	dev->real_num_tx_queues = txq;
1775	return 0;
1776}
1777EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1778
1779#ifdef CONFIG_RPS
1780/**
1781 *	netif_set_real_num_rx_queues - set actual number of RX queues used
1782 *	@dev: Network device
1783 *	@rxq: Actual number of RX queues
1784 *
1785 *	This must be called either with the rtnl_lock held or before
1786 *	registration of the net device.  Returns 0 on success, or a
1787 *	negative error code.  If called before registration, it always
1788 *	succeeds.
 *	Note that the range check on @rxq (1..dev->num_rx_queues, otherwise
 *	-EINVAL) is applied regardless of the registration state.
1789 */
1790int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1791{
1792	int rc;
1793
1794	if (rxq < 1 || rxq > dev->num_rx_queues)
1795		return -EINVAL;
1796
	/* The queue kobjects are only updated for a registered device */
1797	if (dev->reg_state == NETREG_REGISTERED) {
1798		ASSERT_RTNL();
1799
1800		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1801						  rxq);
1802		if (rc)
1803			return rc;
1804	}
1805
1806	dev->real_num_rx_queues = rxq;
1807	return 0;
1808}
1809EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1810#endif
1811
/*
 * __netif_reschedule - queue @q on this CPU's output queue
 *
 * With local interrupts disabled, appends @q to the tail of the current
 * CPU's softnet_data output_queue (linked through q->next_sched) and
 * raises NET_TX_SOFTIRQ.
 */
1812static inline void __netif_reschedule(struct Qdisc *q)
1813{
1814	struct softnet_data *sd;
1815	unsigned long flags;
1816
1817	local_irq_save(flags);
1818	sd = &__get_cpu_var(softnet_data);
	/* q becomes the new tail: terminate it, link it in through the
	 * current tail pointer, then advance the tail pointer to q's link.
	 */
1819	q->next_sched = NULL;
1820	*sd->output_queue_tailp = q;
1821	sd->output_queue_tailp = &q->next_sched;
1822	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1823	local_irq_restore(flags);
1824}
1825
1826void __netif_schedule(struct Qdisc *q)
1827{
1828	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1829		__netif_reschedule(q);
1830}
1831EXPORT_SYMBOL(__netif_schedule);
1832
/*
 * dev_kfree_skb_irq - drop a reference on @skb without freeing it here
 *
 * Decrements skb->users. If that was the last reference, the skb is
 * pushed onto the head of the current CPU's completion_queue (with local
 * interrupts disabled) and NET_TX_SOFTIRQ is raised.
 */
1833void dev_kfree_skb_irq(struct sk_buff *skb)
1834{
1835	if (atomic_dec_and_test(&skb->users)) {
1836		struct softnet_data *sd;
1837		unsigned long flags;
1838
1839		local_irq_save(flags);
1840		sd = &__get_cpu_var(softnet_data);
1841		skb->next = sd->completion_queue;
1842		sd->completion_queue = skb;
1843		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1844		local_irq_restore(flags);
1845	}
1846}
1847EXPORT_SYMBOL(dev_kfree_skb_irq);
1848
/*
 * dev_kfree_skb_any - release @skb from whatever context we are in
 *
 * Uses dev_kfree_skb_irq() when running in hard interrupt context or with
 * interrupts disabled, and dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1857
1858
1859/**
1860 * netif_device_detach - mark device as removed
1861 * @dev: network device
1862 *
1863 * Mark device as removed from system and therefore no longer available.
1864 */
1865void netif_device_detach(struct net_device *dev)
1866{
1867	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1868	    netif_running(dev)) {
1869		netif_tx_stop_all_queues(dev);
1870	}
1871}
1872EXPORT_SYMBOL(netif_device_detach);
1873
1874/**
1875 * netif_device_attach - mark device as attached
1876 * @dev: network device
1877 *
1878 * Mark device as attached from system and restart if needed.
1879 */
1880void netif_device_attach(struct net_device *dev)
1881{
1882	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1883	    netif_running(dev)) {
1884		netif_tx_wake_all_queues(dev);
1885		__netdev_watchdog_up(dev);
1886	}
1887}
1888EXPORT_SYMBOL(netif_device_attach);
1889
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * skb_warn_bad_offload - emit a WARN describing an skb with inconsistent
 * offload state
 *
 * Prints the parent device's driver name (empty if there is no device or
 * no parent), the device features and socket route capabilities (an
 * all-zero feature set stands in when either is missing), and the skb's
 * length, GSO and checksum fields.
 */
1890static void skb_warn_bad_offload(const struct sk_buff *skb)
1891{
1892	static const netdev_features_t null_features = 0;
1893	struct net_device *dev = skb->dev;
1894	const char *driver = "";
1895
1896	if (dev && dev->dev.parent)
1897		driver = dev_driver_string(dev->dev.parent);
1898
1899	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1900	     "gso_type=%d ip_summed=%d\n",
1901	     driver, dev ? &dev->features : &null_features,
1902	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
1903	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1904	     skb_shinfo(skb)->gso_type, skb->ip_summed);
1905}
1906
1907/*
1908 * Invalidate hardware checksum when packet is to be mangled, and
1909 * complete checksum manually on outgoing path.
 *
 * Returns 0 on success, -EINVAL (after a warning) if the skb has a
 * non-zero gso_size, or the error from pskb_expand_head(). On success
 * skb->ip_summed is left as CHECKSUM_NONE.
1910 */
1911int skb_checksum_help(struct sk_buff *skb)
1912{
1913	__wsum csum;
1914	int ret = 0, offset;
1915
	/* CHECKSUM_COMPLETE: nothing to compute, just reset ip_summed */
1916	if (skb->ip_summed == CHECKSUM_COMPLETE)
1917		goto out_set_summed;
1918
1919	if (unlikely(skb_shinfo(skb)->gso_size)) {
1920		skb_warn_bad_offload(skb);
1921		return -EINVAL;
1922	}
1923
	/* Checksum everything from the checksum start to the end of the skb */
1924	offset = skb_checksum_start_offset(skb);
1925	BUG_ON(offset >= skb_headlen(skb));
1926	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1927
	/* offset now locates the checksum field, which must be in the head */
1928	offset += skb->csum_offset;
1929	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1930
	/* Reallocate the head of a cloned skb if the field is not writable */
1931	if (skb_cloned(skb) &&
1932	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1933		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1934		if (ret)
1935			goto out;
1936	}
1937
1938	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1939out_set_summed:
1940	skb->ip_summed = CHECKSUM_NONE;
1941out:
1942	return ret;
1943}
1944EXPORT_SYMBOL(skb_checksum_help);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1945
1946/**
1947 *	skb_gso_segment - Perform segmentation on skb.
1948 *	@skb: buffer to segment
1949 *	@features: features for the output path (see dev->features)
1950 *
1951 *	This function segments the given skb and returns a list of segments.
1952 *
1953 *	It may return NULL if the skb requires no segmentation.  This is
1954 *	only possible when GSO is used for verifying header integrity.
 *
 *	Errors are returned as ERR_PTR() values: -EPROTONOSUPPORT when no
 *	registered packet type offers gso_segment for the protocol, -EINVAL
 *	when a VLAN header cannot be pulled, or the error from
 *	pskb_expand_head() / the handler's gso_send_check().
1955 */
1956struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1957	netdev_features_t features)
1958{
1959	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1960	struct packet_type *ptype;
1961	__be16 type = skb->protocol;
1962	int vlan_depth = ETH_HLEN;
1963	int err;
1964
	/* Step over any stacked 802.1Q headers to find the inner protocol */
1965	while (type == htons(ETH_P_8021Q)) {
1966		struct vlan_hdr *vh;
1967
1968		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1969			return ERR_PTR(-EINVAL);
1970
1971		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1972		type = vh->h_vlan_encapsulated_proto;
1973		vlan_depth += VLAN_HLEN;
1974	}
1975
	/* Record the MAC header length and advance skb->data past it */
1976	skb_reset_mac_header(skb);
1977	skb->mac_len = skb->network_header - skb->mac_header;
1978	__skb_pull(skb, skb->mac_len);
1979
1980	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1981		skb_warn_bad_offload(skb);
1982
1983		if (skb_header_cloned(skb) &&
1984		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1985			return ERR_PTR(err);
1986	}
1987
1988	rcu_read_lock();
1989	list_for_each_entry_rcu(ptype,
1990			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		/* Only the first matching device-independent handler is used */
1991		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1992			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1993				err = ptype->gso_send_check(skb);
1994				segs = ERR_PTR(err);
				/* With err == 0 this leaves segs NULL */
1995				if (err || skb_gso_ok(skb, features))
1996					break;
1997				__skb_push(skb, (skb->data -
1998						 skb_network_header(skb)));
1999			}
2000			segs = ptype->gso_segment(skb, features);
2001			break;
2002		}
2003	}
2004	rcu_read_unlock();
2005
	/* Move skb->data back to the MAC header */
2006	__skb_push(skb, skb->data - skb_mac_header(skb));
2007
2008	return segs;
2009}
2010EXPORT_SYMBOL(skb_gso_segment);
2011
/*
 * Report a hardware receive checksum failure: log a rate-limited error
 * naming the device (or "<unknown>" when @dev is NULL) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	pr_err("%s: hw csum failure\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2023
/*
 * Decide whether the page fragments of @skb are unreachable by @dev's DMA.
 *
 * Returns 1 when a fragment lives in high memory and the device lacks
 * NETIF_F_HIGHDMA, or (on PCI_DMA_BUS_IS_PHYS platforms) when a fragment
 * page lies beyond the parent device's DMA mask or no mask is set at all.
 * Returns 0 otherwise, and always 0 without CONFIG_HIGHMEM.
 *
 * Actually, we should eliminate this check as soon as we know that:
 * 1. an IOMMU is present and allows mapping all of memory, or
 * 2. no high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *f = &skb_shinfo(skb)->frags[idx];

			if (PageHighMem(skb_frag_page(f)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		/* Without a parent device there is no mask to check against */
		if (!parent)
			return 0;

		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *f = &skb_shinfo(skb)->frags[idx];
			dma_addr_t phys = page_to_phys(skb_frag_page(f));

			if (!parent->dma_mask)
				return 1;
			/* The last byte of the page must fit under the mask */
			if (phys + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2056
2057struct dev_gso_cb {
2058	void (*destructor)(struct sk_buff *skb);
2059};
 
 
 
 
 
 
 
2060
2061#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
 
 
 
 
 
 
 
 
 
2062
2063static void dev_gso_skb_destructor(struct sk_buff *skb)
 
2064{
2065	struct dev_gso_cb *cb;
2066
2067	do {
2068		struct sk_buff *nskb = skb->next;
2069
2070		skb->next = nskb->next;
2071		nskb->next = NULL;
2072		kfree_skb(nskb);
2073	} while (skb->next);
 
 
2074
2075	cb = DEV_GSO_CB(skb);
2076	if (cb->destructor)
2077		cb->destructor(skb);
2078}
2079
2080/**
2081 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2082 *	@skb: buffer to segment
2083 *	@features: device features as applicable to this skb
2084 *
2085 *	This function segments the given skb and stores the list of segments
2086 *	in skb->next.
2087 */
2088static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2089{
2090	struct sk_buff *segs;
 
 
2091
2092	segs = skb_gso_segment(skb, features);
 
 
 
 
 
2093
2094	/* Verifying header integrity only. */
2095	if (!segs)
2096		return 0;
 
 
2097
2098	if (IS_ERR(segs))
2099		return PTR_ERR(segs);
2100
2101	skb->next = segs;
2102	DEV_GSO_CB(skb)->destructor = skb->destructor;
2103	skb->destructor = dev_gso_skb_destructor;
 
2104
2105	return 0;
2106}
 
 
 
 
 
 
2107
2108static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2109{
2110	return ((features & NETIF_F_GEN_CSUM) ||
2111		((features & NETIF_F_V4_CSUM) &&
2112		 protocol == htons(ETH_P_IP)) ||
2113		((features & NETIF_F_V6_CSUM) &&
2114		 protocol == htons(ETH_P_IPV6)) ||
2115		((features & NETIF_F_FCOE_CRC) &&
2116		 protocol == htons(ETH_P_FCOE)));
2117}
2118
2119static netdev_features_t harmonize_features(struct sk_buff *skb,
2120	__be16 protocol, netdev_features_t features)
2121{
2122	if (!can_checksum_protocol(features, protocol)) {
2123		features &= ~NETIF_F_ALL_CSUM;
2124		features &= ~NETIF_F_SG;
2125	} else if (illegal_highdma(skb->dev, skb)) {
2126		features &= ~NETIF_F_SG;
2127	}
2128
2129	return features;
2130}
2131
2132netdev_features_t netif_skb_features(struct sk_buff *skb)
2133{
2134	__be16 protocol = skb->protocol;
2135	netdev_features_t features = skb->dev->features;
2136
2137	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2138		features &= ~NETIF_F_GSO_MASK;
2139
2140	if (protocol == htons(ETH_P_8021Q)) {
2141		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2142		protocol = veh->h_vlan_encapsulated_proto;
2143	} else if (!vlan_tx_tag_present(skb)) {
2144		return harmonize_features(skb, protocol, features);
2145	}
2146
2147	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2148
2149	if (protocol != htons(ETH_P_8021Q)) {
2150		return harmonize_features(skb, protocol, features);
2151	} else {
2152		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2153				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2154		return harmonize_features(skb, protocol, features);
2155	}
2156}
2157EXPORT_SYMBOL(netif_skb_features);
2158
2159/*
2160 * Returns true if either:
2161 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2162 *	2. skb is fragmented and the device does not support SG, or if
2163 *	   at least one of fragments is in highmem and device does not
2164 *	   support DMA from it.
2165 */
2166static inline int skb_needs_linearize(struct sk_buff *skb,
2167				      int features)
2168{
2169	return skb_is_nonlinear(skb) &&
2170			((skb_has_frag_list(skb) &&
2171				!(features & NETIF_F_FRAGLIST)) ||
2172			(skb_shinfo(skb)->nr_frags &&
2173				!(features & NETIF_F_SG)));
2174}
2175
2176int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2177			struct netdev_queue *txq)
2178{
2179	const struct net_device_ops *ops = dev->netdev_ops;
2180	int rc = NETDEV_TX_OK;
2181	unsigned int skb_len;
2182
2183	if (likely(!skb->next)) {
2184		netdev_features_t features;
2185
2186		/*
2187		 * If device doesn't need skb->dst, release it right now while
2188		 * its hot in this cpu cache
2189		 */
2190		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2191			skb_dst_drop(skb);
2192
2193		if (!list_empty(&ptype_all))
2194			dev_queue_xmit_nit(skb, dev);
2195
2196		features = netif_skb_features(skb);
 
 
 
 
2197
2198		if (vlan_tx_tag_present(skb) &&
2199		    !(features & NETIF_F_HW_VLAN_TX)) {
2200			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2201			if (unlikely(!skb))
2202				goto out;
2203
2204			skb->vlan_tci = 0;
 
 
 
 
2205		}
2206
2207		if (netif_needs_gso(skb, features)) {
2208			if (unlikely(dev_gso_segment(skb, features)))
2209				goto out_kfree_skb;
2210			if (skb->next)
2211				goto gso;
2212		} else {
2213			if (skb_needs_linearize(skb, features) &&
2214			    __skb_linearize(skb))
2215				goto out_kfree_skb;
2216
2217			/* If packet is not checksummed and device does not
2218			 * support checksumming for this protocol, complete
2219			 * checksumming here.
2220			 */
2221			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2222				skb_set_transport_header(skb,
2223					skb_checksum_start_offset(skb));
2224				if (!(features & NETIF_F_ALL_CSUM) &&
2225				     skb_checksum_help(skb))
2226					goto out_kfree_skb;
2227			}
2228		}
2229
2230		skb_len = skb->len;
2231		rc = ops->ndo_start_xmit(skb, dev);
2232		trace_net_dev_xmit(skb, rc, dev, skb_len);
2233		if (rc == NETDEV_TX_OK)
2234			txq_trans_update(txq);
2235		return rc;
2236	}
2237
2238gso:
2239	do {
2240		struct sk_buff *nskb = skb->next;
2241
2242		skb->next = nskb->next;
2243		nskb->next = NULL;
2244
2245		/*
2246		 * If device doesn't need nskb->dst, release it right now while
2247		 * its hot in this cpu cache
2248		 */
2249		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2250			skb_dst_drop(nskb);
2251
2252		skb_len = nskb->len;
2253		rc = ops->ndo_start_xmit(nskb, dev);
2254		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2255		if (unlikely(rc != NETDEV_TX_OK)) {
2256			if (rc & ~NETDEV_TX_MASK)
2257				goto out_kfree_gso_skb;
2258			nskb->next = skb->next;
2259			skb->next = nskb;
2260			return rc;
2261		}
2262		txq_trans_update(txq);
2263		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2264			return NETDEV_TX_BUSY;
2265	} while (skb->next);
2266
2267out_kfree_gso_skb:
2268	if (likely(skb->next == NULL))
2269		skb->destructor = DEV_GSO_CB(skb)->destructor;
2270out_kfree_skb:
2271	kfree_skb(skb);
2272out:
2273	return rc;
 
2274}
2275
2276static u32 hashrnd __read_mostly;
 
 
 
 
 
 
 
2277
2278/*
2279 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2280 * to be used as a distribution range.
2281 */
2282u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2283		  unsigned int num_tx_queues)
2284{
2285	u32 hash;
2286	u16 qoffset = 0;
2287	u16 qcount = num_tx_queues;
2288
2289	if (skb_rx_queue_recorded(skb)) {
2290		hash = skb_get_rx_queue(skb);
2291		while (unlikely(hash >= num_tx_queues))
2292			hash -= num_tx_queues;
2293		return hash;
2294	}
2295
2296	if (dev->num_tc) {
2297		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2298		qoffset = dev->tc_to_txq[tc].offset;
2299		qcount = dev->tc_to_txq[tc].count;
 
 
2300	}
2301
2302	if (skb->sk && skb->sk->sk_hash)
2303		hash = skb->sk->sk_hash;
2304	else
2305		hash = (__force u16) skb->protocol;
2306	hash = jhash_1word(hash, hashrnd);
2307
2308	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2309}
2310EXPORT_SYMBOL(__skb_tx_hash);
2311
2312static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2313{
2314	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2315		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2316				     dev->name, queue_index,
2317				     dev->real_num_tx_queues);
2318		return 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2319	}
2320	return queue_index;
 
 
 
 
 
 
 
 
 
2321}
2322
/*
 * Pick a Tx queue for @skb from the XPS map of the current CPU.
 *
 * Returns the queue index, or -1 when XPS is not configured, no map
 * exists for this CPU, or the selected queue is outside the device's
 * real_num_tx_queues.  With several candidate queues the choice is made
 * by hashing the socket hash (or protocol ^ rxhash when there is none).
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int txq = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		txq = cpu_map->queues[0];
	} else {
		u32 key;

		if (skb->sk && skb->sk->sk_hash)
			key = skb->sk->sk_hash;
		else
			key = (__force u16) skb->protocol ^
			    skb->rxhash;
		key = jhash_1word(key, hashrnd);
		/* Scale the hash onto [0, len) to index the queue list */
		txq = cpu_map->queues[((u64)key * cpu_map->len) >> 32];
	}

	if (unlikely(txq >= dev->real_num_tx_queues))
		txq = -1;

unlock:
	rcu_read_unlock();

	return txq;
#else
	return -1;
#endif
}
 
2360
2361static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2362					struct sk_buff *skb)
2363{
2364	int queue_index;
2365	const struct net_device_ops *ops = dev->netdev_ops;
2366
2367	if (dev->real_num_tx_queues == 1)
2368		queue_index = 0;
2369	else if (ops->ndo_select_queue) {
2370		queue_index = ops->ndo_select_queue(dev, skb);
2371		queue_index = dev_cap_txqueue(dev, queue_index);
2372	} else {
2373		struct sock *sk = skb->sk;
2374		queue_index = sk_tx_queue_get(sk);
2375
2376		if (queue_index < 0 || skb->ooo_okay ||
2377		    queue_index >= dev->real_num_tx_queues) {
2378			int old_index = queue_index;
2379
2380			queue_index = get_xps_queue(dev, skb);
2381			if (queue_index < 0)
2382				queue_index = skb_tx_hash(dev, skb);
2383
2384			if (queue_index != old_index && sk) {
2385				struct dst_entry *dst =
2386				    rcu_dereference_check(sk->sk_dst_cache, 1);
 
 
 
 
 
 
 
 
 
 
2387
2388				if (dst && skb_dst(skb) == dst)
2389					sk_tx_queue_set(sk, queue_index);
2390			}
2391		}
 
 
 
 
 
 
2392	}
 
2393
2394	skb_set_queue_mapping(skb, queue_index);
2395	return netdev_get_tx_queue(dev, queue_index);
 
 
 
 
 
 
 
 
2396}
2397
2398static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2399				 struct net_device *dev,
2400				 struct netdev_queue *txq)
2401{
2402	spinlock_t *root_lock = qdisc_lock(q);
 
2403	bool contended;
2404	int rc;
2405
2406	qdisc_skb_cb(skb)->pkt_len = skb->len;
2407	qdisc_calculate_pkt_len(skb, q);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2408	/*
2409	 * Heuristic to force contended enqueues to serialize on a
2410	 * separate lock before trying to get qdisc main lock.
2411	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2412	 * and dequeue packets faster.
 
 
 
 
2413	 */
2414	contended = qdisc_is_running(q);
2415	if (unlikely(contended))
2416		spin_lock(&q->busylock);
2417
2418	spin_lock(root_lock);
2419	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2420		kfree_skb(skb);
2421		rc = NET_XMIT_DROP;
2422	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2423		   qdisc_run_begin(q)) {
2424		/*
2425		 * This is a work-conserving queue; there are no old skbs
2426		 * waiting to be sent out; and the qdisc is not running -
2427		 * xmit the skb directly.
2428		 */
2429		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2430			skb_dst_force(skb);
2431
2432		qdisc_bstats_update(q, skb);
2433
2434		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2435			if (unlikely(contended)) {
2436				spin_unlock(&q->busylock);
2437				contended = false;
2438			}
2439			__qdisc_run(q);
2440		} else
2441			qdisc_run_end(q);
2442
 
2443		rc = NET_XMIT_SUCCESS;
2444	} else {
2445		skb_dst_force(skb);
2446		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2447		if (qdisc_run_begin(q)) {
2448			if (unlikely(contended)) {
2449				spin_unlock(&q->busylock);
2450				contended = false;
2451			}
2452			__qdisc_run(q);
 
2453		}
2454	}
2455	spin_unlock(root_lock);
 
 
2456	if (unlikely(contended))
2457		spin_unlock(&q->busylock);
2458	return rc;
2459}
2460
2461#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2462static void skb_update_prio(struct sk_buff *skb)
2463{
2464	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
 
 
2465
2466	if (!skb->priority && skb->sk && map) {
2467		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
 
 
 
 
 
 
2468
2469		if (prioidx < map->priomap_len)
2470			skb->priority = map->priomap[prioidx];
2471	}
 
2472}
2473#else
2474#define skb_update_prio(skb)
2475#endif
2476
2477static DEFINE_PER_CPU(int, xmit_recursion);
2478#define RECURSION_LIMIT 10
2479
2480/**
2481 *	dev_queue_xmit - transmit a buffer
 
 
2482 *	@skb: buffer to transmit
2483 *
2484 *	Queue a buffer for transmission to a network device. The caller must
2485 *	have set the device and priority and built the buffer before calling
2486 *	this function. The function can be called from an interrupt.
2487 *
2488 *	A negative errno code is returned on a failure. A success does not
2489 *	guarantee the frame will be transmitted as it may be dropped due
2490 *	to congestion or traffic shaping.
2491 *
2492 * -----------------------------------------------------------------------------------
2493 *      I notice this method can also return errors from the queue disciplines,
2494 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2495 *      be positive.
2496 *
2497 *      Regardless of the return value, the skb is consumed, so it is currently
2498 *      difficult to retry a send to this method.  (You can bump the ref count
2499 *      before sending to hold a reference for retry if you are careful.)
2500 *
2501 *      When calling this method, interrupts MUST be enabled.  This is because
2502 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2503 *          --BLG
2504 */
2505int dev_queue_xmit(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2506{
2507	struct net_device *dev = skb->dev;
2508	struct netdev_queue *txq;
2509	struct Qdisc *q;
2510	int rc = -ENOMEM;
 
 
 
 
 
 
 
2511
2512	/* Disable soft irqs for various locks below. Also
2513	 * stops preemption for RCU.
2514	 */
2515	rcu_read_lock_bh();
2516
2517	skb_update_prio(skb);
2518
2519	txq = dev_pick_tx(dev, skb);
2520	q = rcu_dereference_bh(txq->qdisc);
2521
2522#ifdef CONFIG_NET_CLS_ACT
2523	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2524#endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2525	trace_net_dev_queue(skb);
2526	if (q->enqueue) {
2527		rc = __dev_xmit_skb(skb, q, dev, txq);
2528		goto out;
2529	}
2530
2531	/* The device has no queue. Common case for software devices:
2532	   loopback, all the sorts of tunnels...
2533
2534	   Really, it is unlikely that netif_tx_lock protection is necessary
2535	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2536	   counters.)
2537	   However, it is possible, that they rely on protection
2538	   made by us here.
2539
2540	   Check this and shot the lock. It is not prone from deadlocks.
2541	   Either shot noqueue qdisc, it is even simpler 8)
2542	 */
2543	if (dev->flags & IFF_UP) {
2544		int cpu = smp_processor_id(); /* ok because BHs are off */
2545
2546		if (txq->xmit_lock_owner != cpu) {
2547
2548			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
 
 
2549				goto recursion_alert;
2550
 
 
 
 
2551			HARD_TX_LOCK(dev, txq, cpu);
2552
2553			if (!netif_xmit_stopped(txq)) {
2554				__this_cpu_inc(xmit_recursion);
2555				rc = dev_hard_start_xmit(skb, dev, txq);
2556				__this_cpu_dec(xmit_recursion);
2557				if (dev_xmit_complete(rc)) {
2558					HARD_TX_UNLOCK(dev, txq);
2559					goto out;
2560				}
2561			}
2562			HARD_TX_UNLOCK(dev, txq);
2563			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2564					     dev->name);
2565		} else {
2566			/* Recursion is detected! It is possible,
2567			 * unfortunately
2568			 */
2569recursion_alert:
2570			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2571					     dev->name);
2572		}
2573	}
2574
2575	rc = -ENETDOWN;
2576	rcu_read_unlock_bh();
2577
2578	kfree_skb(skb);
 
2579	return rc;
2580out:
2581	rcu_read_unlock_bh();
2582	return rc;
2583}
2584EXPORT_SYMBOL(dev_queue_xmit);
 
 
 
 
 
 
 
 
2585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2586
2587/*=======================================================================
2588			Receiver routines
2589  =======================================================================*/
 
 
 
 
 
 
 
 
 
2590
2591int netdev_max_backlog __read_mostly = 1000;
 
 
2592int netdev_tstamp_prequeue __read_mostly = 1;
 
2593int netdev_budget __read_mostly = 300;
2594int weight_p __read_mostly = 64;            /* old backlog weight */
 
 
 
 
 
 
2595
2596/* Called with irq disabled */
2597static inline void ____napi_schedule(struct softnet_data *sd,
2598				     struct napi_struct *napi)
2599{
2600	list_add_tail(&napi->poll_list, &sd->poll_list);
2601	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2602}
2603
2604/*
2605 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2606 * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2607 * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2608 * if hash is a canonical 4-tuple hash over transport ports.
2609 */
2610void __skb_get_rxhash(struct sk_buff *skb)
2611{
2612	struct flow_keys keys;
2613	u32 hash;
2614
2615	if (!skb_flow_dissect(skb, &keys))
2616		return;
2617
2618	if (keys.ports) {
2619		if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2620			swap(keys.port16[0], keys.port16[1]);
2621		skb->l4_rxhash = 1;
 
 
 
 
 
 
 
 
 
 
 
 
2622	}
2623
2624	/* get a consistent hash (same value on both flow directions) */
2625	if ((__force u32)keys.dst < (__force u32)keys.src)
2626		swap(keys.dst, keys.src);
2627
2628	hash = jhash_3words((__force u32)keys.dst,
2629			    (__force u32)keys.src,
2630			    (__force u32)keys.ports, hashrnd);
2631	if (!hash)
2632		hash = 1;
2633
2634	skb->rxhash = hash;
2635}
2636EXPORT_SYMBOL(__skb_get_rxhash);
2637
2638#ifdef CONFIG_RPS
2639
2640/* One global table that all flow-based protocols share. */
2641struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2642EXPORT_SYMBOL(rps_sock_flow_table);
 
 
2643
2644struct static_key rps_needed __read_mostly;
 
 
 
2645
2646static struct rps_dev_flow *
2647set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2648	    struct rps_dev_flow *rflow, u16 next_cpu)
2649{
2650	if (next_cpu != RPS_NO_CPU) {
2651#ifdef CONFIG_RFS_ACCEL
2652		struct netdev_rx_queue *rxqueue;
2653		struct rps_dev_flow_table *flow_table;
2654		struct rps_dev_flow *old_rflow;
2655		u32 flow_id;
2656		u16 rxq_index;
2657		int rc;
2658
2659		/* Should we steer this flow to a different hardware queue? */
2660		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2661		    !(dev->features & NETIF_F_NTUPLE))
2662			goto out;
2663		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2664		if (rxq_index == skb_get_rx_queue(skb))
2665			goto out;
2666
2667		rxqueue = dev->_rx + rxq_index;
2668		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2669		if (!flow_table)
2670			goto out;
2671		flow_id = skb->rxhash & flow_table->mask;
2672		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2673							rxq_index, flow_id);
2674		if (rc < 0)
2675			goto out;
2676		old_rflow = rflow;
2677		rflow = &flow_table->flows[flow_id];
2678		rflow->filter = rc;
2679		if (old_rflow->filter == rflow->filter)
2680			old_rflow->filter = RPS_NO_FILTER;
2681	out:
2682#endif
2683		rflow->last_qtail =
2684			per_cpu(softnet_data, next_cpu).input_queue_head;
2685	}
2686
2687	rflow->cpu = next_cpu;
2688	return rflow;
2689}
2690
2691/*
2692 * get_rps_cpu is called from netif_receive_skb and returns the target
2693 * CPU from the RPS map of the receiving queue for a given skb.
2694 * rcu_read_lock must be held on entry.
2695 */
2696static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2697		       struct rps_dev_flow **rflowp)
2698{
2699	struct netdev_rx_queue *rxqueue;
2700	struct rps_map *map;
2701	struct rps_dev_flow_table *flow_table;
2702	struct rps_sock_flow_table *sock_flow_table;
2703	int cpu = -1;
2704	u16 tcpu;
 
2705
2706	if (skb_rx_queue_recorded(skb)) {
2707		u16 index = skb_get_rx_queue(skb);
 
2708		if (unlikely(index >= dev->real_num_rx_queues)) {
2709			WARN_ONCE(dev->real_num_rx_queues > 1,
2710				  "%s received packet on queue %u, but number "
2711				  "of RX queues is %u\n",
2712				  dev->name, index, dev->real_num_rx_queues);
2713			goto done;
2714		}
2715		rxqueue = dev->_rx + index;
2716	} else
2717		rxqueue = dev->_rx;
 
2718
 
2719	map = rcu_dereference(rxqueue->rps_map);
2720	if (map) {
2721		if (map->len == 1 &&
2722		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2723			tcpu = map->cpus[0];
2724			if (cpu_online(tcpu))
2725				cpu = tcpu;
2726			goto done;
2727		}
2728	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2729		goto done;
2730	}
2731
2732	skb_reset_network_header(skb);
2733	if (!skb_get_rxhash(skb))
 
2734		goto done;
2735
2736	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2737	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2738	if (flow_table && sock_flow_table) {
2739		u16 next_cpu;
2740		struct rps_dev_flow *rflow;
 
 
2741
2742		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2743		tcpu = rflow->cpu;
 
 
 
 
2744
2745		next_cpu = sock_flow_table->ents[skb->rxhash &
2746		    sock_flow_table->mask];
 
 
 
2747
2748		/*
2749		 * If the desired CPU (where last recvmsg was done) is
2750		 * different from current CPU (one in the rx-queue flow
2751		 * table entry), switch if one of the following holds:
2752		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2753		 *   - Current CPU is offline.
2754		 *   - The current CPU's queue tail has advanced beyond the
2755		 *     last packet that was enqueued using this table entry.
2756		 *     This guarantees that all previous packets for the flow
2757		 *     have been dequeued, thus preserving in order delivery.
2758		 */
2759		if (unlikely(tcpu != next_cpu) &&
2760		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2761		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2762		      rflow->last_qtail)) >= 0))
 
2763			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 
2764
2765		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2766			*rflowp = rflow;
2767			cpu = tcpu;
2768			goto done;
2769		}
2770	}
2771
2772	if (map) {
2773		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2774
 
 
2775		if (cpu_online(tcpu)) {
2776			cpu = tcpu;
2777			goto done;
2778		}
2779	}
2780
2781done:
2782	return cpu;
2783}
2784
#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 *
 * A filter is kept (%false) only while the flow entry still carries
 * @filter_id, has a CPU assigned, and that CPU's input queue head has
 * advanced less than 10 * the flow table mask past the entry's
 * last_qtail.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxq = dev->_rx + rxq_index;
	struct rps_dev_flow_table *table;
	struct rps_dev_flow *flow;
	bool expire = true;
	int cpu;

	rcu_read_lock();

	table = rcu_dereference(rxq->rps_flow_table);
	if (!table || flow_id > table->mask)
		goto unlock;

	flow = &table->flows[flow_id];
	cpu = ACCESS_ONCE(flow->cpu);
	if (flow->filter != filter_id || cpu == RPS_NO_CPU)
		goto unlock;

	/* Still recently used: keep the filter */
	if ((int)(per_cpu(softnet_data, cpu).input_queue_head -
		  flow->last_qtail) <
	    (int)(10 * table->mask))
		expire = false;

unlock:
	rcu_read_unlock();
	return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */
2824
2825/* Called from hardirq (IPI) context */
2826static void rps_trigger_softirq(void *data)
2827{
2828	struct softnet_data *sd = data;
2829
2830	____napi_schedule(sd, &sd->backlog);
2831	sd->received_rps++;
2832}
2833
2834#endif /* CONFIG_RPS */
2835
 
 
 
 
 
 
 
 
 
/*
 * Check whether @sd belongs to another CPU.
 * If it does, chain it onto this CPU's IPI list, raise the Rx softirq
 * and return 1.  If it is our own (or RPS is not configured), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	/* Push @sd on the front of our pending-IPI list */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2856
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2857/*
2858 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2859 * queue (may be a remote CPU queue).
2860 */
2861static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2862			      unsigned int *qtail)
2863{
 
2864	struct softnet_data *sd;
2865	unsigned long flags;
 
2866
 
2867	sd = &per_cpu(softnet_data, cpu);
2868
2869	local_irq_save(flags);
2870
2871	rps_lock(sd);
2872	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2873		if (skb_queue_len(&sd->input_pkt_queue)) {
 
2874enqueue:
2875			__skb_queue_tail(&sd->input_pkt_queue, skb);
2876			input_queue_tail_incr_save(sd, qtail);
2877			rps_unlock(sd);
2878			local_irq_restore(flags);
2879			return NET_RX_SUCCESS;
2880		}
2881
2882		/* Schedule NAPI for backlog device
2883		 * We can use non atomic operation since we own the queue lock
2884		 */
2885		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2886			if (!rps_ipi_queued(sd))
2887				____napi_schedule(sd, &sd->backlog);
2888		}
2889		goto enqueue;
2890	}
 
2891
 
2892	sd->dropped++;
2893	rps_unlock(sd);
2894
2895	local_irq_restore(flags);
2896
2897	atomic_long_inc(&skb->dev->rx_dropped);
2898	kfree_skb(skb);
2899	return NET_RX_DROP;
2900}
2901
2902/**
2903 *	netif_rx	-	post buffer to the network code
2904 *	@skb: buffer to post
2905 *
2906 *	This function receives a packet from a device driver and queues it for
2907 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2908 *	may be dropped during processing for congestion control or by the
2909 *	protocol layers.
2910 *
2911 *	return values:
2912 *	NET_RX_SUCCESS	(no congestion)
2913 *	NET_RX_DROP     (packet was dropped)
2914 *
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2915 */
 
 
 
 
 
 
2916
2917int netif_rx(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2918{
2919	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2920
2921	/* if netpoll wants it, pretend we never saw it */
2922	if (netpoll_rx(skb))
2923		return NET_RX_DROP;
2924
2925	net_timestamp_check(netdev_tstamp_prequeue, skb);
2926
2927	trace_netif_rx(skb);
 
2928#ifdef CONFIG_RPS
2929	if (static_key_false(&rps_needed)) {
2930		struct rps_dev_flow voidflow, *rflow = &voidflow;
2931		int cpu;
2932
2933		preempt_disable();
2934		rcu_read_lock();
2935
2936		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2937		if (cpu < 0)
2938			cpu = smp_processor_id();
2939
2940		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2941
2942		rcu_read_unlock();
2943		preempt_enable();
2944	} else
2945#endif
2946	{
2947		unsigned int qtail;
2948		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2949		put_cpu();
2950	}
2951	return ret;
2952}
2953EXPORT_SYMBOL(netif_rx);
2954
/*
 * Variant of netif_rx() that, with preemption disabled, also runs any
 * softirqs left pending after the packet was queued.  Returns the value
 * returned by netif_rx().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2968
2969static void net_tx_action(struct softirq_action *h)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2970{
2971	struct softnet_data *sd = &__get_cpu_var(softnet_data);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2972
2973	if (sd->completion_queue) {
2974		struct sk_buff *clist;
2975
2976		local_irq_disable();
2977		clist = sd->completion_queue;
2978		sd->completion_queue = NULL;
2979		local_irq_enable();
2980
2981		while (clist) {
2982			struct sk_buff *skb = clist;
 
2983			clist = clist->next;
2984
2985			WARN_ON(atomic_read(&skb->users));
2986			trace_kfree_skb(skb, net_tx_action);
2987			__kfree_skb(skb);
 
 
 
 
 
 
 
 
2988		}
2989	}
2990
2991	if (sd->output_queue) {
2992		struct Qdisc *head;
2993
2994		local_irq_disable();
2995		head = sd->output_queue;
2996		sd->output_queue = NULL;
2997		sd->output_queue_tailp = &sd->output_queue;
2998		local_irq_enable();
2999
 
 
3000		while (head) {
3001			struct Qdisc *q = head;
3002			spinlock_t *root_lock;
3003
3004			head = head->next_sched;
3005
3006			root_lock = qdisc_lock(q);
3007			if (spin_trylock(root_lock)) {
3008				smp_mb__before_clear_bit();
3009				clear_bit(__QDISC_STATE_SCHED,
3010					  &q->state);
3011				qdisc_run(q);
3012				spin_unlock(root_lock);
3013			} else {
3014				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3015					      &q->state)) {
3016					__netif_reschedule(q);
3017				} else {
3018					smp_mb__before_clear_bit();
3019					clear_bit(__QDISC_STATE_SCHED,
3020						  &q->state);
3021				}
 
 
 
 
3022			}
 
 
 
 
 
3023		}
 
 
3024	}
 
 
3025}
3026
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
/*
 * Hook pointer defined here for ATM LANE; only built when both the
 * bridge and ATM LANE are configured (built-in or modular).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
#endif
3034
3035#ifdef CONFIG_NET_CLS_ACT
3036/* TODO: Maybe we should just force sch_ingress to be compiled in
3037 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3038 * a compare and 2 stores extra right now if we dont have it on
3039 * but have CONFIG_NET_CLS_ACT
3040 * NOTE: This doesn't stop any functionality; if you dont have
3041 * the ingress scheduler, you just can't add policies on ingress.
3042 *
3043 */
3044static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3045{
3046	struct net_device *dev = skb->dev;
3047	u32 ttl = G_TC_RTTL(skb->tc_verd);
3048	int result = TC_ACT_OK;
3049	struct Qdisc *q;
3050
3051	if (unlikely(MAX_RED_LOOP < ttl++)) {
3052		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3053				     skb->skb_iif, dev->ifindex);
3054		return TC_ACT_SHOT;
3055	}
3056
3057	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3058	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3059
3060	q = rxq->qdisc;
3061	if (q != &noop_qdisc) {
3062		spin_lock(qdisc_lock(q));
3063		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3064			result = qdisc_enqueue_root(skb, q);
3065		spin_unlock(qdisc_lock(q));
3066	}
3067
3068	return result;
3069}
3070
3071static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3072					 struct packet_type **pt_prev,
3073					 int *ret, struct net_device *orig_dev)
3074{
3075	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
 
 
3076
3077	if (!rxq || rxq->qdisc == &noop_qdisc)
3078		goto out;
 
 
 
 
 
3079
3080	if (*pt_prev) {
3081		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3082		*pt_prev = NULL;
3083	}
3084
3085	switch (ing_filter(skb, rxq)) {
 
 
 
 
 
 
 
 
 
 
3086	case TC_ACT_SHOT:
 
 
 
 
3087	case TC_ACT_STOLEN:
3088		kfree_skb(skb);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3089		return NULL;
 
 
 
 
 
3090	}
3091
3092out:
3093	skb->tc_verd = 0;
3094	return skb;
3095}
3096#endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3097
3098/**
3099 *	netdev_rx_handler_register - register receive handler
3100 *	@dev: device to register a handler for
3101 *	@rx_handler: receive handler to register
3102 *	@rx_handler_data: data pointer that is used by rx handler
3103 *
3104 *	Register a receive hander for a device. This handler will then be
3105 *	called from __netif_receive_skb. A negative errno code is returned
3106 *	on a failure.
3107 *
3108 *	The caller must hold the rtnl_mutex.
3109 *
3110 *	For a general description of rx_handler, see enum rx_handler_result.
3111 */
3112int netdev_rx_handler_register(struct net_device *dev,
3113			       rx_handler_func_t *rx_handler,
3114			       void *rx_handler_data)
3115{
3116	ASSERT_RTNL();
3117
3118	if (dev->rx_handler)
3119		return -EBUSY;
3120
 
 
 
 
3121	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3122	rcu_assign_pointer(dev->rx_handler, rx_handler);
3123
3124	return 0;
3125}
3126EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3127
3128/**
3129 *	netdev_rx_handler_unregister - unregister receive handler
3130 *	@dev: device to unregister a handler from
3131 *
3132 *	Unregister a receive hander from a device.
3133 *
3134 *	The caller must hold the rtnl_mutex.
3135 */
3136void netdev_rx_handler_unregister(struct net_device *dev)
3137{
3138
3139	ASSERT_RTNL();
3140	RCU_INIT_POINTER(dev->rx_handler, NULL);
 
 
 
 
 
3141	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3142}
3143EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3144
3145static int __netif_receive_skb(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3146{
3147	struct packet_type *ptype, *pt_prev;
3148	rx_handler_func_t *rx_handler;
 
3149	struct net_device *orig_dev;
3150	struct net_device *null_or_dev;
3151	bool deliver_exact = false;
3152	int ret = NET_RX_DROP;
3153	__be16 type;
3154
3155	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3156
3157	trace_netif_receive_skb(skb);
3158
3159	/* if we've gotten here through NAPI, check netpoll */
3160	if (netpoll_receive_skb(skb))
3161		return NET_RX_DROP;
3162
3163	if (!skb->skb_iif)
3164		skb->skb_iif = skb->dev->ifindex;
3165	orig_dev = skb->dev;
3166
3167	skb_reset_network_header(skb);
3168	skb_reset_transport_header(skb);
 
3169	skb_reset_mac_len(skb);
3170
3171	pt_prev = NULL;
3172
3173	rcu_read_lock();
3174
3175another_round:
 
3176
3177	__this_cpu_inc(softnet_data.processed);
3178
3179	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3180		skb = vlan_untag(skb);
3181		if (unlikely(!skb))
 
 
 
 
 
 
3182			goto out;
 
3183	}
3184
3185#ifdef CONFIG_NET_CLS_ACT
3186	if (skb->tc_verd & TC_NCLS) {
3187		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3188		goto ncls;
3189	}
3190#endif
 
 
 
 
 
3191
3192	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3193		if (!ptype->dev || ptype->dev == skb->dev) {
3194			if (pt_prev)
3195				ret = deliver_skb(skb, pt_prev, orig_dev);
3196			pt_prev = ptype;
3197		}
3198	}
3199
3200#ifdef CONFIG_NET_CLS_ACT
3201	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3202	if (!skb)
3203		goto out;
3204ncls:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3205#endif
 
 
 
 
3206
3207	rx_handler = rcu_dereference(skb->dev->rx_handler);
3208	if (vlan_tx_tag_present(skb)) {
3209		if (pt_prev) {
3210			ret = deliver_skb(skb, pt_prev, orig_dev);
3211			pt_prev = NULL;
3212		}
3213		if (vlan_do_receive(&skb, !rx_handler))
3214			goto another_round;
3215		else if (unlikely(!skb))
3216			goto out;
3217	}
3218
 
3219	if (rx_handler) {
3220		if (pt_prev) {
3221			ret = deliver_skb(skb, pt_prev, orig_dev);
3222			pt_prev = NULL;
3223		}
3224		switch (rx_handler(&skb)) {
3225		case RX_HANDLER_CONSUMED:
 
3226			goto out;
3227		case RX_HANDLER_ANOTHER:
3228			goto another_round;
3229		case RX_HANDLER_EXACT:
3230			deliver_exact = true;
 
3231		case RX_HANDLER_PASS:
3232			break;
3233		default:
3234			BUG();
3235		}
3236	}
3237
3238	/* deliver only exact match when indicated */
3239	null_or_dev = deliver_exact ? skb->dev : NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3240
3241	type = skb->protocol;
3242	list_for_each_entry_rcu(ptype,
3243			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3244		if (ptype->type == type &&
3245		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3246		     ptype->dev == orig_dev)) {
3247			if (pt_prev)
3248				ret = deliver_skb(skb, pt_prev, orig_dev);
3249			pt_prev = ptype;
3250		}
 
 
 
 
 
3251	}
3252
3253	if (pt_prev) {
3254		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 
 
3255	} else {
3256		atomic_long_inc(&skb->dev->rx_dropped);
3257		kfree_skb(skb);
 
 
 
 
3258		/* Jamal, now you will not able to escape explaining
3259		 * me how you were going to use this. :-)
3260		 */
3261		ret = NET_RX_DROP;
3262	}
3263
3264out:
3265	rcu_read_unlock();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3266	return ret;
3267}
3268
3269/**
3270 *	netif_receive_skb - process receive buffer from network
3271 *	@skb: buffer to process
3272 *
3273 *	netif_receive_skb() is the main receive data processing function.
3274 *	It always succeeds. The buffer may be dropped during processing
3275 *	for congestion control or by the protocol layers.
3276 *
3277 *	This function may only be called from softirq context and interrupts
3278 *	should be enabled.
3279 *
3280 *	Return values (usually ignored):
3281 *	NET_RX_SUCCESS: no congestion
3282 *	NET_RX_DROP: packet was dropped
3283 */
3284int netif_receive_skb(struct sk_buff *skb)
3285{
3286	net_timestamp_check(netdev_tstamp_prequeue, skb);
3287
3288	if (skb_defer_rx_timestamp(skb))
3289		return NET_RX_SUCCESS;
3290
3291#ifdef CONFIG_RPS
3292	if (static_key_false(&rps_needed)) {
3293		struct rps_dev_flow voidflow, *rflow = &voidflow;
3294		int cpu, ret;
3295
3296		rcu_read_lock();
 
 
3297
3298		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 
 
 
 
3299
3300		if (cpu >= 0) {
3301			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3302			rcu_read_unlock();
3303			return ret;
 
 
 
 
 
 
 
3304		}
3305		rcu_read_unlock();
3306	}
3307#endif
3308	return __netif_receive_skb(skb);
3309}
3310EXPORT_SYMBOL(netif_receive_skb);
3311
3312/* Network device is going away, flush any packets still pending
3313 * Called with irqs disabled.
3314 */
3315static void flush_backlog(void *arg)
3316{
3317	struct net_device *dev = arg;
3318	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3319	struct sk_buff *skb, *tmp;
 
 
 
 
 
 
 
 
 
3320
3321	rps_lock(sd);
3322	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3323		if (skb->dev == dev) {
3324			__skb_unlink(skb, &sd->input_pkt_queue);
3325			kfree_skb(skb);
3326			input_queue_head_incr(sd);
 
 
 
 
 
 
 
 
 
 
3327		}
 
3328	}
3329	rps_unlock(sd);
3330
3331	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3332		if (skb->dev == dev) {
3333			__skb_unlink(skb, &sd->process_queue);
3334			kfree_skb(skb);
3335			input_queue_head_incr(sd);
3336		}
3337	}
3338}
3339
3340static int napi_gro_complete(struct sk_buff *skb)
3341{
3342	struct packet_type *ptype;
3343	__be16 type = skb->protocol;
3344	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3345	int err = -ENOENT;
3346
3347	if (NAPI_GRO_CB(skb)->count == 1) {
3348		skb_shinfo(skb)->gso_size = 0;
3349		goto out;
3350	}
3351
3352	rcu_read_lock();
3353	list_for_each_entry_rcu(ptype, head, list) {
3354		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3355			continue;
3356
3357		err = ptype->gro_complete(skb);
3358		break;
3359	}
3360	rcu_read_unlock();
3361
3362	if (err) {
3363		WARN_ON(&ptype->list == head);
3364		kfree_skb(skb);
3365		return NET_RX_SUCCESS;
3366	}
 
 
 
 
 
 
 
 
 
3367
3368out:
3369	return netif_receive_skb(skb);
3370}
3371
3372inline void napi_gro_flush(struct napi_struct *napi)
3373{
 
3374	struct sk_buff *skb, *next;
 
3375
3376	for (skb = napi->gro_list; skb; skb = next) {
3377		next = skb->next;
3378		skb->next = NULL;
3379		napi_gro_complete(skb);
 
 
 
 
 
 
 
 
 
 
 
3380	}
3381
3382	napi->gro_count = 0;
3383	napi->gro_list = NULL;
 
 
 
3384}
3385EXPORT_SYMBOL(napi_gro_flush);
3386
3387enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3388{
3389	struct sk_buff **pp = NULL;
3390	struct packet_type *ptype;
3391	__be16 type = skb->protocol;
3392	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3393	int same_flow;
3394	int mac_len;
3395	enum gro_result ret;
3396
3397	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3398		goto normal;
3399
3400	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3401		goto normal;
3402
3403	rcu_read_lock();
3404	list_for_each_entry_rcu(ptype, head, list) {
3405		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3406			continue;
3407
3408		skb_set_network_header(skb, skb_gro_offset(skb));
3409		mac_len = skb->network_header - skb->mac_header;
3410		skb->mac_len = mac_len;
3411		NAPI_GRO_CB(skb)->same_flow = 0;
3412		NAPI_GRO_CB(skb)->flush = 0;
3413		NAPI_GRO_CB(skb)->free = 0;
3414
3415		pp = ptype->gro_receive(&napi->gro_list, skb);
 
 
 
 
 
 
3416		break;
3417	}
3418	rcu_read_unlock();
3419
3420	if (&ptype->list == head)
3421		goto normal;
3422
3423	same_flow = NAPI_GRO_CB(skb)->same_flow;
3424	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3425
3426	if (pp) {
3427		struct sk_buff *nskb = *pp;
3428
3429		*pp = nskb->next;
3430		nskb->next = NULL;
3431		napi_gro_complete(nskb);
3432		napi->gro_count--;
3433	}
3434
3435	if (same_flow)
3436		goto ok;
3437
3438	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3439		goto normal;
3440
3441	napi->gro_count++;
3442	NAPI_GRO_CB(skb)->count = 1;
3443	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3444	skb->next = napi->gro_list;
3445	napi->gro_list = skb;
3446	ret = GRO_HELD;
3447
3448pull:
3449	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3450		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3451
3452		BUG_ON(skb->end - skb->tail < grow);
 
 
3453
3454		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3455
3456		skb->tail += grow;
3457		skb->data_len -= grow;
3458
3459		skb_shinfo(skb)->frags[0].page_offset += grow;
3460		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
 
 
 
3461
3462		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3463			skb_frag_unref(skb, 0);
3464			memmove(skb_shinfo(skb)->frags,
3465				skb_shinfo(skb)->frags + 1,
3466				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3467		}
3468	}
3469
3470ok:
 
3471	return ret;
3472
3473normal:
3474	ret = GRO_NORMAL;
3475	goto pull;
3476}
3477EXPORT_SYMBOL(dev_gro_receive);
3478
3479static inline gro_result_t
3480__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3481{
3482	struct sk_buff *p;
3483	unsigned int maclen = skb->dev->hard_header_len;
3484
3485	for (p = napi->gro_list; p; p = p->next) {
3486		unsigned long diffs;
3487
3488		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3489		diffs |= p->vlan_tci ^ skb->vlan_tci;
3490		if (maclen == ETH_HLEN)
3491			diffs |= compare_ether_header(skb_mac_header(p),
3492						      skb_gro_mac_header(skb));
3493		else if (!diffs)
3494			diffs = memcmp(skb_mac_header(p),
3495				       skb_gro_mac_header(skb),
3496				       maclen);
3497		NAPI_GRO_CB(p)->same_flow = !diffs;
3498		NAPI_GRO_CB(p)->flush = 0;
3499	}
 
3500
3501	return dev_gro_receive(napi, skb);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3502}
3503
3504gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3505{
3506	switch (ret) {
3507	case GRO_NORMAL:
3508		if (netif_receive_skb(skb))
3509			ret = GRO_DROP;
3510		break;
3511
3512	case GRO_DROP:
3513		kfree_skb(skb);
3514		break;
3515
3516	case GRO_MERGED_FREE:
3517		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3518			kmem_cache_free(skbuff_head_cache, skb);
3519		else
3520			__kfree_skb(skb);
3521		break;
3522
3523	case GRO_HELD:
3524	case GRO_MERGED:
3525		break;
3526	}
3527
3528	return ret;
3529}
3530EXPORT_SYMBOL(napi_skb_finish);
3531
3532void skb_gro_reset_offset(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
3533{
3534	NAPI_GRO_CB(skb)->data_offset = 0;
3535	NAPI_GRO_CB(skb)->frag0 = NULL;
3536	NAPI_GRO_CB(skb)->frag0_len = 0;
3537
3538	if (skb->mac_header == skb->tail &&
3539	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3540		NAPI_GRO_CB(skb)->frag0 =
3541			skb_frag_address(&skb_shinfo(skb)->frags[0]);
3542		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3543	}
 
 
3544}
3545EXPORT_SYMBOL(skb_gro_reset_offset);
3546
3547gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3548{
3549	skb_gro_reset_offset(skb);
3550
3551	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3552}
3553EXPORT_SYMBOL(napi_gro_receive);
3554
3555static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3556{
3557	__skb_pull(skb, skb_headlen(skb));
3558	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3559	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3560	skb->vlan_tci = 0;
3561	skb->dev = napi->dev;
3562	skb->skb_iif = 0;
3563
3564	napi->skb = skb;
3565}
3566
3567struct sk_buff *napi_get_frags(struct napi_struct *napi)
3568{
3569	struct sk_buff *skb = napi->skb;
 
 
 
 
 
 
3570
3571	if (!skb) {
3572		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3573		if (skb)
3574			napi->skb = skb;
 
 
3575	}
3576	return skb;
3577}
3578EXPORT_SYMBOL(napi_get_frags);
3579
3580gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3581			       gro_result_t ret)
3582{
3583	switch (ret) {
3584	case GRO_NORMAL:
3585	case GRO_HELD:
3586		skb->protocol = eth_type_trans(skb, skb->dev);
3587
3588		if (ret == GRO_HELD)
3589			skb_gro_pull(skb, -ETH_HLEN);
3590		else if (netif_receive_skb(skb))
3591			ret = GRO_DROP;
3592		break;
3593
3594	case GRO_DROP:
3595	case GRO_MERGED_FREE:
3596		napi_reuse_skb(napi, skb);
3597		break;
3598
3599	case GRO_MERGED:
3600		break;
3601	}
 
 
 
3602
3603	return ret;
 
 
 
 
 
 
 
3604}
3605EXPORT_SYMBOL(napi_frags_finish);
3606
3607static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3608{
3609	struct sk_buff *skb = napi->skb;
3610	struct ethhdr *eth;
3611	unsigned int hlen;
3612	unsigned int off;
3613
3614	napi->skb = NULL;
 
 
 
 
3615
3616	skb_reset_mac_header(skb);
3617	skb_gro_reset_offset(skb);
3618
3619	off = skb_gro_offset(skb);
3620	hlen = off + sizeof(*eth);
3621	eth = skb_gro_header_fast(skb, off);
3622	if (skb_gro_header_hard(skb, hlen)) {
3623		eth = skb_gro_header_slow(skb, hlen, off);
3624		if (unlikely(!eth)) {
3625			napi_reuse_skb(napi, skb);
3626			skb = NULL;
3627			goto out;
3628		}
3629	}
3630
3631	skb_gro_pull(skb, sizeof(*eth));
3632
3633	/*
3634	 * This works because the only protocols we care about don't require
3635	 * special handling.  We'll fix it up properly at the end.
3636	 */
3637	skb->protocol = eth->h_proto;
 
3638
3639out:
3640	return skb;
3641}
3642
3643gro_result_t napi_gro_frags(struct napi_struct *napi)
3644{
3645	struct sk_buff *skb = napi_frags_skb(napi);
3646
3647	if (!skb)
3648		return GRO_DROP;
3649
3650	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
 
 
 
 
3651}
3652EXPORT_SYMBOL(napi_gro_frags);
3653
/*
 * net_rps_action_and_irq_enable - send any pending RPS IPIs
 * @sd: this CPU's softnet data
 *
 * With CONFIG_RPS, detaches sd->rps_ipi_list and, once interrupts are
 * enabled again, calls __smp_call_function_single() for each entry whose
 * CPU is online.
 *
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remote = sd->rps_ipi_list;

	if (remote)
		sd->rps_ipi_list = NULL;
#endif

	local_irq_enable();

#ifdef CONFIG_RPS
	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (remote) {
		/* Fetch the link before the IPI is sent. */
		struct softnet_data *following = remote->rps_ipi_next;

		if (cpu_online(remote->cpu))
			__smp_call_function_single(remote->cpu,
						   &remote->csd, 0);
		remote = following;
	}
#endif
}
3681
 
 
 
 
 
 
 
 
 
3682static int process_backlog(struct napi_struct *napi, int quota)
3683{
3684	int work = 0;
3685	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 
 
3686
3687#ifdef CONFIG_RPS
3688	/* Check if we have pending ipi, its better to send them now,
3689	 * not waiting net_rx_action() end.
3690	 */
3691	if (sd->rps_ipi_list) {
3692		local_irq_disable();
3693		net_rps_action_and_irq_enable(sd);
3694	}
3695#endif
3696	napi->weight = weight_p;
3697	local_irq_disable();
3698	while (work < quota) {
3699		struct sk_buff *skb;
3700		unsigned int qlen;
3701
3702		while ((skb = __skb_dequeue(&sd->process_queue))) {
3703			local_irq_enable();
3704			__netif_receive_skb(skb);
3705			local_irq_disable();
3706			input_queue_head_incr(sd);
3707			if (++work >= quota) {
3708				local_irq_enable();
3709				return work;
3710			}
3711		}
3712
3713		rps_lock(sd);
3714		qlen = skb_queue_len(&sd->input_pkt_queue);
3715		if (qlen)
3716			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3717						   &sd->process_queue);
3718
3719		if (qlen < quota - work) {
 
3720			/*
3721			 * Inline a custom version of __napi_complete().
3722			 * only current cpu owns and manipulates this napi,
3723			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3724			 * we can use a plain write instead of clear_bit(),
 
3725			 * and we dont need an smp_mb() memory barrier.
3726			 */
3727			list_del(&napi->poll_list);
3728			napi->state = 0;
3729
3730			quota = work + qlen;
 
 
3731		}
3732		rps_unlock(sd);
3733	}
3734	local_irq_enable();
3735
3736	return work;
3737}
3738
3739/**
3740 * __napi_schedule - schedule for receive
3741 * @n: entry to schedule
3742 *
3743 * The entry's receive function will be scheduled to run
 
3744 */
3745void __napi_schedule(struct napi_struct *n)
3746{
3747	unsigned long flags;
3748
3749	local_irq_save(flags);
3750	____napi_schedule(&__get_cpu_var(softnet_data), n);
3751	local_irq_restore(flags);
3752}
3753EXPORT_SYMBOL(__napi_schedule);
3754
3755void __napi_complete(struct napi_struct *n)
 
 
 
 
 
 
 
 
 
3756{
3757	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3758	BUG_ON(n->gro_list);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3759
3760	list_del(&n->poll_list);
3761	smp_mb__before_clear_bit();
3762	clear_bit(NAPI_STATE_SCHED, &n->state);
3763}
3764EXPORT_SYMBOL(__napi_complete);
3765
3766void napi_complete(struct napi_struct *n)
 
 
 
 
 
 
 
 
 
 
3767{
3768	unsigned long flags;
 
 
 
 
 
 
 
 
 
 
3769
3770	/*
3771	 * don't let napi dequeue from the cpu poll list
3772	 * just in case its running on a different cpu
 
 
3773	 */
3774	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3775		return;
 
3776
3777	napi_gro_flush(n);
3778	local_irq_save(flags);
3779	__napi_complete(n);
3780	local_irq_restore(flags);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3781}
3782EXPORT_SYMBOL(napi_complete);
3783
3784void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3785		    int (*poll)(struct napi_struct *, int), int weight)
3786{
 
 
 
3787	INIT_LIST_HEAD(&napi->poll_list);
3788	napi->gro_count = 0;
3789	napi->gro_list = NULL;
 
 
3790	napi->skb = NULL;
 
 
3791	napi->poll = poll;
 
 
 
3792	napi->weight = weight;
3793	list_add(&napi->dev_list, &dev->napi_list);
3794	napi->dev = dev;
3795#ifdef CONFIG_NETPOLL
3796	spin_lock_init(&napi->poll_lock);
3797	napi->poll_owner = -1;
3798#endif
3799	set_bit(NAPI_STATE_SCHED, &napi->state);
 
 
 
 
 
 
 
 
 
 
3800}
3801EXPORT_SYMBOL(netif_napi_add);
3802
3803void netif_napi_del(struct napi_struct *napi)
3804{
3805	struct sk_buff *skb, *next;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3806
3807	list_del_init(&napi->dev_list);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3808	napi_free_frags(napi);
3809
3810	for (skb = napi->gro_list; skb; skb = next) {
3811		next = skb->next;
3812		skb->next = NULL;
3813		kfree_skb(skb);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3814	}
3815
3816	napi->gro_list = NULL;
3817	napi->gro_count = 0;
 
3818}
3819EXPORT_SYMBOL(netif_napi_del);
3820
3821static void net_rx_action(struct softirq_action *h)
3822{
3823	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3824	unsigned long time_limit = jiffies + 2;
3825	int budget = netdev_budget;
3826	void *have;
 
3827
3828	local_irq_disable();
3829
3830	while (!list_empty(&sd->poll_list)) {
3831		struct napi_struct *n;
3832		int work, weight;
3833
3834		/* If softirq window is exhuasted then punt.
3835		 * Allow this to run for 2 jiffies since which will allow
3836		 * an average latency of 1.5/HZ.
3837		 */
3838		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3839			goto softnet_break;
3840
3841		local_irq_enable();
 
3842
3843		/* Even though interrupts have been re-enabled, this
3844		 * access is safe because interrupts can only add new
3845		 * entries to the tail of this list, and only ->poll()
3846		 * calls can remove this head entry from the list.
3847		 */
3848		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3849
3850		have = netpoll_poll_lock(n);
 
 
 
 
 
3851
3852		weight = n->weight;
3853
3854		/* This NAPI_STATE_SCHED test is for avoiding a race
3855		 * with netpoll's poll_napi().  Only the entity which
3856		 * obtains the lock and sees NAPI_STATE_SCHED set will
3857		 * actually make the ->poll() call.  Therefore we avoid
3858		 * accidentally calling ->poll() when NAPI is not scheduled.
3859		 */
3860		work = 0;
3861		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3862			work = n->poll(n, weight);
3863			trace_napi_poll(n);
3864		}
3865
3866		WARN_ON_ONCE(work > weight);
 
 
 
 
 
3867
3868		budget -= work;
 
3869
3870		local_irq_disable();
 
 
 
3871
3872		/* Drivers must not modify the NAPI state if they
3873		 * consume the entire weight.  In such cases this code
3874		 * still "owns" the NAPI instance and therefore can
3875		 * move the instance around on the list at-will.
3876		 */
3877		if (unlikely(work == weight)) {
3878			if (unlikely(napi_disable_pending(n))) {
3879				local_irq_enable();
3880				napi_complete(n);
3881				local_irq_disable();
3882			} else
3883				list_move_tail(&n->poll_list, &sd->poll_list);
 
 
 
 
3884		}
 
 
 
 
 
 
 
 
3885
3886		netpoll_poll_unlock(have);
 
 
 
 
 
 
 
 
 
 
 
 
 
3887	}
3888out:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3889	net_rps_action_and_irq_enable(sd);
 
 
3890
3891#ifdef CONFIG_NET_DMA
3892	/*
3893	 * There may not be any more sk_buffs coming right now, so push
3894	 * any pending DMA copies to hardware
3895	 */
3896	dma_issue_pending_all();
3897#endif
3898
3899	return;
 
3900
3901softnet_break:
3902	sd->time_squeeze++;
3903	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3904	goto out;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3905}
3906
3907static gifconf_func_t *gifconf_list[NPROTO];
 
 
 
 
 
 
3908
3909/**
3910 *	register_gifconf	-	register a SIOCGIF handler
3911 *	@family: Address family
3912 *	@gifconf: Function handler
3913 *
3914 *	Register protocol dependent address dumping routines. The handler
3915 *	that is passed must not be freed or reused until it has been replaced
3916 *	by another handler.
3917 */
3918int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
 
3919{
3920	if (family >= NPROTO)
3921		return -EINVAL;
3922	gifconf_list[family] = gifconf;
3923	return 0;
 
 
 
 
3924}
3925EXPORT_SYMBOL(register_gifconf);
3926
 
 
 
 
 
 
 
 
 
3927
3928/*
3929 *	Map an interface index to its name (SIOCGIFNAME)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3930 */
 
 
 
3931
3932/*
3933 *	We need this ioctl for efficient implementation of the
3934 *	if_indextoname() function required by the IPv6 API.  Without
3935 *	it, we would have to search all the interfaces to find a
3936 *	match.  --pb
 
 
 
 
 
3937 */
 
 
 
3938
3939static int dev_ifname(struct net *net, struct ifreq __user *arg)
 
 
 
 
 
 
 
 
 
 
 
 
 
3940{
3941	struct net_device *dev;
3942	struct ifreq ifr;
3943
3944	/*
3945	 *	Fetch the caller's info block.
3946	 */
3947
3948	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3949		return -EFAULT;
3950
3951	rcu_read_lock();
3952	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3953	if (!dev) {
3954		rcu_read_unlock();
3955		return -ENODEV;
3956	}
3957
3958	strcpy(ifr.ifr_name, dev->name);
3959	rcu_read_unlock();
 
 
 
 
 
 
 
 
3960
3961	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3962		return -EFAULT;
3963	return 0;
3964}
3965
3966/*
3967 *	Perform a SIOCGIFCONF call. This structure will change
3968 *	size eventually, and there is nothing I can do about it.
3969 *	Thus we will need a 'compatibility mode'.
 
 
 
 
 
 
 
 
 
 
 
 
 
3970 */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3971
3972static int dev_ifconf(struct net *net, char __user *arg)
 
 
3973{
3974	struct ifconf ifc;
3975	struct net_device *dev;
3976	char __user *pos;
3977	int len;
3978	int total;
3979	int i;
3980
3981	/*
3982	 *	Fetch the caller's info block.
3983	 */
3984
3985	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3986		return -EFAULT;
3987
3988	pos = ifc.ifc_buf;
3989	len = ifc.ifc_len;
3990
3991	/*
3992	 *	Loop over the interfaces, and write an info block for each.
3993	 */
3994
3995	total = 0;
3996	for_each_netdev(net, dev) {
3997		for (i = 0; i < NPROTO; i++) {
3998			if (gifconf_list[i]) {
3999				int done;
4000				if (!pos)
4001					done = gifconf_list[i](dev, NULL, 0);
4002				else
4003					done = gifconf_list[i](dev, pos + total,
4004							       len - total);
4005				if (done < 0)
4006					return -EFAULT;
4007				total += done;
4008			}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4009		}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4010	}
4011
4012	/*
4013	 *	All done.  Write the updated control block back to the caller.
4014	 */
4015	ifc.ifc_len = total;
4016
4017	/*
4018	 * 	Both BSD and Solaris return 0 here, so we do too.
4019	 */
4020	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4021}
 
4022
4023#ifdef CONFIG_PROC_FS
 
 
 
 
 
 
4024
/*
 * A seq_file position packs a hash bucket number in the bits above
 * BUCKET_SPACE and an offset within that bucket in the bits below it.
 */
#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)

/* Extract the bucket number / the in-bucket offset from a position. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
/* Build a position from bucket @b and offset @o. */
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
4030
4031static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
 
 
 
 
 
 
 
 
 
 
 
 
4032{
4033	struct net *net = seq_file_net(seq);
4034	struct net_device *dev;
4035	struct hlist_node *p;
4036	struct hlist_head *h;
4037	unsigned int count = 0, offset = get_offset(*pos);
4038
4039	h = &net->dev_name_head[get_bucket(*pos)];
4040	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4041		if (++count == offset)
4042			return dev;
4043	}
4044
4045	return NULL;
 
 
 
 
 
 
 
4046}
 
4047
4048static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
 
 
 
 
 
 
 
 
 
 
 
4049{
4050	struct net_device *dev;
4051	unsigned int bucket;
4052
4053	do {
4054		dev = dev_from_same_bucket(seq, pos);
4055		if (dev)
4056			return dev;
4057
4058		bucket = get_bucket(*pos) + 1;
4059		*pos = set_bucket_offset(bucket, 1);
4060	} while (bucket < NETDEV_HASHENTRIES);
4061
4062	return NULL;
 
 
 
 
 
4063}
 
4064
4065/*
4066 *	This is invoked by the /proc filesystem handler to display a device
4067 *	in detail.
 
 
 
 
 
 
 
4068 */
4069void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4070	__acquires(RCU)
4071{
4072	rcu_read_lock();
4073	if (!*pos)
4074		return SEQ_START_TOKEN;
4075
4076	if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
 
 
4077		return NULL;
4078
4079	return dev_from_bucket(seq, pos);
 
 
4080}
 
4081
4082void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
4083{
4084	++*pos;
4085	return dev_from_bucket(seq, pos);
 
 
 
 
 
 
 
 
4086}
4087
4088void dev_seq_stop(struct seq_file *seq, void *v)
4089	__releases(RCU)
 
4090{
4091	rcu_read_unlock();
 
 
 
 
 
 
 
 
 
 
4092}
4093
4094static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
 
 
 
4095{
4096	struct rtnl_link_stats64 temp;
4097	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4098
4099	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4100		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4101		   dev->name, stats->rx_bytes, stats->rx_packets,
4102		   stats->rx_errors,
4103		   stats->rx_dropped + stats->rx_missed_errors,
4104		   stats->rx_fifo_errors,
4105		   stats->rx_length_errors + stats->rx_over_errors +
4106		    stats->rx_crc_errors + stats->rx_frame_errors,
4107		   stats->rx_compressed, stats->multicast,
4108		   stats->tx_bytes, stats->tx_packets,
4109		   stats->tx_errors, stats->tx_dropped,
4110		   stats->tx_fifo_errors, stats->collisions,
4111		   stats->tx_carrier_errors +
4112		    stats->tx_aborted_errors +
4113		    stats->tx_window_errors +
4114		    stats->tx_heartbeat_errors,
4115		   stats->tx_compressed);
4116}
 
4117
4118/*
4119 *	Called from the PROCfs module. This now uses the new arbitrary sized
4120 *	/proc/net interface to create /proc/net/dev
4121 */
4122static int dev_seq_show(struct seq_file *seq, void *v)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4123{
4124	if (v == SEQ_START_TOKEN)
4125		seq_puts(seq, "Inter-|   Receive                            "
4126			      "                    |  Transmit\n"
4127			      " face |bytes    packets errs drop fifo frame "
4128			      "compressed multicast|bytes    packets errs "
4129			      "drop fifo colls carrier compressed\n");
4130	else
4131		dev_seq_printf_stats(seq, v);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4132	return 0;
4133}
4134
4135static struct softnet_data *softnet_get_online(loff_t *pos)
 
 
 
 
 
 
 
 
 
 
 
4136{
4137	struct softnet_data *sd = NULL;
 
 
 
 
 
 
 
 
 
 
 
 
4138
4139	while (*pos < nr_cpu_ids)
4140		if (cpu_online(*pos)) {
4141			sd = &per_cpu(softnet_data, *pos);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4142			break;
4143		} else
4144			++*pos;
4145	return sd;
 
 
 
 
 
 
 
 
 
 
 
4146}
 
4147
4148static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
 
 
 
 
 
 
 
 
 
4149{
4150	return softnet_get_online(pos);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4151}
 
4152
4153static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 
4154{
4155	++*pos;
4156	return softnet_get_online(pos);
 
 
 
 
4157}
 
 
 
 
 
4158
/*
 *	seq_file ->stop handler for the softnet statistics file.  Nothing
 *	is acquired in ->start, so there is nothing to release here.
 */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4162
4163static int softnet_seq_show(struct seq_file *seq, void *v)
 
 
 
4164{
4165	struct softnet_data *sd = v;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4166
4167	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4168		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4169		   0, 0, 0, 0, /* was fastroute */
4170		   sd->cpu_collision, sd->received_rps);
4171	return 0;
 
 
 
 
 
 
 
 
 
4172}
4173
4174static const struct seq_operations dev_seq_ops = {
4175	.start = dev_seq_start,
4176	.next  = dev_seq_next,
4177	.stop  = dev_seq_stop,
4178	.show  = dev_seq_show,
4179};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4180
4181static int dev_seq_open(struct inode *inode, struct file *file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4182{
4183	return seq_open_net(inode, file, &dev_seq_ops,
4184			    sizeof(struct seq_net_private));
 
 
 
 
4185}
 
4186
4187static const struct file_operations dev_seq_fops = {
4188	.owner	 = THIS_MODULE,
4189	.open    = dev_seq_open,
4190	.read    = seq_read,
4191	.llseek  = seq_lseek,
4192	.release = seq_release_net,
4193};
4194
4195static const struct seq_operations softnet_seq_ops = {
4196	.start = softnet_seq_start,
4197	.next  = softnet_seq_next,
4198	.stop  = softnet_seq_stop,
4199	.show  = softnet_seq_show,
4200};
4201
4202static int softnet_seq_open(struct inode *inode, struct file *file)
 
 
 
 
 
 
4203{
4204	return seq_open(file, &softnet_seq_ops);
4205}
4206
4207static const struct file_operations softnet_seq_fops = {
4208	.owner	 = THIS_MODULE,
4209	.open    = softnet_seq_open,
4210	.read    = seq_read,
4211	.llseek  = seq_lseek,
4212	.release = seq_release,
4213};
4214
4215static void *ptype_get_idx(loff_t pos)
 
 
 
4216{
4217	struct packet_type *pt = NULL;
4218	loff_t i = 0;
4219	int t;
4220
4221	list_for_each_entry_rcu(pt, &ptype_all, list) {
4222		if (i == pos)
4223			return pt;
4224		++i;
4225	}
4226
4227	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4228		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4229			if (i == pos)
4230				return pt;
4231			++i;
4232		}
 
4233	}
4234	return NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4235}
 
4236
4237static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4238	__acquires(RCU)
 
4239{
4240	rcu_read_lock();
4241	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
 
 
 
 
 
 
 
 
 
 
4242}
 
4243
4244static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 
 
 
 
 
 
 
 
4245{
4246	struct packet_type *pt;
4247	struct list_head *nxt;
4248	int hash;
4249
4250	++*pos;
4251	if (v == SEQ_START_TOKEN)
4252		return ptype_get_idx(0);
4253
4254	pt = v;
4255	nxt = pt->list.next;
4256	if (pt->type == htons(ETH_P_ALL)) {
4257		if (nxt != &ptype_all)
4258			goto found;
4259		hash = 0;
4260		nxt = ptype_base[0].next;
4261	} else
4262		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 
 
 
 
 
 
 
 
 
4263
4264	while (nxt == &ptype_base[hash]) {
4265		if (++hash >= PTYPE_HASH_SIZE)
4266			return NULL;
4267		nxt = ptype_base[hash].next;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4268	}
4269found:
4270	return list_entry(nxt, struct packet_type, list);
 
4271}
 
4272
4273static void ptype_seq_stop(struct seq_file *seq, void *v)
4274	__releases(RCU)
4275{
4276	rcu_read_unlock();
 
 
 
 
 
 
 
 
4277}
4278
4279static int ptype_seq_show(struct seq_file *seq, void *v)
 
4280{
4281	struct packet_type *pt = v;
4282
4283	if (v == SEQ_START_TOKEN)
4284		seq_puts(seq, "Type Device      Function\n");
4285	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4286		if (pt->type == htons(ETH_P_ALL))
4287			seq_puts(seq, "ALL ");
4288		else
4289			seq_printf(seq, "%04x", ntohs(pt->type));
4290
4291		seq_printf(seq, " %-8s %pF\n",
4292			   pt->dev ? pt->dev->name : "", pt->func);
 
 
4293	}
4294
4295	return 0;
 
4296}
 
4297
4298static const struct seq_operations ptype_seq_ops = {
4299	.start = ptype_seq_start,
4300	.next  = ptype_seq_next,
4301	.stop  = ptype_seq_stop,
4302	.show  = ptype_seq_show,
4303};
 
 
 
 
 
 
 
 
 
 
 
4304
4305static int ptype_seq_open(struct inode *inode, struct file *file)
 
4306{
4307	return seq_open_net(inode, file, &ptype_seq_ops,
4308			sizeof(struct seq_net_private));
 
4309}
 
4310
4311static const struct file_operations ptype_seq_fops = {
4312	.owner	 = THIS_MODULE,
4313	.open    = ptype_seq_open,
4314	.read    = seq_read,
4315	.llseek  = seq_lseek,
4316	.release = seq_release_net,
4317};
4318
 
 
 
 
4319
4320static int __net_init dev_proc_net_init(struct net *net)
 
4321{
4322	int rc = -ENOMEM;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4323
4324	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4325		goto out;
4326	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4327		goto out_dev;
4328	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4329		goto out_softnet;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4330
4331	if (wext_proc_init(net))
4332		goto out_ptype;
4333	rc = 0;
4334out:
4335	return rc;
4336out_ptype:
4337	proc_net_remove(net, "ptype");
4338out_softnet:
4339	proc_net_remove(net, "softnet_stat");
4340out_dev:
4341	proc_net_remove(net, "dev");
4342	goto out;
 
 
 
 
 
4343}
4344
4345static void __net_exit dev_proc_net_exit(struct net *net)
 
 
 
4346{
4347	wext_proc_exit(net);
4348
4349	proc_net_remove(net, "ptype");
4350	proc_net_remove(net, "softnet_stat");
4351	proc_net_remove(net, "dev");
 
 
 
4352}
 
4353
4354static struct pernet_operations __net_initdata dev_proc_ops = {
4355	.init = dev_proc_net_init,
4356	.exit = dev_proc_net_exit,
4357};
 
 
 
 
4358
4359static int __init dev_proc_init(void)
 
4360{
4361	return register_pernet_subsys(&dev_proc_ops);
4362}
4363#else
4364#define dev_proc_init() 0
4365#endif	/* CONFIG_PROC_FS */
 
 
 
 
 
 
4366
 
 
 
 
 
 
 
4367
4368/**
4369 *	netdev_set_master	-	set up master pointer
4370 *	@slave: slave device
4371 *	@master: new master device
 
4372 *
4373 *	Changes the master device of the slave. Pass %NULL to break the
4374 *	bonding. The caller must hold the RTNL semaphore. On a failure
4375 *	a negative errno code is returned. On success the reference counts
4376 *	are adjusted and the function returns zero.
4377 */
4378int netdev_set_master(struct net_device *slave, struct net_device *master)
 
 
 
4379{
4380	struct net_device *old = slave->master;
4381
4382	ASSERT_RTNL();
 
 
 
 
4383
4384	if (master) {
4385		if (old)
4386			return -EBUSY;
4387		dev_hold(master);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4388	}
4389
4390	slave->master = master;
 
 
 
 
 
 
4391
4392	if (old)
4393		dev_put(old);
4394	return 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4395}
4396EXPORT_SYMBOL(netdev_set_master);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4397
4398/**
4399 *	netdev_set_bond_master	-	set up bonding master/slave pair
4400 *	@slave: slave device
4401 *	@master: new master device
4402 *
4403 *	Changes the master device of the slave. Pass %NULL to break the
4404 *	bonding. The caller must hold the RTNL semaphore. On a failure
4405 *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4406 *	to the routing socket and the function returns zero.
4407 */
4408int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
 
4409{
4410	int err;
 
 
4411
4412	ASSERT_RTNL();
4413
4414	err = netdev_set_master(slave, master);
4415	if (err)
4416		return err;
4417	if (master)
4418		slave->flags |= IFF_SLAVE;
4419	else
4420		slave->flags &= ~IFF_SLAVE;
4421
4422	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4423	return 0;
4424}
4425EXPORT_SYMBOL(netdev_set_bond_master);
4426
4427static void dev_change_rx_flags(struct net_device *dev, int flags)
4428{
4429	const struct net_device_ops *ops = dev->netdev_ops;
4430
4431	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4432		ops->ndo_change_rx_flags(dev, flags);
4433}
4434
4435static int __dev_set_promiscuity(struct net_device *dev, int inc)
4436{
4437	unsigned int old_flags = dev->flags;
4438	uid_t uid;
4439	gid_t gid;
4440
4441	ASSERT_RTNL();
4442
4443	dev->flags |= IFF_PROMISC;
4444	dev->promiscuity += inc;
4445	if (dev->promiscuity == 0) {
4446		/*
4447		 * Avoid overflow.
4448		 * If inc causes overflow, untouch promisc and return error.
4449		 */
4450		if (inc < 0)
4451			dev->flags &= ~IFF_PROMISC;
4452		else {
4453			dev->promiscuity -= inc;
4454			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4455				dev->name);
4456			return -EOVERFLOW;
4457		}
4458	}
4459	if (dev->flags != old_flags) {
4460		pr_info("device %s %s promiscuous mode\n",
4461			dev->name,
4462			dev->flags & IFF_PROMISC ? "entered" : "left");
4463		if (audit_enabled) {
4464			current_uid_gid(&uid, &gid);
4465			audit_log(current->audit_context, GFP_ATOMIC,
4466				AUDIT_ANOM_PROMISCUOUS,
4467				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4468				dev->name, (dev->flags & IFF_PROMISC),
4469				(old_flags & IFF_PROMISC),
4470				audit_get_loginuid(current),
4471				uid, gid,
4472				audit_get_sessionid(current));
 
4473		}
4474
4475		dev_change_rx_flags(dev, IFF_PROMISC);
4476	}
 
 
4477	return 0;
4478}
4479
4480/**
4481 *	dev_set_promiscuity	- update promiscuity count on a device
4482 *	@dev: device
4483 *	@inc: modifier
4484 *
4485 *	Add or remove promiscuity from a device. While the count in the device
4486 *	remains above zero the interface remains promiscuous. Once it hits zero
4487 *	the device reverts back to normal filtering operation. A negative inc
4488 *	value is used to drop promiscuity on the device.
4489 *	Return 0 if successful or a negative errno code on error.
4490 */
4491int dev_set_promiscuity(struct net_device *dev, int inc)
4492{
4493	unsigned int old_flags = dev->flags;
4494	int err;
4495
4496	err = __dev_set_promiscuity(dev, inc);
4497	if (err < 0)
4498		return err;
4499	if (dev->flags != old_flags)
4500		dev_set_rx_mode(dev);
4501	return err;
4502}
4503EXPORT_SYMBOL(dev_set_promiscuity);
4504
4505/**
4506 *	dev_set_allmulti	- update allmulti count on a device
4507 *	@dev: device
4508 *	@inc: modifier
4509 *
4510 *	Add or remove reception of all multicast frames to a device. While the
4511 *	count in the device remains above zero the interface remains listening
4512 *	to all interfaces. Once it hits zero the device reverts back to normal
4513 *	filtering operation. A negative @inc value is used to drop the counter
4514 *	when releasing a resource needing all multicasts.
4515 *	Return 0 if successful or a negative errno code on error.
4516 */
4517
4518int dev_set_allmulti(struct net_device *dev, int inc)
4519{
4520	unsigned int old_flags = dev->flags;
4521
4522	ASSERT_RTNL();
4523
4524	dev->flags |= IFF_ALLMULTI;
4525	dev->allmulti += inc;
4526	if (dev->allmulti == 0) {
4527		/*
4528		 * Avoid overflow.
4529		 * If inc causes overflow, untouch allmulti and return error.
4530		 */
4531		if (inc < 0)
4532			dev->flags &= ~IFF_ALLMULTI;
4533		else {
4534			dev->allmulti -= inc;
4535			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4536				dev->name);
4537			return -EOVERFLOW;
4538		}
4539	}
4540	if (dev->flags ^ old_flags) {
4541		dev_change_rx_flags(dev, IFF_ALLMULTI);
4542		dev_set_rx_mode(dev);
 
 
 
4543	}
4544	return 0;
4545}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4546EXPORT_SYMBOL(dev_set_allmulti);
4547
4548/*
4549 *	Upload unicast and multicast address lists to device and
4550 *	configure RX filtering. When the device doesn't support unicast
4551 *	filtering it is put in promiscuous mode while unicast addresses
4552 *	are present.
4553 */
4554void __dev_set_rx_mode(struct net_device *dev)
4555{
4556	const struct net_device_ops *ops = dev->netdev_ops;
4557
4558	/* dev_open will call this function so the list will stay sane. */
4559	if (!(dev->flags&IFF_UP))
4560		return;
4561
4562	if (!netif_device_present(dev))
4563		return;
4564
4565	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4566		/* Unicast addresses changes may only happen under the rtnl,
4567		 * therefore calling __dev_set_promiscuity here is safe.
4568		 */
4569		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4570			__dev_set_promiscuity(dev, 1);
4571			dev->uc_promisc = true;
4572		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4573			__dev_set_promiscuity(dev, -1);
4574			dev->uc_promisc = false;
4575		}
4576	}
4577
4578	if (ops->ndo_set_rx_mode)
4579		ops->ndo_set_rx_mode(dev);
4580}
4581
/*
 *	Locked wrapper: run __dev_set_rx_mode() with the device's address
 *	list lock held (taken via netif_addr_lock_bh()).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4588
4589/**
4590 *	dev_get_flags - get flags reported to userspace
4591 *	@dev: device
4592 *
4593 *	Get the combination of flag bits exported through APIs to userspace.
4594 */
4595unsigned int dev_get_flags(const struct net_device *dev)
4596{
4597	unsigned int flags;
4598
4599	flags = (dev->flags & ~(IFF_PROMISC |
4600				IFF_ALLMULTI |
4601				IFF_RUNNING |
4602				IFF_LOWER_UP |
4603				IFF_DORMANT)) |
4604		(dev->gflags & (IFF_PROMISC |
4605				IFF_ALLMULTI));
4606
4607	if (netif_running(dev)) {
4608		if (netif_oper_up(dev))
4609			flags |= IFF_RUNNING;
4610		if (netif_carrier_ok(dev))
4611			flags |= IFF_LOWER_UP;
4612		if (netif_dormant(dev))
4613			flags |= IFF_DORMANT;
4614	}
4615
4616	return flags;
4617}
4618EXPORT_SYMBOL(dev_get_flags);
4619
4620int __dev_change_flags(struct net_device *dev, unsigned int flags)
 
4621{
4622	unsigned int old_flags = dev->flags;
4623	int ret;
4624
4625	ASSERT_RTNL();
4626
4627	/*
4628	 *	Set the flags on our device.
4629	 */
4630
4631	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4632			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4633			       IFF_AUTOMEDIA)) |
4634		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4635				    IFF_ALLMULTI));
4636
4637	/*
4638	 *	Load in the correct multicast list now the flags have changed.
4639	 */
4640
4641	if ((old_flags ^ flags) & IFF_MULTICAST)
4642		dev_change_rx_flags(dev, IFF_MULTICAST);
4643
4644	dev_set_rx_mode(dev);
4645
4646	/*
4647	 *	Have we downed the interface. We handle IFF_UP ourselves
4648	 *	according to user attempts to set it, rather than blindly
4649	 *	setting it.
4650	 */
4651
4652	ret = 0;
4653	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4654		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4655
4656		if (!ret)
4657			dev_set_rx_mode(dev);
4658	}
4659
4660	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4661		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 
4662
4663		dev->gflags ^= IFF_PROMISC;
4664		dev_set_promiscuity(dev, inc);
 
 
 
4665	}
4666
4667	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4668	   is important. Some (broken) drivers set IFF_PROMISC, when
4669	   IFF_ALLMULTI is requested not asking us and not reporting.
4670	 */
4671	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4672		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4673
4674		dev->gflags ^= IFF_ALLMULTI;
4675		dev_set_allmulti(dev, inc);
4676	}
4677
4678	return ret;
4679}
4680
4681void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
 
 
4682{
4683	unsigned int changes = dev->flags ^ old_flags;
4684
 
 
 
4685	if (changes & IFF_UP) {
4686		if (dev->flags & IFF_UP)
4687			call_netdevice_notifiers(NETDEV_UP, dev);
4688		else
4689			call_netdevice_notifiers(NETDEV_DOWN, dev);
4690	}
4691
4692	if (dev->flags & IFF_UP &&
4693	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4694		call_netdevice_notifiers(NETDEV_CHANGE, dev);
 
 
 
 
 
 
 
 
4695}
4696
4697/**
4698 *	dev_change_flags - change device settings
4699 *	@dev: device
4700 *	@flags: device state flags
 
4701 *
4702 *	Change settings on device based state flags. The flags are
4703 *	in the userspace exported format.
4704 */
4705int dev_change_flags(struct net_device *dev, unsigned int flags)
 
4706{
4707	int ret;
4708	unsigned int changes, old_flags = dev->flags;
4709
4710	ret = __dev_change_flags(dev, flags);
4711	if (ret < 0)
4712		return ret;
4713
4714	changes = old_flags ^ dev->flags;
4715	if (changes)
4716		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4717
4718	__dev_notify_flags(dev, old_flags);
4719	return ret;
4720}
4721EXPORT_SYMBOL(dev_change_flags);
4722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4723/**
4724 *	dev_set_mtu - Change maximum transfer unit
4725 *	@dev: device
4726 *	@new_mtu: new transfer unit
 
4727 *
4728 *	Change the maximum transfer size of the network device.
4729 */
4730int dev_set_mtu(struct net_device *dev, int new_mtu)
 
4731{
4732	const struct net_device_ops *ops = dev->netdev_ops;
4733	int err;
4734
4735	if (new_mtu == dev->mtu)
4736		return 0;
4737
4738	/*	MTU must be positive.	 */
4739	if (new_mtu < 0)
4740		return -EINVAL;
4741
4742	if (!netif_device_present(dev))
4743		return -ENODEV;
4744
4745	err = 0;
4746	if (ops->ndo_change_mtu)
4747		err = ops->ndo_change_mtu(dev, new_mtu);
4748	else
4749		dev->mtu = new_mtu;
 
 
4750
4751	if (!err && dev->flags & IFF_UP)
4752		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4753	return err;
4754}
4755EXPORT_SYMBOL(dev_set_mtu);
4756
4757/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4758 *	dev_set_group - Change group this device belongs to
4759 *	@dev: device
4760 *	@new_group: group this device should belong to
4761 */
4762void dev_set_group(struct net_device *dev, int new_group)
4763{
4764	dev->group = new_group;
4765}
4766EXPORT_SYMBOL(dev_set_group);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4767
4768/**
4769 *	dev_set_mac_address - Change Media Access Control Address
4770 *	@dev: device
4771 *	@sa: new address
 
4772 *
4773 *	Change the hardware (MAC) address of the device
4774 */
4775int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 
4776{
4777	const struct net_device_ops *ops = dev->netdev_ops;
4778	int err;
4779
4780	if (!ops->ndo_set_mac_address)
4781		return -EOPNOTSUPP;
4782	if (sa->sa_family != dev->type)
4783		return -EINVAL;
4784	if (!netif_device_present(dev))
4785		return -ENODEV;
 
 
 
4786	err = ops->ndo_set_mac_address(dev, sa);
4787	if (!err)
4788		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 
 
4789	add_device_randomness(dev->dev_addr, dev->addr_len);
4790	return err;
4791}
4792EXPORT_SYMBOL(dev_set_mac_address);
4793
4794/*
4795 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4796 */
4797static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4798{
4799	int err;
4800	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4801
4802	if (!dev)
4803		return -ENODEV;
 
 
 
 
4804
4805	switch (cmd) {
4806	case SIOCGIFFLAGS:	/* Get interface flags */
4807		ifr->ifr_flags = (short) dev_get_flags(dev);
4808		return 0;
 
4809
4810	case SIOCGIFMETRIC:	/* Get the metric on the interface
4811				   (currently unused) */
4812		ifr->ifr_metric = 0;
4813		return 0;
4814
4815	case SIOCGIFMTU:	/* Get the MTU of a device */
4816		ifr->ifr_mtu = dev->mtu;
4817		return 0;
 
 
 
 
 
 
 
 
4818
4819	case SIOCGIFHWADDR:
4820		if (!dev->addr_len)
4821			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4822		else
4823			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4824			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4825		ifr->ifr_hwaddr.sa_family = dev->type;
4826		return 0;
4827
4828	case SIOCGIFSLAVE:
4829		err = -EINVAL;
4830		break;
 
 
 
 
 
 
 
4831
4832	case SIOCGIFMAP:
4833		ifr->ifr_map.mem_start = dev->mem_start;
4834		ifr->ifr_map.mem_end   = dev->mem_end;
4835		ifr->ifr_map.base_addr = dev->base_addr;
4836		ifr->ifr_map.irq       = dev->irq;
4837		ifr->ifr_map.dma       = dev->dma;
4838		ifr->ifr_map.port      = dev->if_port;
4839		return 0;
4840
4841	case SIOCGIFINDEX:
4842		ifr->ifr_ifindex = dev->ifindex;
4843		return 0;
 
 
 
 
 
 
 
 
4844
4845	case SIOCGIFTXQLEN:
4846		ifr->ifr_qlen = dev->tx_queue_len;
4847		return 0;
 
4848
4849	default:
4850		/* dev_ioctl() should ensure this case
4851		 * is never reached
4852		 */
4853		WARN_ON(1);
4854		err = -ENOTTY;
4855		break;
 
 
 
 
 
 
4856
 
 
 
 
4857	}
4858	return err;
4859}
4860
4861/*
4862 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
 
 
 
 
 
4863 */
4864static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 
 
4865{
 
 
 
 
4866	int err;
4867	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4868	const struct net_device_ops *ops;
4869
4870	if (!dev)
4871		return -ENODEV;
 
 
 
4872
4873	ops = dev->netdev_ops;
 
 
4874
4875	switch (cmd) {
4876	case SIOCSIFFLAGS:	/* Set interface flags */
4877		return dev_change_flags(dev, ifr->ifr_flags);
 
 
 
 
 
 
4878
4879	case SIOCSIFMETRIC:	/* Set the metric on the interface
4880				   (currently unused) */
4881		return -EOPNOTSUPP;
4882
4883	case SIOCSIFMTU:	/* Set the MTU of a device */
4884		return dev_set_mtu(dev, ifr->ifr_mtu);
 
 
 
 
 
 
 
 
4885
4886	case SIOCSIFHWADDR:
4887		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
 
4888
4889	case SIOCSIFHWBROADCAST:
4890		if (ifr->ifr_hwaddr.sa_family != dev->type)
4891			return -EINVAL;
4892		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4893		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4894		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4895		return 0;
4896
4897	case SIOCSIFMAP:
4898		if (ops->ndo_set_config) {
4899			if (!netif_device_present(dev))
4900				return -ENODEV;
4901			return ops->ndo_set_config(dev, &ifr->ifr_map);
4902		}
 
 
 
4903		return -EOPNOTSUPP;
 
 
 
 
 
 
 
 
 
4904
4905	case SIOCADDMULTI:
4906		if (!ops->ndo_set_rx_mode ||
4907		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4908			return -EINVAL;
4909		if (!netif_device_present(dev))
4910			return -ENODEV;
4911		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
 
 
 
 
4912
4913	case SIOCDELMULTI:
4914		if (!ops->ndo_set_rx_mode ||
4915		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4916			return -EINVAL;
4917		if (!netif_device_present(dev))
4918			return -ENODEV;
4919		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
 
 
 
 
4920
4921	case SIOCSIFTXQLEN:
4922		if (ifr->ifr_qlen < 0)
4923			return -EINVAL;
4924		dev->tx_queue_len = ifr->ifr_qlen;
4925		return 0;
4926
4927	case SIOCSIFNAME:
4928		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4929		return dev_change_name(dev, ifr->ifr_newname);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4930
4931	case SIOCSHWTSTAMP:
4932		err = net_hwtstamp_validate(ifr);
4933		if (err)
4934			return err;
4935		/* fall through */
4936
4937	/*
4938	 *	Unknown or private ioctl
4939	 */
4940	default:
4941		if ((cmd >= SIOCDEVPRIVATE &&
4942		    cmd <= SIOCDEVPRIVATE + 15) ||
4943		    cmd == SIOCBONDENSLAVE ||
4944		    cmd == SIOCBONDRELEASE ||
4945		    cmd == SIOCBONDSETHWADDR ||
4946		    cmd == SIOCBONDSLAVEINFOQUERY ||
4947		    cmd == SIOCBONDINFOQUERY ||
4948		    cmd == SIOCBONDCHANGEACTIVE ||
4949		    cmd == SIOCGMIIPHY ||
4950		    cmd == SIOCGMIIREG ||
4951		    cmd == SIOCSMIIREG ||
4952		    cmd == SIOCBRADDIF ||
4953		    cmd == SIOCBRDELIF ||
4954		    cmd == SIOCSHWTSTAMP ||
4955		    cmd == SIOCWANDEV) {
4956			err = -EOPNOTSUPP;
4957			if (ops->ndo_do_ioctl) {
4958				if (netif_device_present(dev))
4959					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4960				else
4961					err = -ENODEV;
4962			}
4963		} else
4964			err = -EINVAL;
4965
4966	}
4967	return err;
 
4968}
4969
4970/*
4971 *	This function handles all "interface"-type I/O control requests. The actual
4972 *	'doing' part of this is dev_ifsioc above.
4973 */
4974
4975/**
4976 *	dev_ioctl	-	network device ioctl
4977 *	@net: the applicable net namespace
4978 *	@cmd: command to issue
4979 *	@arg: pointer to a struct ifreq in user space
4980 *
4981 *	Issue ioctl functions to devices. This is normally called by the
4982 *	user space syscall interfaces but can sometimes be useful for
4983 *	other purposes. The return value is the return from the syscall if
4984 *	positive or a negative errno code on error.
4985 */
4986
4987int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4988{
4989	struct ifreq ifr;
4990	int ret;
4991	char *colon;
4992
4993	/* One special case: SIOCGIFCONF takes ifconf argument
4994	   and requires shared lock, because it sleeps writing
4995	   to user space.
4996	 */
4997
4998	if (cmd == SIOCGIFCONF) {
4999		rtnl_lock();
5000		ret = dev_ifconf(net, (char __user *) arg);
5001		rtnl_unlock();
5002		return ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5003	}
5004	if (cmd == SIOCGIFNAME)
5005		return dev_ifname(net, (struct ifreq __user *)arg);
5006
5007	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5008		return -EFAULT;
5009
5010	ifr.ifr_name[IFNAMSIZ-1] = 0;
 
5011
5012	colon = strchr(ifr.ifr_name, ':');
5013	if (colon)
5014		*colon = 0;
 
 
 
5015
5016	/*
5017	 *	See which interface the caller is talking about.
5018	 */
5019
5020	switch (cmd) {
5021	/*
5022	 *	These ioctl calls:
5023	 *	- can be done by all.
5024	 *	- atomic and do not require locking.
5025	 *	- return a value
5026	 */
5027	case SIOCGIFFLAGS:
5028	case SIOCGIFMETRIC:
5029	case SIOCGIFMTU:
5030	case SIOCGIFHWADDR:
5031	case SIOCGIFSLAVE:
5032	case SIOCGIFMAP:
5033	case SIOCGIFINDEX:
5034	case SIOCGIFTXQLEN:
5035		dev_load(net, ifr.ifr_name);
5036		rcu_read_lock();
5037		ret = dev_ifsioc_locked(net, &ifr, cmd);
5038		rcu_read_unlock();
5039		if (!ret) {
5040			if (colon)
5041				*colon = ':';
5042			if (copy_to_user(arg, &ifr,
5043					 sizeof(struct ifreq)))
5044				ret = -EFAULT;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5045		}
5046		return ret;
5047
5048	case SIOCETHTOOL:
5049		dev_load(net, ifr.ifr_name);
5050		rtnl_lock();
5051		ret = dev_ethtool(net, &ifr);
5052		rtnl_unlock();
5053		if (!ret) {
5054			if (colon)
5055				*colon = ':';
5056			if (copy_to_user(arg, &ifr,
5057					 sizeof(struct ifreq)))
5058				ret = -EFAULT;
 
 
 
 
 
 
 
 
 
 
 
 
5059		}
5060		return ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5061
5062	/*
5063	 *	These ioctl calls:
5064	 *	- require superuser power.
5065	 *	- require strict serialization.
5066	 *	- return a value
5067	 */
5068	case SIOCGMIIPHY:
5069	case SIOCGMIIREG:
5070	case SIOCSIFNAME:
5071		if (!capable(CAP_NET_ADMIN))
5072			return -EPERM;
5073		dev_load(net, ifr.ifr_name);
5074		rtnl_lock();
5075		ret = dev_ifsioc(net, &ifr, cmd);
5076		rtnl_unlock();
5077		if (!ret) {
5078			if (colon)
5079				*colon = ':';
5080			if (copy_to_user(arg, &ifr,
5081					 sizeof(struct ifreq)))
5082				ret = -EFAULT;
5083		}
5084		return ret;
5085
5086	/*
5087	 *	These ioctl calls:
5088	 *	- require superuser power.
5089	 *	- require strict serialization.
5090	 *	- do not return a value
5091	 */
5092	case SIOCSIFFLAGS:
5093	case SIOCSIFMETRIC:
5094	case SIOCSIFMTU:
5095	case SIOCSIFMAP:
5096	case SIOCSIFHWADDR:
5097	case SIOCSIFSLAVE:
5098	case SIOCADDMULTI:
5099	case SIOCDELMULTI:
5100	case SIOCSIFHWBROADCAST:
5101	case SIOCSIFTXQLEN:
5102	case SIOCSMIIREG:
5103	case SIOCBONDENSLAVE:
5104	case SIOCBONDRELEASE:
5105	case SIOCBONDSETHWADDR:
5106	case SIOCBONDCHANGEACTIVE:
5107	case SIOCBRADDIF:
5108	case SIOCBRDELIF:
5109	case SIOCSHWTSTAMP:
5110		if (!capable(CAP_NET_ADMIN))
5111			return -EPERM;
5112		/* fall through */
5113	case SIOCBONDSLAVEINFOQUERY:
5114	case SIOCBONDINFOQUERY:
5115		dev_load(net, ifr.ifr_name);
5116		rtnl_lock();
5117		ret = dev_ifsioc(net, &ifr, cmd);
5118		rtnl_unlock();
5119		return ret;
5120
5121	case SIOCGIFMEM:
5122		/* Get the per device memory space. We can add this but
5123		 * currently do not support it */
5124	case SIOCSIFMEM:
5125		/* Set the per device memory buffer space.
5126		 * Not applicable in our case */
5127	case SIOCSIFLINK:
5128		return -ENOTTY;
5129
5130	/*
5131	 *	Unknown or private ioctl.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5132	 */
5133	default:
5134		if (cmd == SIOCWANDEV ||
5135		    (cmd >= SIOCDEVPRIVATE &&
5136		     cmd <= SIOCDEVPRIVATE + 15)) {
5137			dev_load(net, ifr.ifr_name);
5138			rtnl_lock();
5139			ret = dev_ifsioc(net, &ifr, cmd);
5140			rtnl_unlock();
5141			if (!ret && copy_to_user(arg, &ifr,
5142						 sizeof(struct ifreq)))
5143				ret = -EFAULT;
5144			return ret;
5145		}
5146		/* Take care of Wireless Extensions */
5147		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5148			return wext_handle_ioctl(net, &ifr, cmd, arg);
5149		return -ENOTTY;
5150	}
 
 
5151}
5152
 
 
 
 
 
5153
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates come from a function-static counter that restarts at 1
 *	whenever it would become non-positive; values already used by a
 *	device in @net are skipped.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5172
5173/* Delayed registration/unregisteration */
5174static LIST_HEAD(net_todo_list);
 
 
 
5175
5176static void net_set_todo(struct net_device *dev)
 
 
 
 
 
 
 
 
 
5177{
5178	list_add_tail(&dev->todo_list, &net_todo_list);
 
 
 
 
 
 
 
 
 
5179}
5180
5181static void rollback_registered_many(struct list_head *head)
 
5182{
5183	struct net_device *dev, *tmp;
 
 
 
5184
5185	BUG_ON(dev_boot_phase);
5186	ASSERT_RTNL();
5187
5188	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5189		/* Some devices call without registering
5190		 * for initialization unwind. Remove those
5191		 * devices and proceed with the remaining.
5192		 */
5193		if (dev->reg_state == NETREG_UNINITIALIZED) {
5194			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5195				 dev->name, dev);
5196
5197			WARN_ON(1);
5198			list_del(&dev->unreg_list);
5199			continue;
5200		}
5201		dev->dismantle = true;
5202		BUG_ON(dev->reg_state != NETREG_REGISTERED);
 
 
 
5203	}
5204
5205	/* If device is running, close it first. */
5206	dev_close_many(head);
 
 
 
5207
5208	list_for_each_entry(dev, head, unreg_list) {
5209		/* And unlink it from device chain. */
5210		unlist_netdevice(dev);
 
 
 
5211
5212		dev->reg_state = NETREG_UNREGISTERING;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5213	}
5214
5215	synchronize_net();
 
 
 
 
5216
5217	list_for_each_entry(dev, head, unreg_list) {
5218		/* Shutdown queueing discipline. */
5219		dev_shutdown(dev);
5220
 
 
 
 
 
5221
5222		/* Notify protocols, that we are about to destroy
5223		   this device. They should clean all the things.
5224		*/
5225		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5226
5227		if (!dev->rtnl_link_ops ||
5228		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5229			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
 
 
5230
5231		/*
5232		 *	Flush the unicast and multicast chains
5233		 */
5234		dev_uc_flush(dev);
5235		dev_mc_flush(dev);
5236
5237		if (dev->netdev_ops->ndo_uninit)
5238			dev->netdev_ops->ndo_uninit(dev);
5239
5240		/* Notifier chain MUST detach us from master device. */
5241		WARN_ON(dev->master);
 
 
5242
5243		/* Remove entries from kobject tree */
5244		netdev_unregister_kobject(dev);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5245	}
5246
5247	/* Process any work delayed until the end of the batch */
5248	dev = list_first_entry(head, struct net_device, unreg_list);
5249	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
 
 
 
 
 
 
5250
5251	synchronize_net();
 
 
 
 
 
 
 
 
 
 
5252
5253	list_for_each_entry(dev, head, unreg_list)
5254		dev_put(dev);
 
 
 
 
5255}
5256
5257static void rollback_registered(struct net_device *dev)
 
 
 
 
5258{
5259	LIST_HEAD(single);
 
 
5260
5261	list_add(&dev->unreg_list, &single);
5262	rollback_registered_many(&single);
5263	list_del(&single);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5264}
5265
5266static netdev_features_t netdev_fix_features(struct net_device *dev,
5267	netdev_features_t features)
5268{
5269	/* Fix illegal checksum combinations */
5270	if ((features & NETIF_F_HW_CSUM) &&
5271	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5272		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5273		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5274	}
5275
5276	/* Fix illegal SG+CSUM combinations. */
5277	if ((features & NETIF_F_SG) &&
5278	    !(features & NETIF_F_ALL_CSUM)) {
5279		netdev_dbg(dev,
5280			"Dropping NETIF_F_SG since no checksum feature.\n");
5281		features &= ~NETIF_F_SG;
5282	}
5283
5284	/* TSO requires that SG is present as well. */
5285	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5286		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5287		features &= ~NETIF_F_ALL_TSO;
5288	}
5289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5290	/* TSO ECN requires that TSO is present as well. */
5291	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5292		features &= ~NETIF_F_TSO_ECN;
5293
5294	/* Software GSO depends on SG. */
5295	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5296		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5297		features &= ~NETIF_F_GSO;
5298	}
5299
5300	/* UFO needs SG and checksumming */
5301	if (features & NETIF_F_UFO) {
5302		/* maybe split UFO into V4 and V6? */
5303		if (!((features & NETIF_F_GEN_CSUM) ||
5304		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5305			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5306			netdev_dbg(dev,
5307				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5308			features &= ~NETIF_F_UFO;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5309		}
5310
5311		if (!(features & NETIF_F_SG)) {
5312			netdev_dbg(dev,
5313				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5314			features &= ~NETIF_F_UFO;
5315		}
5316	}
5317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5318	return features;
5319}
5320
5321int __netdev_update_features(struct net_device *dev)
5322{
 
5323	netdev_features_t features;
5324	int err = 0;
 
5325
5326	ASSERT_RTNL();
5327
5328	features = netdev_get_wanted_features(dev);
5329
5330	if (dev->netdev_ops->ndo_fix_features)
5331		features = dev->netdev_ops->ndo_fix_features(dev, features);
5332
5333	/* driver might be less strict about feature dependencies */
5334	features = netdev_fix_features(dev, features);
5335
 
 
 
 
5336	if (dev->features == features)
5337		return 0;
5338
5339	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5340		&dev->features, &features);
5341
5342	if (dev->netdev_ops->ndo_set_features)
5343		err = dev->netdev_ops->ndo_set_features(dev, features);
 
 
5344
5345	if (unlikely(err < 0)) {
5346		netdev_err(dev,
5347			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5348			err, &features, &dev->features);
 
 
 
5349		return -1;
5350	}
5351
5352	if (!err)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5353		dev->features = features;
 
5354
5355	return 1;
5356}
5357
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recomputes dev->features through __netdev_update_features() and
 *	calls netdev_features_change() whenever that helper returns a
 *	non-zero value.  Intended to be called after driver or hardware
 *	dependent conditions that influence the features may have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5372
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recomputes dev->features and then calls netdev_features_change()
 *	unconditionally, i.e. even when the feature set did not change.
 *	Use this instead of netdev_update_features() when
 *	dev->vlan_features might also have changed, so that the change is
 *	propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: notification is always sent. */
	(void)__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5389
5390/**
5391 *	netif_stacked_transfer_operstate -	transfer operstate
5392 *	@rootdev: the root or lower level device to transfer state from
5393 *	@dev: the device to transfer operstate to
5394 *
5395 *	Transfer operational state from root to device. This is normally
5396 *	called when a stacking relationship exists between the root
5397 *	device and the device(a leaf device).
5398 */
5399void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5400					struct net_device *dev)
5401{
5402	if (rootdev->operstate == IF_OPER_DORMANT)
5403		netif_dormant_on(dev);
5404	else
5405		netif_dormant_off(dev);
5406
5407	if (netif_carrier_ok(rootdev)) {
5408		if (!netif_carrier_ok(dev))
5409			netif_carrier_on(dev);
5410	} else {
5411		if (netif_carrier_ok(dev))
5412			netif_carrier_off(dev);
5413	}
 
 
5414}
5415EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5416
#ifdef CONFIG_RPS
/*
 * netif_alloc_rx_queues - allocate the RX queue array of a device
 * @dev: device; dev->num_rx_queues must already be set and non-zero
 *
 * Allocates a zeroed array of dev->num_rx_queues entries, stores it in
 * dev->_rx and points every entry back at @dev.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	const unsigned int nr_queues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *rxq;

	BUG_ON(nr_queues < 1);

	queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
	if (queues == NULL) {
		pr_err("netdev: Unable to allocate %u rx queues\n", nr_queues);
		return -ENOMEM;
	}

	dev->_rx = queues;

	for (rxq = queues; rxq < queues + nr_queues; rxq++)
		rxq->dev = dev;

	return 0;
}
#endif
5437
5438static void netdev_init_one_queue(struct net_device *dev,
5439				  struct netdev_queue *queue, void *_unused)
5440{
5441	/* Initialize queue lock */
5442	spin_lock_init(&queue->_xmit_lock);
5443	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5444	queue->xmit_lock_owner = -1;
5445	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5446	queue->dev = dev;
5447#ifdef CONFIG_BQL
5448	dql_init(&queue->dql, HZ);
5449#endif
5450}
5451
 
 
 
 
 
5452static int netif_alloc_netdev_queues(struct net_device *dev)
5453{
5454	unsigned int count = dev->num_tx_queues;
5455	struct netdev_queue *tx;
 
5456
5457	BUG_ON(count < 1);
 
5458
5459	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5460	if (!tx) {
5461		pr_err("netdev: Unable to allocate %u tx queues\n", count);
5462		return -ENOMEM;
5463	}
5464	dev->_tx = tx;
5465
5466	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5467	spin_lock_init(&dev->tx_global_lock);
5468
5469	return 0;
5470}
5471
 
 
 
 
 
 
 
 
 
 
 
 
5472/**
5473 *	register_netdevice	- register a network device
5474 *	@dev: device to register
5475 *
5476 *	Take a completed network device structure and add it to the kernel
5477 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5478 *	chain. 0 is returned on success. A negative errno code is returned
5479 *	on a failure to set up the device, or if the name is a duplicate.
5480 *
5481 *	Callers must hold the rtnl semaphore. You may want
5482 *	register_netdev() instead of this.
5483 *
 *	Error unwinding: a failure of name validation or of ndo_init
 *	returns without calling ndo_uninit; a failure of the
 *	%NETDEV_POST_INIT notifier or of netdev_register_kobject() calls
 *	ndo_uninit (if provided) before returning; a failure reported by
 *	the %NETDEV_REGISTER notifier is undone with rollback_registered().
 *
5484 *	BUGS:
5485 *	The locking appears insufficient to guarantee two parallel registers
5486 *	will not get the same name.
5487 */
5488
5489int register_netdevice(struct net_device *dev)
5490{
5491	int ret;
5492	struct net *net = dev_net(dev);
5493
5494	BUG_ON(dev_boot_phase);
5495	ASSERT_RTNL();
5496
5497	might_sleep();
5498
5499	/* When net_device's are persistent, this will be fatal. */
5500	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5501	BUG_ON(!net);
5502
5503	spin_lock_init(&dev->addr_list_lock);
5504	netdev_set_addr_lockdep_class(dev);
5505
	/* -1 is a sentinel: if it is still -1 after ndo_init, iflink is
	 * defaulted to the device's own ifindex further down.
	 */
5506	dev->iflink = -1;
5507
5508	ret = dev_get_valid_name(dev, dev->name);
5509	if (ret < 0)
5510		goto out;
5511
5512	/* Init, if this function is available */
5513	if (dev->netdev_ops->ndo_init) {
5514		ret = dev->netdev_ops->ndo_init(dev);
5515		if (ret) {
			/* Callers only ever see a negative errno: a positive
			 * ndo_init result is mapped to -EIO.
			 */
5516			if (ret > 0)
5517				ret = -EIO;
5518			goto out;
5519		}
5520	}
5521
5522	dev->ifindex = dev_new_index(net);
5523	if (dev->iflink == -1)
5524		dev->iflink = dev->ifindex;
5525
5526	/* Transfer changeable features to wanted_features and enable
5527	 * software offloads (GSO and GRO).
5528	 */
5529	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5530	dev->features |= NETIF_F_SOFT_FEATURES;
5531	dev->wanted_features = dev->features & dev->hw_features;
5532
5533	/* Turn on no cache copy if HW is doing checksum */
5534	if (!(dev->flags & IFF_LOOPBACK)) {
5535		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5536		if (dev->features & NETIF_F_ALL_CSUM) {
5537			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5538			dev->features |= NETIF_F_NOCACHE_COPY;
5539		}
5540	}
5541
5542	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5543	 */
5544	dev->vlan_features |= NETIF_F_HIGHDMA;
5545
5546	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5547	ret = notifier_to_errno(ret);
5548	if (ret)
5549		goto err_uninit;
5550
5551	ret = netdev_register_kobject(dev);
5552	if (ret)
5553		goto err_uninit;
5554	dev->reg_state = NETREG_REGISTERED;
5555
	/* Return value ignored: no feature-change notification is sent
	 * from here.
	 */
5556	__netdev_update_features(dev);
5557
5558	/*
5559	 *	Default initial state at registry is that the
5560	 *	device is present.
5561	 */
5562
5563	set_bit(__LINK_STATE_PRESENT, &dev->state);
5564
5565	dev_init_scheduler(dev);
5566	dev_hold(dev);
5567	list_netdevice(dev);
5568	add_device_randomness(dev->dev_addr, dev->addr_len);
5569
5570	/* Notify protocols, that a new device appeared. */
5571	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5572	ret = notifier_to_errno(ret);
5573	if (ret) {
5574		rollback_registered(dev);
5575		dev->reg_state = NETREG_UNREGISTERED;
5576	}
5577	/*
5578	 *	Prevent userspace races by waiting until the network
5579	 *	device is fully setup before sending notifications.
5580	 */
	/* NOTE(review): this block is also reached when the
	 * NETDEV_REGISTER notifier failed and the device was rolled back
	 * above -- confirm that sending RTM_NEWLINK then is intended.
	 */
5581	if (!dev->rtnl_link_ops ||
5582	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5583		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5584
5585out:
5586	return ret;
5587
	/* Undo a successful ndo_init, then leave through the common exit. */
5588err_uninit:
5589	if (dev->netdev_ops->ndo_uninit)
5590		dev->netdev_ops->ndo_uninit(dev);
5591	goto out;
5592}
5593EXPORT_SYMBOL(register_netdevice);
5594
5595/**
5596 *	init_dummy_netdev	- init a dummy network device for NAPI
5597 *	@dev: device to init
5598 *
5599 *	This takes a network device structure and initialize the minimum
5600 *	amount of fields so it can be used to schedule NAPI polls without
5601 *	registering a full blown interface. This is to be used by drivers
5602 *	that need to tie several hardware interfaces to a single NAPI
5603 *	poll scheduler due to HW limitations.
5604 */
5605int init_dummy_netdev(struct net_device *dev)
5606{
5607	/* Clear everything. Note we don't initialize spinlocks
5608	 * are they aren't supposed to be taken by any of the
5609	 * NAPI code and this dummy netdev is supposed to be
5610	 * only ever used for NAPI polls
5611	 */
5612	memset(dev, 0, sizeof(struct net_device));
5613
5614	/* make sure we BUG if trying to hit standard
5615	 * register/unregister code path
5616	 */
5617	dev->reg_state = NETREG_DUMMY;
5618
5619	/* NAPI wants this */
5620	INIT_LIST_HEAD(&dev->napi_list);
5621
5622	/* a dummy interface is started by default */
5623	set_bit(__LINK_STATE_PRESENT, &dev->state);
5624	set_bit(__LINK_STATE_START, &dev->state);
5625
 
 
 
5626	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5627	 * because users of this 'device' dont need to change
5628	 * its refcount.
5629	 */
5630
5631	return 0;
5632}
5633EXPORT_SYMBOL_GPL(init_dummy_netdev);
5634
5635
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Locked wrapper around register_netdevice(): takes the rtnl
 *	semaphore, registers @dev and releases the semaphore again.
 *	A %NETDEV_REGISTER message is sent to the netdev notifier chain.
 *	Returns 0 on success, or a negative errno code on a failure to
 *	set up the device or if the name is a duplicate.  The device name
 *	is expanded if a format string was passed to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
5659
5660int netdev_refcnt_read(const struct net_device *dev)
5661{
 
5662	int i, refcnt = 0;
5663
5664	for_each_possible_cpu(i)
5665		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5666	return refcnt;
 
 
 
5667}
5668EXPORT_SYMBOL(netdev_refcnt_read);
5669
5670/*
5671 * netdev_wait_allrefs - wait until all references are gone.
5672 *
5673 * This is called when unregistering network devices.
5674 *
5675 * Any protocol or device that holds a reference should register
5676 * for netdevice notification, and cleanup and put back the
5677 * reference if they receive an UNREGISTER event.
5678 * We can get stuck here if buggy protocols don't correctly
5679 * call dev_put.
 *
 * The loop polls netdev_refcnt_read() every 250ms.  While references
 * remain, NETDEV_UNREGISTER is rebroadcast under the RTNL once more
 * than a second has passed since the last broadcast, and an emergency
 * message is logged once more than ten seconds have passed since the
 * last one.
5680 */
5681static void netdev_wait_allrefs(struct net_device *dev)
5682{
5683	unsigned long rebroadcast_time, warning_time;
5684	int refcnt;
5685
5686	linkwatch_forget_dev(dev);
5687
5688	rebroadcast_time = warning_time = jiffies;
5689	refcnt = netdev_refcnt_read(dev);
5690
5691	while (refcnt != 0) {
5692		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5693			rtnl_lock();
5694
5695			/* Rebroadcast unregister notification */
5696			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5697			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5698			 * should have already handled it the first time */
5699
5700			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5701				     &dev->state)) {
5702				/* We must not have linkwatch events
5703				 * pending on unregister. If this
5704				 * happens, we simply run the queue
5705				 * unscheduled, resulting in a noop
5706				 * for this device.
5707				 */
5708				linkwatch_run_queue();
5709			}
5710
			/* NOTE(review): __rtnl_unlock() rather than
			 * rtnl_unlock() -- presumably so that
			 * netdev_run_todo(), which is documented as being
			 * invoked by rtnl_unlock(), is not re-entered from
			 * here; confirm.
			 */
5711			__rtnl_unlock();
5712
5713			rebroadcast_time = jiffies;
5714		}
5715
5716		msleep(250);
5717
5718		refcnt = netdev_refcnt_read(dev);
5719
5720		if (time_after(jiffies, warning_time + 10 * HZ)) {
5721			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5722				 dev->name, refcnt);
5723			warning_time = jiffies;
5724		}
5725	}
5726}
5727
5728/* The sequence is:
5729 *
5730 *	rtnl_lock();
5731 *	...
5732 *	register_netdevice(x1);
5733 *	register_netdevice(x2);
5734 *	...
5735 *	unregister_netdevice(y1);
5736 *	unregister_netdevice(y2);
5737 *      ...
5738 *	rtnl_unlock();
5739 *	free_netdev(y1);
5740 *	free_netdev(y2);
5741 *
5742 * We are invoked by rtnl_unlock().
5743 * This allows us to deal with problems:
5744 * 1) We can delete sysfs objects which invoke hotplug
5745 *    without deadlocking with linkwatch via keventd.
5746 * 2) Since we run with the RTNL semaphore not held, we can sleep
5747 *    safely in order to wait for the netdev refcnt to drop to zero.
5748 *
5749 * We must not return until all unregister events added during
5750 * the interval the lock was held have been completed.
 *
 * For every device taken from net_todo_list that is in state
 * NETREG_UNREGISTERING: mark it NETREG_UNREGISTERED, flush the per-cpu
 * backlogs, wait for its reference count to reach zero, call its
 * destructor (if any) and drop the kobject reference.  Devices in any
 * other state are reported and skipped.
5751 */
5752void netdev_run_todo(void)
5753{
5754	struct list_head list;
5755
5756	/* Snapshot list, allow later requests */
5757	list_replace_init(&net_todo_list, &list);
5758
	/* The RTNL is dropped here; everything below runs without it. */
5759	__rtnl_unlock();
5760
5761	/* Wait for rcu callbacks to finish before attempting to drain
5762	 * the device list.  This usually avoids a 250ms wait.
5763	 */
5764	if (!list_empty(&list))
5765		rcu_barrier();
5766
5767	while (!list_empty(&list)) {
5768		struct net_device *dev
5769			= list_first_entry(&list, struct net_device, todo_list);
5770		list_del(&dev->todo_list);
5771
5772		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5773			pr_err("network todo '%s' but state %d\n",
5774			       dev->name, dev->reg_state);
5775			dump_stack();
5776			continue;
5777		}
5778
5779		dev->reg_state = NETREG_UNREGISTERED;
5780
5781		on_each_cpu(flush_backlog, dev, 1);
5782
5783		netdev_wait_allrefs(dev);
5784
5785		/* paranoia */
5786		BUG_ON(netdev_refcnt_read(dev));
5787		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5788		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5789		WARN_ON(dev->dn_ptr);
5790
5791		if (dev->destructor)
5792			dev->destructor(dev);
5793
5794		/* Free network device */
5795		kobject_put(&dev->dev.kobj);
5796	}
5797}
5798
5799/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5800 * fields in the same order, with only the type differing.
 
 
5801 */
5802void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5803			     const struct net_device_stats *netdev_stats)
5804{
5805#if BITS_PER_LONG == 64
5806	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5807	memcpy(stats64, netdev_stats, sizeof(*stats64));
5808#else
5809	size_t i, n = sizeof(*stats64) / sizeof(u64);
5810	const unsigned long *src = (const unsigned long *)netdev_stats;
5811	u64 *dst = (u64 *)stats64;
5812
5813	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5814		     sizeof(*stats64) / sizeof(u64));
5815	for (i = 0; i < n; i++)
5816		dst[i] = src[i];
5817#endif
 
 
5818}
5819EXPORT_SYMBOL(netdev_stats_to_stats64);
5820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5821/**
5822 *	dev_get_stats	- get network device statistics
5823 *	@dev: device to get statistics from
5824 *	@storage: place to store stats
5825 *
5826 *	Get network statistics from device. Return @storage.
5827 *	The device driver may provide its own method by setting
5828 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5829 *	otherwise the internal statistics structure is used.
5830 */
5831struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5832					struct rtnl_link_stats64 *storage)
5833{
5834	const struct net_device_ops *ops = dev->netdev_ops;
 
5835
5836	if (ops->ndo_get_stats64) {
5837		memset(storage, 0, sizeof(*storage));
5838		ops->ndo_get_stats64(dev, storage);
5839	} else if (ops->ndo_get_stats) {
5840		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5841	} else {
5842		netdev_stats_to_stats64(storage, &dev->stats);
5843	}
5844	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5845	return storage;
5846}
5847EXPORT_SYMBOL(dev_get_stats);
5848
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * dev_ingress_queue_create - return the ingress queue, creating it on demand
 * @dev: device to look at
 *
 * Returns the existing ingress queue of @dev if there is one.  With
 * CONFIG_NET_CLS_ACT a missing queue is allocated, initialised, pointed
 * at noop_qdisc and published in dev->ingress_queue; NULL is returned
 * if that allocation fails.  Without CONFIG_NET_CLS_ACT the result of
 * dev_ingress_queue() is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5866
 
 
 
 
 
 
 
 
 
 
5867/**
5868 *	alloc_netdev_mqs - allocate network device
5869 *	@sizeof_priv:	size of private data to allocate space for
5870 *	@name:		device name format string
5871 *	@setup:		callback to initialize device
5872 *	@txqs:		the number of TX subqueues to allocate
5873 *	@rxqs:		the number of RX subqueues to allocate
5874 *
5875 *	Allocates a struct net_device with private data area for driver use
5876 *	and performs basic initialization.  Also allocates subqueue structs
5877 *	for each queue on the device.
 *
 *	Returns the new device, or NULL if @txqs is zero, if @rxqs is zero
 *	(with CONFIG_RPS), or if any allocation fails.
5878 */
5879struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5880		void (*setup)(struct net_device *),
5881		unsigned int txqs, unsigned int rxqs)
5882{
5883	struct net_device *dev;
5884	size_t alloc_size;
5885	struct net_device *p;
5886
	/* sizeof does not evaluate its operand, so the still-unset dev
	 * pointer is not dereferenced here.
	 */
5887	BUG_ON(strlen(name) >= sizeof(dev->name));
5888
5889	if (txqs < 1) {
5890		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5891		return NULL;
5892	}
5893
5894#ifdef CONFIG_RPS
5895	if (rxqs < 1) {
5896		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5897		return NULL;
5898	}
5899#endif
5900
5901	alloc_size = sizeof(struct net_device);
5902	if (sizeof_priv) {
5903		/* ensure 32-byte alignment of private area */
5904		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5905		alloc_size += sizeof_priv;
5906	}
5907	/* ensure 32-byte alignment of whole construct */
5908	alloc_size += NETDEV_ALIGN - 1;
5909
5910	p = kzalloc(alloc_size, GFP_KERNEL);
5911	if (!p) {
5912		pr_err("alloc_netdev: Unable to allocate device\n");
5913		return NULL;
5914	}
5915
	/* dev is the aligned address inside the allocation; the offset
	 * from the raw pointer is kept in dev->padded, which free_netdev()
	 * subtracts again before calling kfree().
	 */
5916	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5917	dev->padded = (char *)dev - (char *)p;
5918
5919	dev->pcpu_refcnt = alloc_percpu(int);
5920	if (!dev->pcpu_refcnt)
5921		goto free_p;
5922
5923	if (dev_addr_init(dev))
5924		goto free_pcpu;
5925
5926	dev_mc_init(dev);
5927	dev_uc_init(dev);
5928
5929	dev_net_set(dev, &init_net);
5930
5931	dev->gso_max_size = GSO_MAX_SIZE;
5932	dev->gso_max_segs = GSO_MAX_SEGS;
5933
5934	INIT_LIST_HEAD(&dev->napi_list);
5935	INIT_LIST_HEAD(&dev->unreg_list);
5936	INIT_LIST_HEAD(&dev->link_watch_list);
5937	dev->priv_flags = IFF_XMIT_DST_RELEASE;
	/* The driver callback runs before the queue arrays exist and
	 * before dev->name is filled in.
	 */
5938	setup(dev);
5939
5940	dev->num_tx_queues = txqs;
5941	dev->real_num_tx_queues = txqs;
5942	if (netif_alloc_netdev_queues(dev))
5943		goto free_all;
5944
5945#ifdef CONFIG_RPS
5946	dev->num_rx_queues = rxqs;
5947	dev->real_num_rx_queues = rxqs;
5948	if (netif_alloc_rx_queues(dev))
5949		goto free_all;
5950#endif
5951
	/* Length was checked by the BUG_ON at the top of the function. */
5952	strcpy(dev->name, name);
5953	dev->group = INIT_NETDEV_GROUP;
5954	return dev;
5955
	/* Failure after setup(): hand the whole device to free_netdev(). */
5956free_all:
5957	free_netdev(dev);
5958	return NULL;
5959
	/* Only reached from the dev_addr_init() failure above, at which
	 * point _tx/_rx have not been allocated yet (the memory came from
	 * kzalloc, so they are NULL and the kfree calls are no-ops).
	 */
5960free_pcpu:
5961	free_percpu(dev->pcpu_refcnt);
5962	kfree(dev->_tx);
5963#ifdef CONFIG_RPS
5964	kfree(dev->_rx);
5965#endif
5966
5967free_p:
5968	kfree(p);
5969	return NULL;
5970}
5971EXPORT_SYMBOL(alloc_netdev_mqs);
5972
5973/**
5974 *	free_netdev - free network device
5975 *	@dev: device
5976 *
5977 *	This function does the last stage of destroying an allocated device
5978 * 	interface. The reference to the device object is released.
5979 *	If this is the last reference then it will be freed.
 *
 *	A device still in NETREG_UNINITIALIZED state (never registered)
 *	is freed directly with kfree(); otherwise the device must be in
 *	NETREG_UNREGISTERED state and is released through put_device().
5980 */
5981void free_netdev(struct net_device *dev)
5982{
5983	struct napi_struct *p, *n;
5984
5985	release_net(dev_net(dev));
5986
5987	kfree(dev->_tx);
5988#ifdef CONFIG_RPS
5989	kfree(dev->_rx);
5990#endif
5991
5992	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5993
5994	/* Flush device addresses */
5995	dev_addr_flush(dev);
5996
5997	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5998		netif_napi_del(p);
5999
6000	free_percpu(dev->pcpu_refcnt);
6001	dev->pcpu_refcnt = NULL;
6002
6003	/*  Compatibility with error handling in drivers */
6004	if (dev->reg_state == NETREG_UNINITIALIZED) {
		/* dev->padded is the offset recorded by alloc_netdev_mqs();
		 * subtracting it recovers the pointer kzalloc() returned.
		 */
6005		kfree((char *)dev - dev->padded);
6006		return;
6007	}
6008
6009	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6010	dev->reg_state = NETREG_RELEASED;
6011
6012	/* will free via device release */
6013	put_device(&dev->dev);
6014}
6015EXPORT_SYMBOL(free_netdev);
6016
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits for packets currently being received to be done; later
 *	packets are not blocked from starting.  The expedited RCU grace
 *	period is used when the RTNL is locked, the normal one otherwise.
 *	May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6032
6033/**
6034 *	unregister_netdevice_queue - remove device from the kernel
6035 *	@dev: device
6036 *	@head: list
6037 *
6038 *	This function shuts down a device interface and removes it
6039 *	from the kernel tables.
6040 *	If head not NULL, device is queued to be unregistered later.
6041 *
6042 *	Callers must hold the rtnl semaphore.  You may want
6043 *	unregister_netdev() instead of this.
6044 */
6045
6046void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6047{
6048	ASSERT_RTNL();
6049
6050	if (head) {
6051		list_move_tail(&dev->unreg_list, head);
6052	} else {
6053		rollback_registered(dev);
6054		/* Finish processing unregister after unlock */
6055		net_set_todo(dev);
 
6056	}
6057}
6058EXPORT_SYMBOL(unregister_netdevice_queue);
6059
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6060/**
6061 *	unregister_netdevice_many - unregister many devices
6062 *	@head: list of devices
 
 
 
6063 */
6064void unregister_netdevice_many(struct list_head *head)
6065{
6066	struct net_device *dev;
6067
6068	if (!list_empty(head)) {
6069		rollback_registered_many(head);
6070		list_for_each_entry(dev, head, unreg_list)
6071			net_set_todo(dev);
6072	}
6073}
6074EXPORT_SYMBOL(unregister_netdevice_many);
6075
6076/**
6077 *	unregister_netdev - remove device from the kernel
6078 *	@dev: device
6079 *
6080 *	This function shuts down a device interface and removes it
6081 *	from the kernel tables.
6082 *
6083 *	This is just a wrapper for unregister_netdevice that takes
6084 *	the rtnl semaphore.  In general you want to use this and not
6085 *	unregister_netdevice.
6086 */
6087void unregister_netdev(struct net_device *dev)
6088{
6089	rtnl_lock();
6090	unregister_netdevice(dev);
6091	rtnl_unlock();
6092}
6093EXPORT_SYMBOL(unregister_netdev);
6094
6095/**
6096 *	dev_change_net_namespace - move device to different network namespace
6097 *	@dev: device
6098 *	@net: network namespace
6099 *	@pat: If not NULL name pattern to try if the current device name
6100 *	      is already taken in the destination network namespace.
6101 *
6102 *	This function shuts down a device interface and moves it
6103 *	to a new network namespace. On success 0 is returned, on
6104 *	a failure a negative errno code is returned.
 *
 *	Returns -EINVAL for a namespace-local or unregistered device and
 *	-EEXIST when no usable name is available in @net.  Moving a
 *	device to the namespace it already lives in succeeds without
 *	doing anything.
6105 *
6106 *	Callers must hold the rtnl semaphore.
6107 */
6108
6109int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6110{
6111	int err;
6112
6113	ASSERT_RTNL();
6114
6115	/* Don't allow namespace local devices to be moved. */
6116	err = -EINVAL;
6117	if (dev->features & NETIF_F_NETNS_LOCAL)
6118		goto out;
6119
6120	/* Ensure the device has been registered */
6121	err = -EINVAL;
6122	if (dev->reg_state != NETREG_REGISTERED)
6123		goto out;
6124
6125	/* Get out if there is nothing to do */
6126	err = 0;
6127	if (net_eq(dev_net(dev), net))
6128		goto out;
6129
6130	/* Pick the destination device name, and ensure
6131	 * we can use it in the destination network namespace.
6132	 */
6133	err = -EEXIST;
6134	if (__dev_get_by_name(net, dev->name)) {
6135		/* We get here if we can't use the current device name */
6136		if (!pat)
6137			goto out;
		/* Whatever dev_get_valid_name() reports, the caller sees
		 * the -EEXIST set above.
		 */
6138		if (dev_get_valid_name(dev, pat) < 0)
6139			goto out;
6140	}
6141
6142	/*
6143	 * And now a mini version of register_netdevice unregister_netdevice.
6144	 */
6145
6146	/* If device is running close it first. */
6147	dev_close(dev);
6148
6149	/* And unlink it from device chain */
6150	err = -ENODEV;
6151	unlist_netdevice(dev);
6152
6153	synchronize_net();
6154
6155	/* Shutdown queueing discipline. */
6156	dev_shutdown(dev);
6157
6158	/* Notify protocols, that we are about to destroy
6159	   this device. They should clean all the things.
6160
6161	   Note that dev->reg_state stays at NETREG_REGISTERED.
6162	   This is wanted because this way 8021q and macvlan know
6163	   the device is just moving and can keep their slaves up.
6164	*/
6165	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6166	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6167	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6168
6169	/*
6170	 *	Flush the unicast and multicast chains
6171	 */
6172	dev_uc_flush(dev);
6173	dev_mc_flush(dev);
6174
6175	/* Actually switch the network namespace */
6176	dev_net_set(dev, net);
6177
6178	/* If there is an ifindex conflict assign a new one */
6179	if (__dev_get_by_index(net, dev->ifindex)) {
		/* Remember whether iflink pointed at the device itself so
		 * that it keeps doing so after the renumbering.
		 */
6180		int iflink = (dev->iflink == dev->ifindex);
6181		dev->ifindex = dev_new_index(net);
6182		if (iflink)
6183			dev->iflink = dev->ifindex;
6184	}
6185
6186	/* Fixup kobjects */
	/* A rename failure only triggers a warning; err is reset to 0
	 * before returning.
	 */
6187	err = device_rename(&dev->dev, dev->name);
6188	WARN_ON(err);
6189
6190	/* Add the device back in the hashes */
6191	list_netdevice(dev);
6192
6193	/* Notify protocols, that a new device appeared. */
6194	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6195
6196	/*
6197	 *	Prevent userspace races by waiting until the network
6198	 *	device is fully setup before sending notifications.
6199	 */
6200	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6201
6202	synchronize_net();
6203	err = 0;
6204out:
6205	return err;
6206}
6207EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6208
/*
 * dev_cpu_callback - CPU notifier: take over the softnet state of a dead CPU
 * @nfb: notifier block (unused)
 * @action: CPU notifier event; only CPU_DEAD and CPU_DEAD_FROZEN are handled
 * @ocpu: number of the CPU the event refers to, passed as a pointer value
 *
 * Moves the completion queue, output queue and NAPI poll list of the
 * dead CPU onto the CPU running this callback, then feeds the packets
 * left on the dead CPU's process and input queues back through
 * netif_rx().  Always returns NOTIFY_OK.
 */
6209static int dev_cpu_callback(struct notifier_block *nfb,
6210			    unsigned long action,
6211			    void *ocpu)
6212{
6213	struct sk_buff **list_skb;
6214	struct sk_buff *skb;
6215	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6216	struct softnet_data *sd, *oldsd;
6217
6218	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6219		return NOTIFY_OK;
6220
	/* The queue splicing below is done with local interrupts off. */
6221	local_irq_disable();
6222	cpu = smp_processor_id();
6223	sd = &per_cpu(softnet_data, cpu);
6224	oldsd = &per_cpu(softnet_data, oldcpu);
6225
6226	/* Find end of our completion_queue. */
6227	list_skb = &sd->completion_queue;
6228	while (*list_skb)
6229		list_skb = &(*list_skb)->next;
6230	/* Append completion queue from offline CPU. */
6231	*list_skb = oldsd->completion_queue;
6232	oldsd->completion_queue = NULL;
6233
6234	/* Append output queue from offline CPU. */
6235	if (oldsd->output_queue) {
6236		*sd->output_queue_tailp = oldsd->output_queue;
6237		sd->output_queue_tailp = oldsd->output_queue_tailp;
6238		oldsd->output_queue = NULL;
6239		oldsd->output_queue_tailp = &oldsd->output_queue;
6240	}
6241	/* Append NAPI poll list from offline CPU. */
6242	if (!list_empty(&oldsd->poll_list)) {
6243		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6244		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6245	}
6246
6247	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6248	local_irq_enable();
6249
6250	/* Re-inject the offline CPU's pending packets: first those on
	 * its process_queue, then those on its input_pkt_queue.
	 */
6251	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6252		netif_rx(skb);
6253		input_queue_head_incr(oldsd);
6254	}
6255	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6256		netif_rx(skb);
6257		input_queue_head_incr(oldsd);
6258	}
6259
6260	return NOTIFY_OK;
6261}
6262
6263
6264/**
6265 *	netdev_increment_features - increment feature set by one
6266 *	@all: current feature set
6267 *	@one: new feature set
6268 *	@mask: mask feature set
6269 *
6270 *	Computes a new feature set after adding a device with feature set
6271 *	@one to the master device with current feature set @all.  Will not
6272 *	enable anything that is off in @mask. Returns the new feature set.
6273 */
6274netdev_features_t netdev_increment_features(netdev_features_t all,
6275	netdev_features_t one, netdev_features_t mask)
6276{
6277	if (mask & NETIF_F_GEN_CSUM)
6278		mask |= NETIF_F_ALL_CSUM;
6279	mask |= NETIF_F_VLAN_CHALLENGED;
6280
6281	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6282	all &= one | ~NETIF_F_ALL_FOR_ALL;
6283
6284	/* If one device supports hw checksumming, set for all. */
6285	if (all & NETIF_F_GEN_CSUM)
6286		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6287
6288	return all;
6289}
6290EXPORT_SYMBOL(netdev_increment_features);
6291
6292static struct hlist_head *netdev_create_hash(void)
6293{
6294	int i;
6295	struct hlist_head *hash;
6296
6297	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6298	if (hash != NULL)
6299		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6300			INIT_HLIST_HEAD(&hash[i]);
6301
6302	return hash;
6303}
6304
6305/* Initialize per network namespace state */
6306static int __net_init netdev_init(struct net *net)
6307{
6308	if (net != &init_net)
6309		INIT_LIST_HEAD(&net->dev_base_head);
 
 
6310
6311	net->dev_name_head = netdev_create_hash();
6312	if (net->dev_name_head == NULL)
6313		goto err_name;
6314
6315	net->dev_index_head = netdev_create_hash();
6316	if (net->dev_index_head == NULL)
6317		goto err_idx;
6318
 
 
6319	return 0;
6320
6321err_idx:
6322	kfree(net->dev_name_head);
6323err_name:
6324	return -ENOMEM;
6325}
6326
6327/**
6328 *	netdev_drivername - network driver for the device
6329 *	@dev: network device
6330 *
6331 *	Determine network driver for device.
6332 */
6333const char *netdev_drivername(const struct net_device *dev)
6334{
6335	const struct device_driver *driver;
6336	const struct device *parent;
6337	const char *empty = "";
6338
6339	parent = dev->dev.parent;
6340	if (!parent)
6341		return empty;
6342
6343	driver = parent->driver;
6344	if (driver && driver->name)
6345		return driver->name;
6346	return empty;
6347}
6348
6349int __netdev_printk(const char *level, const struct net_device *dev,
6350			   struct va_format *vaf)
6351{
6352	int r;
6353
6354	if (dev && dev->dev.parent)
6355		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6356			       netdev_name(dev), vaf);
6357	else if (dev)
6358		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6359	else
6360		r = printk("%s(NULL net_device): %pV", level, vaf);
6361
6362	return r;
 
 
 
6363}
6364EXPORT_SYMBOL(__netdev_printk);
6365
6366int netdev_printk(const char *level, const struct net_device *dev,
6367		  const char *format, ...)
6368{
6369	struct va_format vaf;
6370	va_list args;
6371	int r;
6372
6373	va_start(args, format);
6374
6375	vaf.fmt = format;
6376	vaf.va = &args;
6377
6378	r = __netdev_printk(level, dev, &vaf);
6379	va_end(args);
6380
6381	return r;
6382}
6383EXPORT_SYMBOL(netdev_printk);
6384
6385#define define_netdev_printk_level(func, level)			\
6386int func(const struct net_device *dev, const char *fmt, ...)	\
6387{								\
6388	int r;							\
6389	struct va_format vaf;					\
6390	va_list args;						\
6391								\
6392	va_start(args, fmt);					\
6393								\
6394	vaf.fmt = fmt;						\
6395	vaf.va = &args;						\
6396								\
6397	r = __netdev_printk(level, dev, &vaf);			\
6398	va_end(args);						\
6399								\
6400	return r;						\
6401}								\
6402EXPORT_SYMBOL(func);
6403
6404define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6405define_netdev_printk_level(netdev_alert, KERN_ALERT);
6406define_netdev_printk_level(netdev_crit, KERN_CRIT);
6407define_netdev_printk_level(netdev_err, KERN_ERR);
6408define_netdev_printk_level(netdev_warn, KERN_WARNING);
6409define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6410define_netdev_printk_level(netdev_info, KERN_INFO);
6411
6412static void __net_exit netdev_exit(struct net *net)
6413{
6414	kfree(net->dev_name_head);
6415	kfree(net->dev_index_head);
 
 
6416}
6417
6418static struct pernet_operations __net_initdata netdev_net_ops = {
6419	.init = netdev_init,
6420	.exit = netdev_exit,
6421};
6422
6423static void __net_exit default_device_exit(struct net *net)
6424{
6425	struct net_device *dev, *aux;
6426	/*
6427	 * Push all migratable network devices back to the
6428	 * initial network namespace
6429	 */
6430	rtnl_lock();
6431	for_each_netdev_safe(net, dev, aux) {
6432		int err;
6433		char fb_name[IFNAMSIZ];
6434
6435		/* Ignore unmoveable devices (i.e. loopback) */
6436		if (dev->features & NETIF_F_NETNS_LOCAL)
6437			continue;
6438
6439		/* Leave virtual devices for the generic cleanup */
6440		if (dev->rtnl_link_ops)
6441			continue;
6442
6443		/* Push remaining network devices to init_net */
6444		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
 
 
6445		err = dev_change_net_namespace(dev, &init_net, fb_name);
6446		if (err) {
6447			pr_emerg("%s: failed to move %s to init_net: %d\n",
6448				 __func__, dev->name, err);
6449			BUG();
6450		}
6451	}
6452	rtnl_unlock();
6453}
6454
6455static void __net_exit default_device_exit_batch(struct list_head *net_list)
6456{
6457	/* At exit all network devices most be removed from a network
6458	 * namespace.  Do this in the reverse order of registration.
6459	 * Do this across as many network namespaces as possible to
6460	 * improve batching efficiency.
6461	 */
6462	struct net_device *dev;
6463	struct net *net;
6464	LIST_HEAD(dev_kill_list);
6465
6466	rtnl_lock();
6467	list_for_each_entry(net, net_list, exit_list) {
 
 
 
 
 
6468		for_each_netdev_reverse(net, dev) {
6469			if (dev->rtnl_link_ops)
6470				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6471			else
6472				unregister_netdevice_queue(dev, &dev_kill_list);
6473		}
6474	}
6475	unregister_netdevice_many(&dev_kill_list);
6476	list_del(&dev_kill_list);
6477	rtnl_unlock();
6478}
6479
6480static struct pernet_operations __net_initdata default_device_ops = {
6481	.exit = default_device_exit,
6482	.exit_batch = default_device_exit_batch,
6483};
6484
6485/*
6486 *	Initialize the DEV module. At boot time this walks the device list and
6487 *	unhooks any devices that fail to initialise (normally hardware not
6488 *	present) and leaves us with a valid list of present and active devices.
6489 *
6490 */
6491
6492/*
6493 *       This is called single threaded during boot, so no need
6494 *       to take the rtnl semaphore.
6495 */
6496static int __init net_dev_init(void)
6497{
6498	int i, rc = -ENOMEM;
6499
6500	BUG_ON(!dev_boot_phase);
6501
6502	if (dev_proc_init())
6503		goto out;
6504
6505	if (netdev_kobject_init())
6506		goto out;
6507
6508	INIT_LIST_HEAD(&ptype_all);
6509	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6510		INIT_LIST_HEAD(&ptype_base[i]);
6511
6512	if (register_pernet_subsys(&netdev_net_ops))
6513		goto out;
6514
6515	/*
6516	 *	Initialise the packet receive queues.
6517	 */
6518
6519	for_each_possible_cpu(i) {
 
6520		struct softnet_data *sd = &per_cpu(softnet_data, i);
6521
6522		memset(sd, 0, sizeof(*sd));
 
6523		skb_queue_head_init(&sd->input_pkt_queue);
6524		skb_queue_head_init(&sd->process_queue);
6525		sd->completion_queue = NULL;
 
 
6526		INIT_LIST_HEAD(&sd->poll_list);
6527		sd->output_queue = NULL;
6528		sd->output_queue_tailp = &sd->output_queue;
6529#ifdef CONFIG_RPS
6530		sd->csd.func = rps_trigger_softirq;
6531		sd->csd.info = sd;
6532		sd->csd.flags = 0;
6533		sd->cpu = i;
6534#endif
 
 
6535
 
6536		sd->backlog.poll = process_backlog;
6537		sd->backlog.weight = weight_p;
6538		sd->backlog.gro_list = NULL;
6539		sd->backlog.gro_count = 0;
6540	}
6541
6542	dev_boot_phase = 0;
6543
6544	/* The loopback device is special if any other network devices
6545	 * is present in a network namespace the loopback device must
6546	 * be present. Since we now dynamically allocate and free the
6547	 * loopback device ensure this invariant is maintained by
6548	 * keeping the loopback device as the first device on the
6549	 * list of network devices.  Ensuring the loopback devices
6550	 * is the first device that appears and the last network device
6551	 * that disappears.
6552	 */
6553	if (register_pernet_device(&loopback_net_ops))
6554		goto out;
6555
6556	if (register_pernet_device(&default_device_ops))
6557		goto out;
6558
6559	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6560	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6561
6562	hotcpu_notifier(dev_cpu_callback, 0);
6563	dst_init();
6564	dev_mcast_init();
6565	rc = 0;
6566out:
6567	return rc;
6568}
6569
6570subsys_initcall(net_dev_init);
6571
6572static int __init initialize_hashrnd(void)
6573{
6574	get_random_bytes(&hashrnd, sizeof(hashrnd));
6575	return 0;
6576}
6577
6578late_initcall_sync(initialize_hashrnd);
6579
v6.2
    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
 
 
 
 
 
    4 *
    5 *	Derived from the non IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitops.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/mm.h>
   81#include <linux/mutex.h>
   82#include <linux/rwsem.h>
   83#include <linux/string.h>
   84#include <linux/mm.h>
   85#include <linux/socket.h>
   86#include <linux/sockios.h>
   87#include <linux/errno.h>
   88#include <linux/interrupt.h>
   89#include <linux/if_ether.h>
   90#include <linux/netdevice.h>
   91#include <linux/etherdevice.h>
   92#include <linux/ethtool.h>
 
   93#include <linux/skbuff.h>
   94#include <linux/kthread.h>
   95#include <linux/bpf.h>
   96#include <linux/bpf_trace.h>
   97#include <net/net_namespace.h>
   98#include <net/sock.h>
   99#include <net/busy_poll.h>
  100#include <linux/rtnetlink.h>
 
 
  101#include <linux/stat.h>
  102#include <net/dsa.h>
  103#include <net/dst.h>
  104#include <net/dst_metadata.h>
  105#include <net/gro.h>
  106#include <net/pkt_sched.h>
  107#include <net/pkt_cls.h>
  108#include <net/checksum.h>
  109#include <net/xfrm.h>
  110#include <linux/highmem.h>
  111#include <linux/init.h>
 
  112#include <linux/module.h>
  113#include <linux/netpoll.h>
  114#include <linux/rcupdate.h>
  115#include <linux/delay.h>
 
  116#include <net/iw_handler.h>
  117#include <asm/current.h>
  118#include <linux/audit.h>
  119#include <linux/dmaengine.h>
  120#include <linux/err.h>
  121#include <linux/ctype.h>
  122#include <linux/if_arp.h>
  123#include <linux/if_vlan.h>
  124#include <linux/ip.h>
  125#include <net/ip.h>
  126#include <net/mpls.h>
  127#include <linux/ipv6.h>
  128#include <linux/in.h>
  129#include <linux/jhash.h>
  130#include <linux/random.h>
  131#include <trace/events/napi.h>
  132#include <trace/events/net.h>
  133#include <trace/events/skb.h>
  134#include <trace/events/qdisc.h>
  135#include <linux/inetdevice.h>
  136#include <linux/cpu_rmap.h>
 
  137#include <linux/static_key.h>
  138#include <linux/hashtable.h>
  139#include <linux/vmalloc.h>
  140#include <linux/if_macvlan.h>
  141#include <linux/errqueue.h>
  142#include <linux/hrtimer.h>
  143#include <linux/netfilter_netdev.h>
  144#include <linux/crash_dump.h>
  145#include <linux/sctp.h>
  146#include <net/udp_tunnel.h>
  147#include <linux/net_namespace.h>
  148#include <linux/indirect_call_wrapper.h>
  149#include <net/devlink.h>
  150#include <linux/pm_runtime.h>
  151#include <linux/prandom.h>
  152#include <linux/once_lite.h>
  153
  154#include "dev.h"
  155#include "net-sysfs.h"
  156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  157
  158static DEFINE_SPINLOCK(ptype_lock);
  159struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  160struct list_head ptype_all __read_mostly;	/* Taps */
  161
  162static int netif_rx_internal(struct sk_buff *skb);
  163static int call_netdevice_notifiers_info(unsigned long val,
  164					 struct netdev_notifier_info *info);
  165static int call_netdevice_notifiers_extack(unsigned long val,
  166					   struct net_device *dev,
  167					   struct netlink_ext_ack *extack);
  168static struct napi_struct *napi_by_id(unsigned int napi_id);
  169
  170/*
  171 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  172 * semaphore.
  173 *
  174 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
  175 *
  176 * Writers must hold the rtnl semaphore while they loop through the
  177 * dev_base_head list, and hold dev_base_lock for writing when they do the
  178 * actual updates.  This allows pure readers to access the list even
  179 * while a writer is preparing to update it.
  180 *
  181 * To put it another way, dev_base_lock is held for writing only to
  182 * protect against pure readers; the rtnl semaphore provides the
  183 * protection against other writers.
  184 *
  185 * See, for example usages, register_netdevice() and
  186 * unregister_netdevice(), which must be called with the rtnl
  187 * semaphore held.
  188 */
  189DEFINE_RWLOCK(dev_base_lock);
  190EXPORT_SYMBOL(dev_base_lock);
  191
  192static DEFINE_MUTEX(ifalias_mutex);
  193
  194/* protects napi_hash addition/deletion and napi_gen_id */
  195static DEFINE_SPINLOCK(napi_hash_lock);
  196
  197static unsigned int napi_gen_id = NR_CPUS;
  198static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  199
  200static DECLARE_RWSEM(devnet_rename_sem);
  201
  202static inline void dev_base_seq_inc(struct net *net)
  203{
  204	while (++net->dev_base_seq == 0)
  205		;
  206}
  207
  208static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  209{
  210	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  211
  212	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  213}
  214
  215static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  216{
  217	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  218}
  219
  220static inline void rps_lock_irqsave(struct softnet_data *sd,
  221				    unsigned long *flags)
  222{
  223	if (IS_ENABLED(CONFIG_RPS))
  224		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
  225	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  226		local_irq_save(*flags);
  227}
  228
  229static inline void rps_lock_irq_disable(struct softnet_data *sd)
  230{
  231	if (IS_ENABLED(CONFIG_RPS))
  232		spin_lock_irq(&sd->input_pkt_queue.lock);
  233	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  234		local_irq_disable();
  235}
  236
  237static inline void rps_unlock_irq_restore(struct softnet_data *sd,
  238					  unsigned long *flags)
  239{
  240	if (IS_ENABLED(CONFIG_RPS))
  241		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
  242	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  243		local_irq_restore(*flags);
  244}
  245
  246static inline void rps_unlock_irq_enable(struct softnet_data *sd)
  247{
  248	if (IS_ENABLED(CONFIG_RPS))
  249		spin_unlock_irq(&sd->input_pkt_queue.lock);
  250	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  251		local_irq_enable();
  252}
  253
  254static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
  255						       const char *name)
  256{
  257	struct netdev_name_node *name_node;
  258
  259	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
  260	if (!name_node)
  261		return NULL;
  262	INIT_HLIST_NODE(&name_node->hlist);
  263	name_node->dev = dev;
  264	name_node->name = name;
  265	return name_node;
  266}
  267
  268static struct netdev_name_node *
  269netdev_name_node_head_alloc(struct net_device *dev)
  270{
  271	struct netdev_name_node *name_node;
  272
  273	name_node = netdev_name_node_alloc(dev, dev->name);
  274	if (!name_node)
  275		return NULL;
  276	INIT_LIST_HEAD(&name_node->list);
  277	return name_node;
  278}
  279
  280static void netdev_name_node_free(struct netdev_name_node *name_node)
  281{
  282	kfree(name_node);
  283}
  284
  285static void netdev_name_node_add(struct net *net,
  286				 struct netdev_name_node *name_node)
  287{
  288	hlist_add_head_rcu(&name_node->hlist,
  289			   dev_name_hash(net, name_node->name));
  290}
  291
  292static void netdev_name_node_del(struct netdev_name_node *name_node)
  293{
  294	hlist_del_rcu(&name_node->hlist);
  295}
  296
  297static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
  298							const char *name)
  299{
  300	struct hlist_head *head = dev_name_hash(net, name);
  301	struct netdev_name_node *name_node;
  302
  303	hlist_for_each_entry(name_node, head, hlist)
  304		if (!strcmp(name_node->name, name))
  305			return name_node;
  306	return NULL;
  307}
  308
  309static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
  310							    const char *name)
  311{
  312	struct hlist_head *head = dev_name_hash(net, name);
  313	struct netdev_name_node *name_node;
  314
  315	hlist_for_each_entry_rcu(name_node, head, hlist)
  316		if (!strcmp(name_node->name, name))
  317			return name_node;
  318	return NULL;
  319}
  320
  321bool netdev_name_in_use(struct net *net, const char *name)
  322{
  323	return netdev_name_node_lookup(net, name);
  324}
  325EXPORT_SYMBOL(netdev_name_in_use);
  326
  327int netdev_name_node_alt_create(struct net_device *dev, const char *name)
  328{
  329	struct netdev_name_node *name_node;
  330	struct net *net = dev_net(dev);
  331
  332	name_node = netdev_name_node_lookup(net, name);
  333	if (name_node)
  334		return -EEXIST;
  335	name_node = netdev_name_node_alloc(dev, name);
  336	if (!name_node)
  337		return -ENOMEM;
  338	netdev_name_node_add(net, name_node);
  339	/* The node that holds dev->name acts as a head of per-device list. */
  340	list_add_tail(&name_node->list, &dev->name_node->list);
  341
  342	return 0;
  343}
  344
  345static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
  346{
  347	list_del(&name_node->list);
  348	netdev_name_node_del(name_node);
  349	kfree(name_node->name);
  350	netdev_name_node_free(name_node);
  351}
  352
  353int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  354{
  355	struct netdev_name_node *name_node;
  356	struct net *net = dev_net(dev);
  357
  358	name_node = netdev_name_node_lookup(net, name);
  359	if (!name_node)
  360		return -ENOENT;
  361	/* lookup might have found our primary name or a name belonging
  362	 * to another device.
  363	 */
  364	if (name_node == dev->name_node || name_node->dev != dev)
  365		return -EINVAL;
  366
  367	__netdev_name_node_alt_destroy(name_node);
  368
  369	return 0;
  370}
  371
  372static void netdev_name_node_alt_flush(struct net_device *dev)
  373{
  374	struct netdev_name_node *name_node, *tmp;
  375
  376	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
  377		__netdev_name_node_alt_destroy(name_node);
  378}
  379
  380/* Device list insertion */
  381static void list_netdevice(struct net_device *dev)
  382{
  383	struct net *net = dev_net(dev);
  384
  385	ASSERT_RTNL();
  386
  387	write_lock(&dev_base_lock);
  388	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  389	netdev_name_node_add(net, dev->name_node);
  390	hlist_add_head_rcu(&dev->index_hlist,
  391			   dev_index_hash(net, dev->ifindex));
  392	write_unlock(&dev_base_lock);
  393
  394	dev_base_seq_inc(net);
 
 
  395}
  396
  397/* Device list removal
  398 * caller must respect a RCU grace period before freeing/reusing dev
  399 */
  400static void unlist_netdevice(struct net_device *dev, bool lock)
  401{
  402	ASSERT_RTNL();
  403
  404	/* Unlink dev from the device chain */
  405	if (lock)
  406		write_lock(&dev_base_lock);
  407	list_del_rcu(&dev->dev_list);
  408	netdev_name_node_del(dev->name_node);
  409	hlist_del_rcu(&dev->index_hlist);
  410	if (lock)
  411		write_unlock(&dev_base_lock);
  412
  413	dev_base_seq_inc(dev_net(dev));
  414}
  415
  416/*
  417 *	Our notifier list
  418 */
  419
  420static RAW_NOTIFIER_HEAD(netdev_chain);
  421
  422/*
  423 *	Device drivers call our routines to queue packets here. We empty the
  424 *	queue in the local softnet handler.
  425 */
  426
  427DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  428EXPORT_PER_CPU_SYMBOL(softnet_data);
  429
  430#ifdef CONFIG_LOCKDEP
  431/*
  432 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  433 * according to dev->type
  434 */
  435static const unsigned short netdev_lock_type[] = {
  436	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
  437	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
  438	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
  439	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
  440	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
  441	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
  442	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
  443	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
  444	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
  445	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
  446	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
  447	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
  448	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
  449	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
  450	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
  451
  452static const char *const netdev_lock_name[] = {
  453	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
  454	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
  455	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
  456	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
  457	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
  458	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
  459	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
  460	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
  461	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
  462	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
  463	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
  464	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
  465	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
  466	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
  467	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
  468
  469static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  470static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
  471
  472static inline unsigned short netdev_lock_pos(unsigned short dev_type)
  473{
  474	int i;
  475
  476	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
  477		if (netdev_lock_type[i] == dev_type)
  478			return i;
  479	/* the last key is used by default */
  480	return ARRAY_SIZE(netdev_lock_type) - 1;
  481}
  482
  483static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  484						 unsigned short dev_type)
  485{
  486	int i;
  487
  488	i = netdev_lock_pos(dev_type);
  489	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
  490				   netdev_lock_name[i]);
  491}
  492
  493static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  494{
  495	int i;
  496
  497	i = netdev_lock_pos(dev->type);
  498	lockdep_set_class_and_name(&dev->addr_list_lock,
  499				   &netdev_addr_lock_key[i],
  500				   netdev_lock_name[i]);
  501}
  502#else
  503static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  504						 unsigned short dev_type)
  505{
  506}
  507
  508static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  509{
  510}
  511#endif
  512
  513/*******************************************************************************
  514 *
  515 *		Protocol management and registration routines
  516 *
  517 *******************************************************************************/
  518
 
 
 
  519
  520/*
  521 *	Add a protocol ID to the list. Now that the input handler is
  522 *	smarter we can dispense with all the messy stuff that used to be
  523 *	here.
  524 *
  525 *	BEWARE!!! Protocol handlers, mangling input packets,
  526 *	MUST BE last in hash buckets and checking protocol handlers
  527 *	MUST start from promiscuous ptype_all chain in net_bh.
  528 *	It is true now, do not change it.
  529 *	Explanation follows: if protocol handler, mangling packet, will
  530 *	be the first on list, it is not able to sense, that packet
  531 *	is cloned and should be copied-on-write, so that it will
  532 *	change it and subsequent readers will get broken packet.
  533 *							--ANK (980803)
  534 */
  535
  536static inline struct list_head *ptype_head(const struct packet_type *pt)
  537{
  538	if (pt->type == htons(ETH_P_ALL))
  539		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
  540	else
  541		return pt->dev ? &pt->dev->ptype_specific :
  542				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  543}
  544
  545/**
  546 *	dev_add_pack - add packet handler
  547 *	@pt: packet type declaration
  548 *
  549 *	Add a protocol handler to the networking stack. The passed &packet_type
  550 *	is linked into kernel lists and may not be freed until it has been
  551 *	removed from the kernel lists.
  552 *
  553 *	This call does not sleep therefore it can not
  554 *	guarantee all CPU's that are in middle of receiving packets
  555 *	will see the new packet type (until the next received packet).
  556 */
  557
  558void dev_add_pack(struct packet_type *pt)
  559{
  560	struct list_head *head = ptype_head(pt);
  561
  562	spin_lock(&ptype_lock);
  563	list_add_rcu(&pt->list, head);
  564	spin_unlock(&ptype_lock);
  565}
  566EXPORT_SYMBOL(dev_add_pack);
  567
  568/**
  569 *	__dev_remove_pack	 - remove packet handler
  570 *	@pt: packet type declaration
  571 *
  572 *	Remove a protocol handler that was previously added to the kernel
  573 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  574 *	from the kernel lists and can be freed or reused once this function
  575 *	returns.
  576 *
  577 *      The packet type might still be in use by receivers
  578 *	and must not be freed until after all the CPU's have gone
  579 *	through a quiescent state.
  580 */
  581void __dev_remove_pack(struct packet_type *pt)
  582{
  583	struct list_head *head = ptype_head(pt);
  584	struct packet_type *pt1;
  585
  586	spin_lock(&ptype_lock);
  587
  588	list_for_each_entry(pt1, head, list) {
  589		if (pt == pt1) {
  590			list_del_rcu(&pt->list);
  591			goto out;
  592		}
  593	}
  594
  595	pr_warn("dev_remove_pack: %p not found\n", pt);
  596out:
  597	spin_unlock(&ptype_lock);
  598}
  599EXPORT_SYMBOL(__dev_remove_pack);
  600
  601/**
  602 *	dev_remove_pack	 - remove packet handler
  603 *	@pt: packet type declaration
  604 *
  605 *	Remove a protocol handler that was previously added to the kernel
  606 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  607 *	from the kernel lists and can be freed or reused once this function
  608 *	returns.
  609 *
  610 *	This call sleeps to guarantee that no CPU is looking at the packet
  611 *	type after return.
  612 */
  613void dev_remove_pack(struct packet_type *pt)
  614{
  615	__dev_remove_pack(pt);
  616
  617	synchronize_net();
  618}
  619EXPORT_SYMBOL(dev_remove_pack);
  620
 
 
 
  621
  622/*******************************************************************************
  623 *
  624 *			    Device Interface Subroutines
  625 *
  626 *******************************************************************************/
  627
  628/**
  629 *	dev_get_iflink	- get 'iflink' value of a interface
  630 *	@dev: targeted interface
 
  631 *
  632 *	Indicates the ifindex the interface is linked to.
  633 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 
  634 */
 
 
 
 
  635
  636int dev_get_iflink(const struct net_device *dev)
  637{
  638	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  639		return dev->netdev_ops->ndo_get_iflink(dev);
 
 
 
 
 
  640
  641	return dev->ifindex;
  642}
  643EXPORT_SYMBOL(dev_get_iflink);
  644
  645/**
  646 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  647 *	@dev: targeted interface
  648 *	@skb: The packet.
  649 *
  650 *	For better visibility of tunnel traffic OVS needs to retrieve
  651 *	egress tunnel information for a packet. Following API allows
  652 *	user to get this info.
 
  653 */
  654int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  655{
  656	struct ip_tunnel_info *info;
 
  657
  658	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  659		return -EINVAL;
  660
  661	info = skb_tunnel_info_unclone(skb);
  662	if (!info)
  663		return -ENOMEM;
  664	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  665		return -EINVAL;
 
 
 
 
 
  666
  667	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  668}
  669EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  670
  671static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
 
 
 
 
 
 
 
 
 
 
  672{
  673	int k = stack->num_paths++;
 
 
 
 
  674
  675	if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
  676		return NULL;
 
 
 
 
  677
  678	return &stack->path[k];
 
 
 
  679}
  680
  681int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
  682			  struct net_device_path_stack *stack)
 
 
  683{
  684	const struct net_device *last_dev;
  685	struct net_device_path_ctx ctx = {
  686		.dev	= dev,
  687	};
  688	struct net_device_path *path;
  689	int ret = 0;
  690
  691	memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
  692	stack->num_paths = 0;
  693	while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
  694		last_dev = ctx.dev;
  695		path = dev_fwd_path(stack);
  696		if (!path)
  697			return -1;
 
 
 
 
 
 
 
  698
  699		memset(path, 0, sizeof(struct net_device_path));
  700		ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
  701		if (ret < 0)
  702			return -1;
  703
  704		if (WARN_ON_ONCE(last_dev == ctx.dev))
  705			return -1;
  706	}
  707
  708	if (!ctx.dev)
  709		return ret;
  710
  711	path = dev_fwd_path(stack);
  712	if (!path)
  713		return -1;
  714	path->type = DEV_PATH_ETHERNET;
  715	path->dev = ctx.dev;
  716
  717	return ret;
  718}
  719EXPORT_SYMBOL_GPL(dev_fill_forward_path);
  720
  721/**
  722 *	__dev_get_by_name	- find a device by its name
  723 *	@net: the applicable net namespace
  724 *	@name: name to find
  725 *
  726 *	Find an interface by name. Must be called under RTNL semaphore
  727 *	or @dev_base_lock. If the name is found a pointer to the device
  728 *	is returned. If the name is not found then %NULL is returned. The
  729 *	reference counters are not incremented so the caller must be
  730 *	careful with locks.
  731 */
  732
  733struct net_device *__dev_get_by_name(struct net *net, const char *name)
  734{
  735	struct netdev_name_node *node_name;
 
 
  736
  737	node_name = netdev_name_node_lookup(net, name);
  738	return node_name ? node_name->dev : NULL;
 
 
 
  739}
  740EXPORT_SYMBOL(__dev_get_by_name);
  741
  742/**
  743 * dev_get_by_name_rcu	- find a device by its name
  744 * @net: the applicable net namespace
  745 * @name: name to find
  746 *
  747 * Find an interface by name.
  748 * If the name is found a pointer to the device is returned.
  749 * If the name is not found then %NULL is returned.
  750 * The reference counters are not incremented so the caller must be
  751 * careful with locks. The caller must hold RCU lock.
  752 */
  753
  754struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  755{
  756	struct netdev_name_node *node_name;
 
 
 
 
 
 
  757
  758	node_name = netdev_name_node_lookup_rcu(net, name);
  759	return node_name ? node_name->dev : NULL;
  760}
  761EXPORT_SYMBOL(dev_get_by_name_rcu);
  762
  763/**
  764 *	dev_get_by_name		- find a device by its name
  765 *	@net: the applicable net namespace
  766 *	@name: name to find
  767 *
  768 *	Find an interface by name. This can be called from any
  769 *	context and does its own locking. The returned handle has
  770 *	the usage count incremented and the caller must use dev_put() to
  771 *	release it when it is no longer needed. %NULL is returned if no
  772 *	matching device is found.
  773 */
  774
  775struct net_device *dev_get_by_name(struct net *net, const char *name)
  776{
  777	struct net_device *dev;
  778
  779	rcu_read_lock();
  780	dev = dev_get_by_name_rcu(net, name);
  781	dev_hold(dev);
 
  782	rcu_read_unlock();
  783	return dev;
  784}
  785EXPORT_SYMBOL(dev_get_by_name);
  786
  787/**
  788 *	__dev_get_by_index - find a device by its ifindex
  789 *	@net: the applicable net namespace
  790 *	@ifindex: index of device
  791 *
  792 *	Search for an interface by index. Returns %NULL if the device
  793 *	is not found or a pointer to the device. The device has not
  794 *	had its reference counter increased so the caller must be careful
  795 *	about locking. The caller must hold either the RTNL semaphore
  796 *	or @dev_base_lock.
  797 */
  798
  799struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  800{
 
  801	struct net_device *dev;
  802	struct hlist_head *head = dev_index_hash(net, ifindex);
  803
  804	hlist_for_each_entry(dev, head, index_hlist)
  805		if (dev->ifindex == ifindex)
  806			return dev;
  807
  808	return NULL;
  809}
  810EXPORT_SYMBOL(__dev_get_by_index);
  811
  812/**
  813 *	dev_get_by_index_rcu - find a device by its ifindex
  814 *	@net: the applicable net namespace
  815 *	@ifindex: index of device
  816 *
  817 *	Search for an interface by index. Returns %NULL if the device
  818 *	is not found or a pointer to the device. The device has not
  819 *	had its reference counter increased so the caller must be careful
  820 *	about locking. The caller must hold RCU lock.
  821 */
  822
  823struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  824{
 
  825	struct net_device *dev;
  826	struct hlist_head *head = dev_index_hash(net, ifindex);
  827
  828	hlist_for_each_entry_rcu(dev, head, index_hlist)
  829		if (dev->ifindex == ifindex)
  830			return dev;
  831
  832	return NULL;
  833}
  834EXPORT_SYMBOL(dev_get_by_index_rcu);
  835
  836
  837/**
  838 *	dev_get_by_index - find a device by its ifindex
  839 *	@net: the applicable net namespace
  840 *	@ifindex: index of device
  841 *
  842 *	Search for an interface by index. Returns NULL if the device
  843 *	is not found or a pointer to the device. The device returned has
  844 *	had a reference added and the pointer is safe until the user calls
  845 *	dev_put to indicate they have finished with it.
  846 */
  847
  848struct net_device *dev_get_by_index(struct net *net, int ifindex)
  849{
  850	struct net_device *dev;
  851
  852	rcu_read_lock();
  853	dev = dev_get_by_index_rcu(net, ifindex);
  854	dev_hold(dev);
 
  855	rcu_read_unlock();
  856	return dev;
  857}
  858EXPORT_SYMBOL(dev_get_by_index);
  859
  860/**
  861 *	dev_get_by_napi_id - find a device by napi_id
  862 *	@napi_id: ID of the NAPI struct
  863 *
  864 *	Search for an interface by NAPI ID. Returns %NULL if the device
  865 *	is not found or a pointer to the device. The device has not had
  866 *	its reference counter increased so the caller must be careful
  867 *	about locking. The caller must hold RCU lock.
  868 */
  869
  870struct net_device *dev_get_by_napi_id(unsigned int napi_id)
  871{
  872	struct napi_struct *napi;
  873
  874	WARN_ON_ONCE(!rcu_read_lock_held());
  875
  876	if (napi_id < MIN_NAPI_ID)
  877		return NULL;
  878
  879	napi = napi_by_id(napi_id);
  880
  881	return napi ? napi->dev : NULL;
  882}
  883EXPORT_SYMBOL(dev_get_by_napi_id);
  884
  885/**
  886 *	netdev_get_name - get a netdevice name, knowing its ifindex.
  887 *	@net: network namespace
  888 *	@name: a pointer to the buffer where the name will be stored.
  889 *	@ifindex: the ifindex of the interface to get the name from.
  890 */
  891int netdev_get_name(struct net *net, char *name, int ifindex)
  892{
  893	struct net_device *dev;
  894	int ret;
  895
  896	down_read(&devnet_rename_sem);
  897	rcu_read_lock();
  898
  899	dev = dev_get_by_index_rcu(net, ifindex);
  900	if (!dev) {
  901		ret = -ENODEV;
  902		goto out;
  903	}
  904
  905	strcpy(name, dev->name);
  906
  907	ret = 0;
  908out:
  909	rcu_read_unlock();
  910	up_read(&devnet_rename_sem);
  911	return ret;
  912}
  913
  914/**
  915 *	dev_getbyhwaddr_rcu - find a device by its hardware address
  916 *	@net: the applicable net namespace
  917 *	@type: media type of device
  918 *	@ha: hardware address
  919 *
  920 *	Search for an interface by MAC address. Returns NULL if the device
  921 *	is not found or a pointer to the device.
  922 *	The caller must hold RCU or RTNL.
  923 *	The returned device has not had its ref count increased
  924 *	and the caller must therefore be careful about locking
  925 *
  926 */
  927
  928struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
  929				       const char *ha)
  930{
  931	struct net_device *dev;
  932
  933	for_each_netdev_rcu(net, dev)
  934		if (dev->type == type &&
  935		    !memcmp(dev->dev_addr, ha, dev->addr_len))
  936			return dev;
  937
  938	return NULL;
  939}
  940EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
  941
 
 
 
 
 
 
 
 
 
 
 
 
 
  942struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
  943{
  944	struct net_device *dev, *ret = NULL;
  945
  946	rcu_read_lock();
  947	for_each_netdev_rcu(net, dev)
  948		if (dev->type == type) {
  949			dev_hold(dev);
  950			ret = dev;
  951			break;
  952		}
  953	rcu_read_unlock();
  954	return ret;
  955}
  956EXPORT_SYMBOL(dev_getfirstbyhwtype);
  957
  958/**
  959 *	__dev_get_by_flags - find any device with given flags
  960 *	@net: the applicable net namespace
  961 *	@if_flags: IFF_* values
  962 *	@mask: bitmask of bits in if_flags to check
  963 *
  964 *	Search for any interface with the given flags. Returns NULL if a device
  965 *	is not found or a pointer to the device. Must be called inside
  966 *	rtnl_lock(), and result refcount is unchanged.
  967 */
  968
  969struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
  970				      unsigned short mask)
  971{
  972	struct net_device *dev, *ret;
  973
  974	ASSERT_RTNL();
  975
  976	ret = NULL;
  977	for_each_netdev(net, dev) {
  978		if (((dev->flags ^ if_flags) & mask) == 0) {
  979			ret = dev;
  980			break;
  981		}
  982	}
  983	return ret;
  984}
  985EXPORT_SYMBOL(__dev_get_by_flags);
  986
  987/**
  988 *	dev_valid_name - check if name is okay for network device
  989 *	@name: name string
  990 *
  991 *	Network device names need to be valid file names to
  992 *	allow sysfs to work.  We also disallow any kind of
  993 *	whitespace.
  994 */
  995bool dev_valid_name(const char *name)
  996{
  997	if (*name == '\0')
  998		return false;
  999	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 1000		return false;
 1001	if (!strcmp(name, ".") || !strcmp(name, ".."))
 1002		return false;
 1003
 1004	while (*name) {
 1005		if (*name == '/' || *name == ':' || isspace(*name))
 1006			return false;
 1007		name++;
 1008	}
 1009	return true;
 1010}
 1011EXPORT_SYMBOL(dev_valid_name);
 1012
 1013/**
 1014 *	__dev_alloc_name - allocate a name for a device
 1015 *	@net: network namespace to allocate the device name in
 1016 *	@name: name format string
 1017 *	@buf:  scratch buffer and result name string
 1018 *
 1019 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1020 *	id. It scans list of devices to build up a free map, then chooses
 1021 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1022 *	while allocating the name and adding the device in order to avoid
 1023 *	duplicates.
 1024 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1025 *	Returns the number of the unit assigned or a negative errno code.
 1026 */
 1027
 1028static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 1029{
 1030	int i = 0;
 1031	const char *p;
 1032	const int max_netdevices = 8*PAGE_SIZE;
 1033	unsigned long *inuse;
 1034	struct net_device *d;
 1035
 1036	if (!dev_valid_name(name))
 1037		return -EINVAL;
 1038
 1039	p = strchr(name, '%');
 1040	if (p) {
 1041		/*
 1042		 * Verify the string as this thing may have come from
 1043		 * the user.  There must be either one "%d" and no other "%"
 1044		 * characters.
 1045		 */
 1046		if (p[1] != 'd' || strchr(p + 2, '%'))
 1047			return -EINVAL;
 1048
 1049		/* Use one page as a bit array of possible slots */
 1050		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 1051		if (!inuse)
 1052			return -ENOMEM;
 1053
 1054		for_each_netdev(net, d) {
 1055			struct netdev_name_node *name_node;
 1056			list_for_each_entry(name_node, &d->name_node->list, list) {
 1057				if (!sscanf(name_node->name, name, &i))
 1058					continue;
 1059				if (i < 0 || i >= max_netdevices)
 1060					continue;
 1061
 1062				/*  avoid cases where sscanf is not exact inverse of printf */
 1063				snprintf(buf, IFNAMSIZ, name, i);
 1064				if (!strncmp(buf, name_node->name, IFNAMSIZ))
 1065					__set_bit(i, inuse);
 1066			}
 1067			if (!sscanf(d->name, name, &i))
 1068				continue;
 1069			if (i < 0 || i >= max_netdevices)
 1070				continue;
 1071
 1072			/*  avoid cases where sscanf is not exact inverse of printf */
 1073			snprintf(buf, IFNAMSIZ, name, i);
 1074			if (!strncmp(buf, d->name, IFNAMSIZ))
 1075				__set_bit(i, inuse);
 1076		}
 1077
 1078		i = find_first_zero_bit(inuse, max_netdevices);
 1079		free_page((unsigned long) inuse);
 1080	}
 1081
 1082	snprintf(buf, IFNAMSIZ, name, i);
 1083	if (!netdev_name_in_use(net, buf))
 
 1084		return i;
 1085
 1086	/* It is possible to run out of possible slots
 1087	 * when the name is long and there isn't enough space left
 1088	 * for the digits, or if all bits are used.
 1089	 */
 1090	return -ENFILE;
 1091}
 1092
 1093static int dev_alloc_name_ns(struct net *net,
 1094			     struct net_device *dev,
 1095			     const char *name)
 1096{
 1097	char buf[IFNAMSIZ];
 1098	int ret;
 1099
 1100	BUG_ON(!net);
 1101	ret = __dev_alloc_name(net, name, buf);
 1102	if (ret >= 0)
 1103		strscpy(dev->name, buf, IFNAMSIZ);
 1104	return ret;
 1105}
 1106
 1107/**
 1108 *	dev_alloc_name - allocate a name for a device
 1109 *	@dev: device
 1110 *	@name: name format string
 1111 *
 1112 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1113 *	id. It scans list of devices to build up a free map, then chooses
 1114 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1115 *	while allocating the name and adding the device in order to avoid
 1116 *	duplicates.
 1117 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1118 *	Returns the number of the unit assigned or a negative errno code.
 1119 */
 1120
 1121int dev_alloc_name(struct net_device *dev, const char *name)
 1122{
 1123	return dev_alloc_name_ns(dev_net(dev), dev, name);
 
 
 
 
 
 
 
 
 
 1124}
 1125EXPORT_SYMBOL(dev_alloc_name);
 1126
 1127static int dev_get_valid_name(struct net *net, struct net_device *dev,
 1128			      const char *name)
 1129{
 1130	BUG_ON(!net);
 
 
 
 1131
 1132	if (!dev_valid_name(name))
 1133		return -EINVAL;
 1134
 1135	if (strchr(name, '%'))
 1136		return dev_alloc_name_ns(net, dev, name);
 1137	else if (netdev_name_in_use(net, name))
 1138		return -EEXIST;
 1139	else if (dev->name != name)
 1140		strscpy(dev->name, name, IFNAMSIZ);
 1141
 1142	return 0;
 1143}
 1144
 1145/**
 1146 *	dev_change_name - change name of a device
 1147 *	@dev: device
 1148 *	@newname: name (or format string) must be at least IFNAMSIZ
 1149 *
 1150 *	Change name of a device, can pass format strings "eth%d".
 1151 *	for wildcarding.
 1152 */
 1153int dev_change_name(struct net_device *dev, const char *newname)
 1154{
 1155	unsigned char old_assign_type;
 1156	char oldname[IFNAMSIZ];
 1157	int err = 0;
 1158	int ret;
 1159	struct net *net;
 1160
 1161	ASSERT_RTNL();
 1162	BUG_ON(!dev_net(dev));
 1163
 1164	net = dev_net(dev);
 
 
 1165
 1166	down_write(&devnet_rename_sem);
 1167
 1168	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
 1169		up_write(&devnet_rename_sem);
 1170		return 0;
 1171	}
 1172
 1173	memcpy(oldname, dev->name, IFNAMSIZ);
 1174
 1175	err = dev_get_valid_name(net, dev, newname);
 1176	if (err < 0) {
 1177		up_write(&devnet_rename_sem);
 1178		return err;
 1179	}
 1180
 1181	if (oldname[0] && !strchr(oldname, '%'))
 1182		netdev_info(dev, "renamed from %s%s\n", oldname,
 1183			    dev->flags & IFF_UP ? " (while UP)" : "");
 1184
 1185	old_assign_type = dev->name_assign_type;
 1186	dev->name_assign_type = NET_NAME_RENAMED;
 1187
 1188rollback:
 1189	ret = device_rename(&dev->dev, dev->name);
 1190	if (ret) {
 1191		memcpy(dev->name, oldname, IFNAMSIZ);
 1192		dev->name_assign_type = old_assign_type;
 1193		up_write(&devnet_rename_sem);
 1194		return ret;
 1195	}
 1196
 1197	up_write(&devnet_rename_sem);
 1198
 1199	netdev_adjacent_rename_links(dev, oldname);
 1200
 1201	write_lock(&dev_base_lock);
 1202	netdev_name_node_del(dev->name_node);
 1203	write_unlock(&dev_base_lock);
 1204
 1205	synchronize_rcu();
 1206
 1207	write_lock(&dev_base_lock);
 1208	netdev_name_node_add(net, dev->name_node);
 1209	write_unlock(&dev_base_lock);
 1210
 1211	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 1212	ret = notifier_to_errno(ret);
 1213
 1214	if (ret) {
 1215		/* err >= 0 after dev_alloc_name() or stores the first errno */
 1216		if (err >= 0) {
 1217			err = ret;
 1218			down_write(&devnet_rename_sem);
 1219			memcpy(dev->name, oldname, IFNAMSIZ);
 1220			memcpy(oldname, newname, IFNAMSIZ);
 1221			dev->name_assign_type = old_assign_type;
 1222			old_assign_type = NET_NAME_RENAMED;
 1223			goto rollback;
 1224		} else {
 1225			netdev_err(dev, "name change rollback failed: %d\n",
 1226				   ret);
 1227		}
 1228	}
 1229
 1230	return err;
 1231}
 1232
 1233/**
 1234 *	dev_set_alias - change ifalias of a device
 1235 *	@dev: device
 1236 *	@alias: name up to IFALIASZ
 1237 *	@len: limit of bytes to copy from info
 1238 *
 1239 *	Set ifalias for a device,
 1240 */
 1241int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1242{
 1243	struct dev_ifalias *new_alias = NULL;
 
 
 1244
 1245	if (len >= IFALIASZ)
 1246		return -EINVAL;
 1247
 1248	if (len) {
 1249		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1250		if (!new_alias)
 1251			return -ENOMEM;
 1252
 1253		memcpy(new_alias->ifalias, alias, len);
 1254		new_alias->ifalias[len] = 0;
 1255	}
 1256
 1257	mutex_lock(&ifalias_mutex);
 1258	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
 1259					mutex_is_locked(&ifalias_mutex));
 1260	mutex_unlock(&ifalias_mutex);
 1261
 1262	if (new_alias)
 1263		kfree_rcu(new_alias, rcuhead);
 1264
 
 1265	return len;
 1266}
 1267EXPORT_SYMBOL(dev_set_alias);
 1268
 1269/**
 1270 *	dev_get_alias - get ifalias of a device
 1271 *	@dev: device
 1272 *	@name: buffer to store name of ifalias
 1273 *	@len: size of buffer
 1274 *
 1275 *	get ifalias for a device.  Caller must make sure dev cannot go
 1276 *	away,  e.g. rcu read lock or own a reference count to device.
 1277 */
 1278int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1279{
 1280	const struct dev_ifalias *alias;
 1281	int ret = 0;
 1282
 1283	rcu_read_lock();
 1284	alias = rcu_dereference(dev->ifalias);
 1285	if (alias)
 1286		ret = snprintf(name, len, "%s", alias->ifalias);
 1287	rcu_read_unlock();
 1288
 1289	return ret;
 1290}
 1291
 1292/**
 1293 *	netdev_features_change - device changes features
 1294 *	@dev: device to cause notification
 1295 *
 1296 *	Called to indicate a device has changed features.
 1297 */
 1298void netdev_features_change(struct net_device *dev)
 1299{
 1300	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 1301}
 1302EXPORT_SYMBOL(netdev_features_change);
 1303
 1304/**
 1305 *	netdev_state_change - device changes state
 1306 *	@dev: device to cause notification
 1307 *
 1308 *	Called to indicate a device has changed state. This function calls
 1309 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1310 *	to the routing socket.
 1311 */
 1312void netdev_state_change(struct net_device *dev)
 1313{
 1314	if (dev->flags & IFF_UP) {
 1315		struct netdev_notifier_change_info change_info = {
 1316			.info.dev = dev,
 1317		};
 1318
 1319		call_netdevice_notifiers_info(NETDEV_CHANGE,
 1320					      &change_info.info);
 1321		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
 1322	}
 1323}
 1324EXPORT_SYMBOL(netdev_state_change);
 1325
 1326/**
 1327 * __netdev_notify_peers - notify network peers about existence of @dev,
 1328 * to be called when rtnl lock is already held.
 1329 * @dev: network device
 1330 *
 1331 * Generate traffic such that interested network peers are aware of
 1332 * @dev, such as by generating a gratuitous ARP. This may be used when
 1333 * a device wants to inform the rest of the network about some sort of
 1334 * reconfiguration such as a failover event or virtual machine
 1335 * migration.
 1336 */
 1337void __netdev_notify_peers(struct net_device *dev)
 1338{
 1339	ASSERT_RTNL();
 1340	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 1341	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
 1342}
 1343EXPORT_SYMBOL(__netdev_notify_peers);
 1344
 1345/**
 1346 * netdev_notify_peers - notify network peers about existence of @dev
 1347 * @dev: network device
 
 1348 *
 1349 * Generate traffic such that interested network peers are aware of
 1350 * @dev, such as by generating a gratuitous ARP. This may be used when
 1351 * a device wants to inform the rest of the network about some sort of
 1352 * reconfiguration such as a failover event or virtual machine
 1353 * migration.
 1354 */
 1355void netdev_notify_peers(struct net_device *dev)
 
 1356{
 1357	rtnl_lock();
 1358	__netdev_notify_peers(dev);
 1359	rtnl_unlock();
 1360}
 1361EXPORT_SYMBOL(netdev_notify_peers);
 1362
 1363static int napi_threaded_poll(void *data);
 
 
 1364
 1365static int napi_kthread_create(struct napi_struct *n)
 1366{
 1367	int err = 0;
 1368
 1369	/* Create and wake up the kthread once to put it in
 1370	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
 1371	 * warning and work with loadavg.
 1372	 */
 1373	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
 1374				n->dev->name, n->napi_id);
 1375	if (IS_ERR(n->thread)) {
 1376		err = PTR_ERR(n->thread);
 1377		pr_err("kthread_run failed with err %d\n", err);
 1378		n->thread = NULL;
 1379	}
 1380
 1381	return err;
 1382}
 
 1383
 1384static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1385{
 1386	const struct net_device_ops *ops = dev->netdev_ops;
 1387	int ret;
 1388
 1389	ASSERT_RTNL();
 1390	dev_addr_check(dev);
 1391
 1392	if (!netif_device_present(dev)) {
 1393		/* may be detached because parent is runtime-suspended */
 1394		if (dev->dev.parent)
 1395			pm_runtime_resume(dev->dev.parent);
 1396		if (!netif_device_present(dev))
 1397			return -ENODEV;
 1398	}
 1399
 1400	/* Block netpoll from trying to do any rx path servicing.
 1401	 * If we don't do this there is a chance ndo_poll_controller
 1402	 * or ndo_poll may be running while we open the device
 1403	 */
 1404	netpoll_poll_disable(dev);
 1405
 1406	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1407	ret = notifier_to_errno(ret);
 1408	if (ret)
 1409		return ret;
 1410
 1411	set_bit(__LINK_STATE_START, &dev->state);
 1412
 1413	if (ops->ndo_validate_addr)
 1414		ret = ops->ndo_validate_addr(dev);
 1415
 1416	if (!ret && ops->ndo_open)
 1417		ret = ops->ndo_open(dev);
 1418
 1419	netpoll_poll_enable(dev);
 1420
 1421	if (ret)
 1422		clear_bit(__LINK_STATE_START, &dev->state);
 1423	else {
 1424		dev->flags |= IFF_UP;
 
 1425		dev_set_rx_mode(dev);
 1426		dev_activate(dev);
 1427		add_device_randomness(dev->dev_addr, dev->addr_len);
 1428	}
 1429
 1430	return ret;
 1431}
 1432
 1433/**
 1434 *	dev_open	- prepare an interface for use.
 1435 *	@dev: device to open
 1436 *	@extack: netlink extended ack
 1437 *
 1438 *	Takes a device from down to up state. The device's private open
 1439 *	function is invoked and then the multicast lists are loaded. Finally
 1440 *	the device is moved into the up state and a %NETDEV_UP message is
 1441 *	sent to the netdev notifier chain.
 1442 *
 1443 *	Calling this function on an active interface is a nop. On a failure
 1444 *	a negative errno code is returned.
 1445 */
 1446int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1447{
 1448	int ret;
 1449
 1450	if (dev->flags & IFF_UP)
 1451		return 0;
 1452
 1453	ret = __dev_open(dev, extack);
 1454	if (ret < 0)
 1455		return ret;
 1456
 1457	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
 1458	call_netdevice_notifiers(NETDEV_UP, dev);
 1459
 1460	return ret;
 1461}
 1462EXPORT_SYMBOL(dev_open);
 1463
 1464static void __dev_close_many(struct list_head *head)
 1465{
 1466	struct net_device *dev;
 1467
 1468	ASSERT_RTNL();
 1469	might_sleep();
 1470
 1471	list_for_each_entry(dev, head, close_list) {
 1472		/* Temporarily disable netpoll until the interface is down */
 1473		netpoll_poll_disable(dev);
 1474
 1475		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
 1476
 1477		clear_bit(__LINK_STATE_START, &dev->state);
 1478
 1479		/* Synchronize to scheduled poll. We cannot touch poll list, it
 1480		 * can be even on different cpu. So just clear netif_running().
 1481		 *
 1482		 * dev->stop() will invoke napi_disable() on all of it's
 1483		 * napi_struct instances on this device.
 1484		 */
 1485		smp_mb__after_atomic(); /* Commit netif_running(). */
 1486	}
 1487
 1488	dev_deactivate_many(head);
 1489
 1490	list_for_each_entry(dev, head, close_list) {
 1491		const struct net_device_ops *ops = dev->netdev_ops;
 1492
 1493		/*
 1494		 *	Call the device specific close. This cannot fail.
 1495		 *	Only if device is UP
 1496		 *
 1497		 *	We allow it to be called even after a DETACH hot-plug
 1498		 *	event.
 1499		 */
 1500		if (ops->ndo_stop)
 1501			ops->ndo_stop(dev);
 1502
 1503		dev->flags &= ~IFF_UP;
 1504		netpoll_poll_enable(dev);
 1505	}
 
 
 1506}
 1507
 1508static void __dev_close(struct net_device *dev)
 1509{
 
 1510	LIST_HEAD(single);
 1511
 1512	list_add(&dev->close_list, &single);
 1513	__dev_close_many(&single);
 1514	list_del(&single);
 
 1515}
 1516
 1517void dev_close_many(struct list_head *head, bool unlink)
 1518{
 1519	struct net_device *dev, *tmp;
 
 1520
 1521	/* Remove the devices that don't need to be closed */
 1522	list_for_each_entry_safe(dev, tmp, head, close_list)
 1523		if (!(dev->flags & IFF_UP))
 1524			list_del_init(&dev->close_list);
 1525
 1526	__dev_close_many(head);
 1527
 1528	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1529		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
 1530		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1531		if (unlink)
 1532			list_del_init(&dev->close_list);
 1533	}
 
 
 
 
 1534}
 1535EXPORT_SYMBOL(dev_close_many);
 1536
 1537/**
 1538 *	dev_close - shutdown an interface.
 1539 *	@dev: device to shutdown
 1540 *
 1541 *	This function moves an active device into down state. A
 1542 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1543 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1544 *	chain.
 1545 */
 1546void dev_close(struct net_device *dev)
 1547{
 1548	if (dev->flags & IFF_UP) {
 1549		LIST_HEAD(single);
 1550
 1551		list_add(&dev->close_list, &single);
 1552		dev_close_many(&single, true);
 1553		list_del(&single);
 1554	}
 
 1555}
 1556EXPORT_SYMBOL(dev_close);
 1557
 1558
 1559/**
 1560 *	dev_disable_lro - disable Large Receive Offload on a device
 1561 *	@dev: device
 1562 *
 1563 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1564 *	called under RTNL.  This is needed if received packets may be
 1565 *	forwarded to another interface.
 1566 */
 1567void dev_disable_lro(struct net_device *dev)
 1568{
 1569	struct net_device *lower_dev;
 1570	struct list_head *iter;
 
 
 
 
 1571
 1572	dev->wanted_features &= ~NETIF_F_LRO;
 1573	netdev_update_features(dev);
 1574
 1575	if (unlikely(dev->features & NETIF_F_LRO))
 1576		netdev_WARN(dev, "failed to disable LRO!\n");
 1577
 1578	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1579		dev_disable_lro(lower_dev);
 1580}
 1581EXPORT_SYMBOL(dev_disable_lro);
 1582
 1583/**
 1584 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1585 *	@dev: device
 1586 *
 1587 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1588 *	called under RTNL.  This is needed if Generic XDP is installed on
 1589 *	the device.
 1590 */
 1591static void dev_disable_gro_hw(struct net_device *dev)
 1592{
 1593	dev->wanted_features &= ~NETIF_F_GRO_HW;
 1594	netdev_update_features(dev);
 1595
 1596	if (unlikely(dev->features & NETIF_F_GRO_HW))
 1597		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 1598}
 1599
 1600const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 1601{
 1602#define N(val) 						\
 1603	case NETDEV_##val:				\
 1604		return "NETDEV_" __stringify(val);
 1605	switch (cmd) {
 1606	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 1607	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 1608	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
 1609	N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
 1610	N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
 1611	N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
 1612	N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 1613	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 1614	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 1615	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
 1616	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
 1617	}
 1618#undef N
 1619	return "UNKNOWN_NETDEV_EVENT";
 1620}
 1621EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1622
 1623static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 1624				   struct net_device *dev)
 1625{
 1626	struct netdev_notifier_info info = {
 1627		.dev = dev,
 1628	};
 1629
 1630	return nb->notifier_call(nb, val, &info);
 1631}
 1632
 1633static int call_netdevice_register_notifiers(struct notifier_block *nb,
 1634					     struct net_device *dev)
 1635{
 1636	int err;
 1637
 1638	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1639	err = notifier_to_errno(err);
 1640	if (err)
 1641		return err;
 1642
 1643	if (!(dev->flags & IFF_UP))
 1644		return 0;
 1645
 1646	call_netdevice_notifier(nb, NETDEV_UP, dev);
 1647	return 0;
 1648}
 1649
 1650static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
 1651						struct net_device *dev)
 1652{
 1653	if (dev->flags & IFF_UP) {
 1654		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1655					dev);
 1656		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1657	}
 1658	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1659}
 1660
 1661static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
 1662						 struct net *net)
 1663{
 1664	struct net_device *dev;
 1665	int err;
 1666
 1667	for_each_netdev(net, dev) {
 1668		err = call_netdevice_register_notifiers(nb, dev);
 1669		if (err)
 1670			goto rollback;
 1671	}
 1672	return 0;
 1673
 1674rollback:
 1675	for_each_netdev_continue_reverse(net, dev)
 1676		call_netdevice_unregister_notifiers(nb, dev);
 1677	return err;
 1678}
 1679
 1680static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
 1681						    struct net *net)
 1682{
 1683	struct net_device *dev;
 1684
 1685	for_each_netdev(net, dev)
 1686		call_netdevice_unregister_notifiers(nb, dev);
 1687}
 1688
/* While non-zero, register_netdevice_notifier() and
 * __register_netdevice_notifier_net() only link the notifier block into its
 * chain and skip replaying events for existing devices.
 */
  1689static int dev_boot_phase = 1;
 1690
 1691/**
 1692 * register_netdevice_notifier - register a network notifier block
 1693 * @nb: notifier
 1694 *
 1695 * Register a notifier to be called when network device events occur.
 1696 * The notifier passed is linked into the kernel structures and must
 1697 * not be reused until it has been unregistered. A negative errno code
 1698 * is returned on a failure.
 1699 *
 1700 * When registered all registration and up events are replayed
 1701 * to the new notifier to allow device to have a race free
 1702 * view of the network device list.
 1703 */
 1704
 1705int register_netdevice_notifier(struct notifier_block *nb)
 1706{
 
 
 1707	struct net *net;
 1708	int err;
 1709
 1710	/* Close race with setup_net() and cleanup_net() */
 1711	down_write(&pernet_ops_rwsem);
 1712	rtnl_lock();
 1713	err = raw_notifier_chain_register(&netdev_chain, nb);
 1714	if (err)
 1715		goto unlock;
 1716	if (dev_boot_phase)
 1717		goto unlock;
 1718	for_each_net(net) {
 1719		err = call_netdevice_register_net_notifiers(nb, net);
 1720		if (err)
 1721			goto rollback;
 
 
 
 
 
 
 
 
 1722	}
 1723
 1724unlock:
 1725	rtnl_unlock();
 1726	up_write(&pernet_ops_rwsem);
 1727	return err;
 1728
 1729rollback:
 1730	for_each_net_continue_reverse(net)
 1731		call_netdevice_unregister_net_notifiers(nb, net);
 
 
 
 
 
 
 
 
 
 
 
 
 1732
 
 1733	raw_notifier_chain_unregister(&netdev_chain, nb);
 1734	goto unlock;
 1735}
 1736EXPORT_SYMBOL(register_netdevice_notifier);
 1737
 1738/**
 1739 * unregister_netdevice_notifier - unregister a network notifier block
 1740 * @nb: notifier
 1741 *
 1742 * Unregister a notifier previously registered by
 1743 * register_netdevice_notifier(). The notifier is unlinked into the
 1744 * kernel structures and may then be reused. A negative errno code
 1745 * is returned on a failure.
 1746 *
 1747 * After unregistering unregister and down device events are synthesized
 1748 * for all devices on the device list to the removed notifier to remove
 1749 * the need for special case cleanup code.
 1750 */
 1751
 1752int unregister_netdevice_notifier(struct notifier_block *nb)
 1753{
 
 1754	struct net *net;
 1755	int err;
 1756
 1757	/* Close race with setup_net() and cleanup_net() */
 1758	down_write(&pernet_ops_rwsem);
 1759	rtnl_lock();
 1760	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1761	if (err)
 1762		goto unlock;
 1763
 1764	for_each_net(net)
 1765		call_netdevice_unregister_net_notifiers(nb, net);
 1766
 
 
 
 
 
 
 
 1767unlock:
 1768	rtnl_unlock();
 1769	up_write(&pernet_ops_rwsem);
 1770	return err;
 1771}
 1772EXPORT_SYMBOL(unregister_netdevice_notifier);
 1773
 1774static int __register_netdevice_notifier_net(struct net *net,
 1775					     struct notifier_block *nb,
 1776					     bool ignore_call_fail)
 1777{
 1778	int err;
 1779
 1780	err = raw_notifier_chain_register(&net->netdev_chain, nb);
 1781	if (err)
 1782		return err;
 1783	if (dev_boot_phase)
 1784		return 0;
 1785
 1786	err = call_netdevice_register_net_notifiers(nb, net);
 1787	if (err && !ignore_call_fail)
 1788		goto chain_unregister;
 1789
 1790	return 0;
 1791
 1792chain_unregister:
 1793	raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1794	return err;
 1795}
 1796
 1797static int __unregister_netdevice_notifier_net(struct net *net,
 1798					       struct notifier_block *nb)
 1799{
 1800	int err;
 1801
 1802	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1803	if (err)
 1804		return err;
 1805
 1806	call_netdevice_unregister_net_notifiers(nb, net);
 1807	return 0;
 1808}
 1809
 1810/**
 1811 * register_netdevice_notifier_net - register a per-netns network notifier block
 1812 * @net: network namespace
 1813 * @nb: notifier
 1814 *
 1815 * Register a notifier to be called when network device events occur.
 1816 * The notifier passed is linked into the kernel structures and must
 1817 * not be reused until it has been unregistered. A negative errno code
 1818 * is returned on a failure.
 1819 *
 1820 * When registered all registration and up events are replayed
 1821 * to the new notifier to allow device to have a race free
 1822 * view of the network device list.
 1823 */
 1824
 1825int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
 1826{
 1827	int err;
 1828
 1829	rtnl_lock();
 1830	err = __register_netdevice_notifier_net(net, nb, false);
 1831	rtnl_unlock();
 1832	return err;
 1833}
 1834EXPORT_SYMBOL(register_netdevice_notifier_net);
 1835
 1836/**
 1837 * unregister_netdevice_notifier_net - unregister a per-netns
 1838 *                                     network notifier block
 1839 * @net: network namespace
 1840 * @nb: notifier
 1841 *
 1842 * Unregister a notifier previously registered by
 1843 * register_netdevice_notifier(). The notifier is unlinked into the
 1844 * kernel structures and may then be reused. A negative errno code
 1845 * is returned on a failure.
 1846 *
 1847 * After unregistering unregister and down device events are synthesized
 1848 * for all devices on the device list to the removed notifier to remove
 1849 * the need for special case cleanup code.
 1850 */
 1851
 1852int unregister_netdevice_notifier_net(struct net *net,
 1853				      struct notifier_block *nb)
 1854{
 1855	int err;
 1856
 1857	rtnl_lock();
 1858	err = __unregister_netdevice_notifier_net(net, nb);
 1859	rtnl_unlock();
 1860	return err;
 1861}
 1862EXPORT_SYMBOL(unregister_netdevice_notifier_net);
 1863
 1864static void __move_netdevice_notifier_net(struct net *src_net,
 1865					  struct net *dst_net,
 1866					  struct notifier_block *nb)
 1867{
 1868	__unregister_netdevice_notifier_net(src_net, nb);
 1869	__register_netdevice_notifier_net(dst_net, nb, true);
 1870}
 1871
 1872int register_netdevice_notifier_dev_net(struct net_device *dev,
 1873					struct notifier_block *nb,
 1874					struct netdev_net_notifier *nn)
 1875{
 1876	int err;
 1877
 1878	rtnl_lock();
 1879	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
 1880	if (!err) {
 1881		nn->nb = nb;
 1882		list_add(&nn->list, &dev->net_notifier_list);
 1883	}
 1884	rtnl_unlock();
 1885	return err;
 1886}
 1887EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
 1888
 1889int unregister_netdevice_notifier_dev_net(struct net_device *dev,
 1890					  struct notifier_block *nb,
 1891					  struct netdev_net_notifier *nn)
 1892{
 1893	int err;
 1894
 1895	rtnl_lock();
 1896	list_del(&nn->list);
 1897	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
 1898	rtnl_unlock();
 1899	return err;
 1900}
 1901EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
 1902
 1903static void move_netdevice_notifiers_dev_net(struct net_device *dev,
 1904					     struct net *net)
 1905{
 1906	struct netdev_net_notifier *nn;
 1907
 1908	list_for_each_entry(nn, &dev->net_notifier_list, list)
 1909		__move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
 1910}
 1911
 1912/**
 1913 *	call_netdevice_notifiers_info - call all network notifier blocks
 1914 *	@val: value passed unmodified to notifier function
 1915 *	@info: notifier information data
 1916 *
 1917 *	Call all network notifier blocks.  Parameters and return value
 1918 *	are as for raw_notifier_call_chain().
 1919 */
 1920
 1921static int call_netdevice_notifiers_info(unsigned long val,
 1922					 struct netdev_notifier_info *info)
 1923{
 1924	struct net *net = dev_net(info->dev);
 1925	int ret;
 1926
 1927	ASSERT_RTNL();
 1928
 1929	/* Run per-netns notifier block chain first, then run the global one.
 1930	 * Hopefully, one day, the global one is going to be removed after
 1931	 * all notifier block registrators get converted to be per-netns.
 1932	 */
 1933	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
 1934	if (ret & NOTIFY_STOP_MASK)
 1935		return ret;
 1936	return raw_notifier_call_chain(&netdev_chain, val, info);
 1937}
 1938
 1939/**
 1940 *	call_netdevice_notifiers_info_robust - call per-netns notifier blocks
 1941 *	                                       for and rollback on error
 1942 *	@val_up: value passed unmodified to notifier function
 1943 *	@val_down: value passed unmodified to the notifier function when
 1944 *	           recovering from an error on @val_up
 1945 *	@info: notifier information data
 1946 *
 1947 *	Call all per-netns network notifier blocks, but not notifier blocks on
 1948 *	the global notifier chain. Parameters and return value are as for
 1949 *	raw_notifier_call_chain_robust().
 1950 */
 1951
 1952static int
 1953call_netdevice_notifiers_info_robust(unsigned long val_up,
 1954				     unsigned long val_down,
 1955				     struct netdev_notifier_info *info)
 1956{
 1957	struct net *net = dev_net(info->dev);
 1958
 1959	ASSERT_RTNL();
 1960
 1961	return raw_notifier_call_chain_robust(&net->netdev_chain,
 1962					      val_up, val_down, info);
 1963}
 1964
 1965static int call_netdevice_notifiers_extack(unsigned long val,
 1966					   struct net_device *dev,
 1967					   struct netlink_ext_ack *extack)
 1968{
 1969	struct netdev_notifier_info info = {
 1970		.dev = dev,
 1971		.extack = extack,
 1972	};
 1973
 1974	return call_netdevice_notifiers_info(val, &info);
 1975}
 1976
 1977/**
 1978 *	call_netdevice_notifiers - call all network notifier blocks
 1979 *      @val: value passed unmodified to notifier function
 1980 *      @dev: net_device pointer passed unmodified to notifier function
 1981 *
 1982 *	Call all network notifier blocks.  Parameters and return value
 1983 *	are as for raw_notifier_call_chain().
 1984 */
 1985
 1986int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 1987{
 1988	return call_netdevice_notifiers_extack(val, dev, NULL);
 
 1989}
 1990EXPORT_SYMBOL(call_netdevice_notifiers);
 1991
 1992/**
 1993 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 1994 *	@val: value passed unmodified to notifier function
 1995 *	@dev: net_device pointer passed unmodified to notifier function
 1996 *	@arg: additional u32 argument passed to the notifier function
 1997 *
 1998 *	Call all network notifier blocks.  Parameters and return value
 1999 *	are as for raw_notifier_call_chain().
 2000 */
 2001static int call_netdevice_notifiers_mtu(unsigned long val,
 2002					struct net_device *dev, u32 arg)
 2003{
 2004	struct netdev_notifier_info_ext info = {
 2005		.info.dev = dev,
 2006		.ext.mtu = arg,
 2007	};
 2008
 2009	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 2010
 2011	return call_netdevice_notifiers_info(val, &info.info);
 2012}
 2013
 2014#ifdef CONFIG_NET_INGRESS
 2015static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
 2016
 2017void net_inc_ingress_queue(void)
 2018{
 2019	static_branch_inc(&ingress_needed_key);
 2020}
 2021EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
 2022
 2023void net_dec_ingress_queue(void)
 2024{
 2025	static_branch_dec(&ingress_needed_key);
 2026}
 2027EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
 2028#endif
 2029
 2030#ifdef CONFIG_NET_EGRESS
 2031static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
 2032
 2033void net_inc_egress_queue(void)
 2034{
 2035	static_branch_inc(&egress_needed_key);
 2036}
 2037EXPORT_SYMBOL_GPL(net_inc_egress_queue);
 2038
 2039void net_dec_egress_queue(void)
 2040{
 2041	static_branch_dec(&egress_needed_key);
 2042}
 2043EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 2044#endif
 2045
 2046DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 2047EXPORT_SYMBOL(netstamp_needed_key);
 2048#ifdef CONFIG_JUMP_LABEL
 2049static atomic_t netstamp_needed_deferred;
 2050static atomic_t netstamp_wanted;
 2051static void netstamp_clear(struct work_struct *work)
 2052{
 2053	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 2054	int wanted;
 2055
 2056	wanted = atomic_add_return(deferred, &netstamp_wanted);
 2057	if (wanted > 0)
 2058		static_branch_enable(&netstamp_needed_key);
 2059	else
 2060		static_branch_disable(&netstamp_needed_key);
 2061}
 2062static DECLARE_WORK(netstamp_work, netstamp_clear);
 2063#endif
 2064
 2065void net_enable_timestamp(void)
 2066{
 2067#ifdef CONFIG_JUMP_LABEL
 2068	int wanted = atomic_read(&netstamp_wanted);
 2069
 2070	while (wanted > 0) {
 2071		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
 2072			return;
 
 2073	}
 2074	atomic_inc(&netstamp_needed_deferred);
 2075	schedule_work(&netstamp_work);
 2076#else
 2077	static_branch_inc(&netstamp_needed_key);
 2078#endif
 
 
 2079}
 2080EXPORT_SYMBOL(net_enable_timestamp);
 2081
 2082void net_disable_timestamp(void)
 2083{
 2084#ifdef CONFIG_JUMP_LABEL
 2085	int wanted = atomic_read(&netstamp_wanted);
 2086
 2087	while (wanted > 1) {
 2088		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
 2089			return;
 2090	}
 2091	atomic_dec(&netstamp_needed_deferred);
 2092	schedule_work(&netstamp_work);
 2093#else
 2094	static_branch_dec(&netstamp_needed_key);
 2095#endif
 
 2096}
 2097EXPORT_SYMBOL(net_disable_timestamp);
 2098
 2099static inline void net_timestamp_set(struct sk_buff *skb)
 2100{
 2101	skb->tstamp = 0;
 2102	skb->mono_delivery_time = 0;
 2103	if (static_branch_unlikely(&netstamp_needed_key))
 2104		skb->tstamp = ktime_get_real();
 2105}
 2106
 2107#define net_timestamp_check(COND, SKB)				\
 2108	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 2109		if ((COND) && !(SKB)->tstamp)			\
 2110			(SKB)->tstamp = ktime_get_real();	\
 2111	}							\
 2112
 2113bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 2114{
 2115	return __is_skb_forwardable(dev, skb, true);
 2116}
 2117EXPORT_SYMBOL_GPL(is_skb_forwardable);
 
 
 
 
 
 
 
 
 
 
 
 2118
 2119static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
 2120			      bool check_mtu)
 2121{
 2122	int ret = ____dev_forward_skb(dev, skb, check_mtu);
 
 
 
 2123
 2124	if (likely(!ret)) {
 2125		skb->protocol = eth_type_trans(skb, dev);
 2126		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2127	}
 2128
 2129	return ret;
 
 
 
 2130}
 2131
 2132int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 
 2133{
 2134	return __dev_forward_skb2(dev, skb, true);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2135}
 2136EXPORT_SYMBOL_GPL(__dev_forward_skb);
 2137
 2138/**
 2139 * dev_forward_skb - loopback an skb to another netif
 2140 *
 2141 * @dev: destination network device
 2142 * @skb: buffer to forward
 2143 *
 2144 * return values:
 2145 *	NET_RX_SUCCESS	(no congestion)
 2146 *	NET_RX_DROP     (packet was dropped, but freed)
 2147 *
 2148 * dev_forward_skb can be used for injecting an skb from the
 2149 * start_xmit function of one device into the receive queue
 2150 * of another device.
 2151 *
 2152 * The receiving device may be in another namespace, so
 2153 * we have to clear all information in the skb that could
 2154 * impact namespace isolation.
 2155 */
 2156int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2157{
 2158	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2159}
 2160EXPORT_SYMBOL_GPL(dev_forward_skb);
 2161
 2162int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
 2163{
 2164	return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
 2165}
 2166
 2167static inline int deliver_skb(struct sk_buff *skb,
 2168			      struct packet_type *pt_prev,
 2169			      struct net_device *orig_dev)
 2170{
 2171	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 2172		return -ENOMEM;
 2173	refcount_inc(&skb->users);
 2174	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 2175}
 2176
 2177static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 2178					  struct packet_type **pt,
 2179					  struct net_device *orig_dev,
 2180					  __be16 type,
 2181					  struct list_head *ptype_list)
 2182{
 2183	struct packet_type *ptype, *pt_prev = *pt;
 2184
 2185	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2186		if (ptype->type != type)
 2187			continue;
 2188		if (pt_prev)
 2189			deliver_skb(skb, pt_prev, orig_dev);
 2190		pt_prev = ptype;
 2191	}
 2192	*pt = pt_prev;
 2193}
 2194
 2195static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 2196{
 2197	if (!ptype->af_packet_priv || !skb->sk)
 2198		return false;
 2199
 2200	if (ptype->id_match)
 2201		return ptype->id_match(ptype, skb->sk);
 2202	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 2203		return true;
 2204
 2205	return false;
 2206}
 2207
 2208/**
 2209 * dev_nit_active - return true if any network interface taps are in use
 2210 *
 2211 * @dev: network device to check for the presence of taps
 2212 */
 2213bool dev_nit_active(struct net_device *dev)
 2214{
 2215	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
 2216}
 2217EXPORT_SYMBOL_GPL(dev_nit_active);
 2218
 2219/*
 2220 *	Support routine. Sends outgoing frames to any network
 2221 *	taps currently in use.
 2222 */
 2223
 2224void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 2225{
 2226	struct packet_type *ptype;
 2227	struct sk_buff *skb2 = NULL;
 2228	struct packet_type *pt_prev = NULL;
 2229	struct list_head *ptype_list = &ptype_all;
 2230
 2231	rcu_read_lock();
 2232again:
 2233	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2234		if (ptype->ignore_outgoing)
 2235			continue;
 2236
 2237		/* Never send packets back to the socket
 2238		 * they originated from - MvS (miquels@drinkel.ow.org)
 2239		 */
 2240		if (skb_loop_sk(ptype, skb))
 2241			continue;
 
 
 
 
 
 2242
 2243		if (pt_prev) {
 2244			deliver_skb(skb2, pt_prev, skb->dev);
 2245			pt_prev = ptype;
 2246			continue;
 2247		}
 2248
 2249		/* need to clone skb, done only once */
 2250		skb2 = skb_clone(skb, GFP_ATOMIC);
 2251		if (!skb2)
 2252			goto out_unlock;
 2253
 2254		net_timestamp_set(skb2);
 2255
 2256		/* skb->nh should be correctly
 2257		 * set by sender, so that the second statement is
 2258		 * just protection against buggy protocols.
 2259		 */
 2260		skb_reset_mac_header(skb2);
 2261
 2262		if (skb_network_header(skb2) < skb2->data ||
 2263		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
 2264			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
 2265					     ntohs(skb2->protocol),
 2266					     dev->name);
 2267			skb_reset_network_header(skb2);
 2268		}
 2269
 2270		skb2->transport_header = skb2->network_header;
 2271		skb2->pkt_type = PACKET_OUTGOING;
 2272		pt_prev = ptype;
 2273	}
 
 
 
 2274
 2275	if (ptype_list == &ptype_all) {
 2276		ptype_list = &dev->ptype_all;
 2277		goto again;
 2278	}
 2279out_unlock:
 2280	if (pt_prev) {
 2281		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
 2282			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 2283		else
 2284			kfree_skb(skb2);
 2285	}
 
 
 2286	rcu_read_unlock();
 2287}
 2288EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2289
 2290/**
 2291 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2292 * @dev: Network device
 2293 * @txq: number of queues available
 2294 *
 2295 * If real_num_tx_queues is changed the tc mappings may no longer be
 2296 * valid. To resolve this verify the tc mapping remains valid and if
 2297 * not NULL the mapping. With no priorities mapping to this
 2298 * offset/count pair it will no longer be used. In the worst case TC0
 2299 * is invalid nothing can be done so disable priority mappings. If is
 2300 * expected that drivers will fix this mapping if they can before
 2301 * calling netif_set_real_num_tx_queues.
 2302 */
 2303static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2304{
 2305	int i;
 2306	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2307
 2308	/* If TC0 is invalidated disable TC mapping */
 2309	if (tc->offset + tc->count > txq) {
 2310		netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2311		dev->num_tc = 0;
 2312		return;
 2313	}
 2314
 2315	/* Invalidated prio to tc mappings set to TC0 */
 2316	for (i = 1; i < TC_BITMASK + 1; i++) {
 2317		int q = netdev_get_prio_tc_map(dev, i);
 2318
 2319		tc = &dev->tc_to_txq[q];
 2320		if (tc->offset + tc->count > txq) {
 2321			netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2322				    i, q);
 2323			netdev_set_prio_tc_map(dev, i, 0);
 2324		}
 2325	}
 2326}
 2327
 2328int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2329{
 2330	if (dev->num_tc) {
 2331		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2332		int i;
 2333
 2334		/* walk through the TCs and see if it falls into any of them */
 2335		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2336			if ((txq - tc->offset) < tc->count)
 2337				return i;
 2338		}
 2339
 2340		/* didn't find it, just return -1 to indicate no match */
 2341		return -1;
 2342	}
 2343
 2344	return 0;
 2345}
 2346EXPORT_SYMBOL(netdev_txq_to_tc);
 2347
 2348#ifdef CONFIG_XPS
 2349static struct static_key xps_needed __read_mostly;
 2350static struct static_key xps_rxqs_needed __read_mostly;
 2351static DEFINE_MUTEX(xps_map_mutex);
 2352#define xmap_dereference(P)		\
 2353	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2354
 2355static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 2356			     struct xps_dev_maps *old_maps, int tci, u16 index)
 2357{
 2358	struct xps_map *map = NULL;
 2359	int pos;
 2360
 2361	if (dev_maps)
 2362		map = xmap_dereference(dev_maps->attr_map[tci]);
 2363	if (!map)
 2364		return false;
 2365
 2366	for (pos = map->len; pos--;) {
 2367		if (map->queues[pos] != index)
 2368			continue;
 2369
 2370		if (map->len > 1) {
 2371			map->queues[pos] = map->queues[--map->len];
 2372			break;
 2373		}
 2374
 2375		if (old_maps)
 2376			RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
 2377		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2378		kfree_rcu(map, rcu);
 2379		return false;
 2380	}
 2381
 2382	return true;
 2383}
 2384
 2385static bool remove_xps_queue_cpu(struct net_device *dev,
 2386				 struct xps_dev_maps *dev_maps,
 2387				 int cpu, u16 offset, u16 count)
 2388{
 2389	int num_tc = dev_maps->num_tc;
 2390	bool active = false;
 2391	int tci;
 2392
 2393	for (tci = cpu * num_tc; num_tc--; tci++) {
 2394		int i, j;
 2395
 2396		for (i = count, j = offset; i--; j++) {
 2397			if (!remove_xps_queue(dev_maps, NULL, tci, j))
 2398				break;
 2399		}
 2400
 2401		active |= i < 0;
 2402	}
 2403
 2404	return active;
 2405}
 2406
 2407static void reset_xps_maps(struct net_device *dev,
 2408			   struct xps_dev_maps *dev_maps,
 2409			   enum xps_map_type type)
 2410{
 2411	static_key_slow_dec_cpuslocked(&xps_needed);
 2412	if (type == XPS_RXQS)
 2413		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 2414
 2415	RCU_INIT_POINTER(dev->xps_maps[type], NULL);
 2416
 2417	kfree_rcu(dev_maps, rcu);
 2418}
 2419
 2420static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
 2421			   u16 offset, u16 count)
 2422{
 2423	struct xps_dev_maps *dev_maps;
 2424	bool active = false;
 2425	int i, j;
 2426
 2427	dev_maps = xmap_dereference(dev->xps_maps[type]);
 2428	if (!dev_maps)
 2429		return;
 2430
 2431	for (j = 0; j < dev_maps->nr_ids; j++)
 2432		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
 2433	if (!active)
 2434		reset_xps_maps(dev, dev_maps, type);
 2435
 2436	if (type == XPS_CPUS) {
 2437		for (i = offset + (count - 1); count--; i--)
 2438			netdev_queue_numa_node_write(
 2439				netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
 2440	}
 2441}
 2442
 2443static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 2444				   u16 count)
 2445{
 2446	if (!static_key_false(&xps_needed))
 2447		return;
 2448
 2449	cpus_read_lock();
 2450	mutex_lock(&xps_map_mutex);
 2451
 2452	if (static_key_false(&xps_rxqs_needed))
 2453		clean_xps_maps(dev, XPS_RXQS, offset, count);
 2454
 2455	clean_xps_maps(dev, XPS_CPUS, offset, count);
 2456
 2457	mutex_unlock(&xps_map_mutex);
 2458	cpus_read_unlock();
 2459}
 2460
 2461static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 2462{
 2463	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 2464}
 2465
 2466static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 2467				      u16 index, bool is_rxqs_map)
 2468{
 2469	struct xps_map *new_map;
 2470	int alloc_len = XPS_MIN_MAP_ALLOC;
 2471	int i, pos;
 2472
 2473	for (pos = 0; map && pos < map->len; pos++) {
 2474		if (map->queues[pos] != index)
 2475			continue;
 2476		return map;
 2477	}
 2478
 2479	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 2480	if (map) {
 2481		if (pos < map->alloc_len)
 2482			return map;
 2483
 2484		alloc_len = map->alloc_len * 2;
 2485	}
 2486
 2487	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
 2488	 *  map
 2489	 */
 2490	if (is_rxqs_map)
 2491		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
 2492	else
 2493		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
 2494				       cpu_to_node(attr_index));
 2495	if (!new_map)
 2496		return NULL;
 2497
 2498	for (i = 0; i < pos; i++)
 2499		new_map->queues[i] = map->queues[i];
 2500	new_map->alloc_len = alloc_len;
 2501	new_map->len = pos;
 2502
 2503	return new_map;
 2504}
 2505
 2506/* Copy xps maps at a given index */
 2507static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
 2508			      struct xps_dev_maps *new_dev_maps, int index,
 2509			      int tc, bool skip_tc)
 2510{
 2511	int i, tci = index * dev_maps->num_tc;
 2512	struct xps_map *map;
 2513
 2514	/* copy maps belonging to foreign traffic classes */
 2515	for (i = 0; i < dev_maps->num_tc; i++, tci++) {
 2516		if (i == tc && skip_tc)
 2517			continue;
 2518
 2519		/* fill in the new device map from the old device map */
 2520		map = xmap_dereference(dev_maps->attr_map[tci]);
 2521		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2522	}
 2523}
 2524
/**
 * __netif_set_xps_queue - set the XPS map of one tx queue
 * @dev: network device the queue belongs to
 * @mask: bitmap of the ids (CPUs for XPS_CPUS, rx queues for XPS_RXQS) that
 *        should use the queue
 * @index: tx queue index
 * @type: which kind of map to update, XPS_CPUS or XPS_RXQS
 *
 * Builds a new struct xps_dev_maps in which tx queue @index is listed for
 * every id selected by @mask (for XPS_CPUS the online CPU mask is applied as
 * well when more than one CPU is possible), publishes it in
 * dev->xps_maps[@type] and releases what it replaces through kfree_rcu().
 * The queue is then removed from the maps of all ids that were not selected,
 * and the maps are dropped through reset_xps_maps() when none remains
 * active. If @mask selects no id, no new maps are allocated and the queue is
 * only removed from the existing ones. All of this runs under xps_map_mutex.
 *
 * When @dev has traffic classes and the queue is bound to a subordinate
 * device (sb_dev), the maps of that device are updated instead.
 *
 * Return: 0 on success, -EINVAL if dev->num_tc is negative or the queue does
 * not fall into any traffic class, -ENOMEM if an allocation fails.
 */
  2525/* Must be called under cpus_read_lock */
  2526int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
  2527			  u16 index, enum xps_map_type type)
  2528{
  2529	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
  2530	const unsigned long *online_mask = NULL;
  2531	bool active = false, copy = false;
  2532	int i, j, tci, numa_node_id = -2;
  2533	int maps_sz, num_tc = 1, tc = 0;
  2534	struct xps_map *map, *new_map;
  2535	unsigned int nr_ids;
  2536
  2537	if (dev->num_tc) {
  2538		/* Do not allow XPS on subordinate device directly */
  2539		num_tc = dev->num_tc;
  2540		if (num_tc < 0)
  2541			return -EINVAL;
  2542
  2543		/* If queue belongs to subordinate dev use its map */
  2544		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
  2545
  2546		tc = netdev_txq_to_tc(dev, index);
  2547		if (tc < 0)
  2548			return -EINVAL;
  2549	}
  2550
  2551	mutex_lock(&xps_map_mutex);
  2552
  2553	dev_maps = xmap_dereference(dev->xps_maps[type]);
  2554	if (type == XPS_RXQS) {
  2555		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
  2556		nr_ids = dev->num_rx_queues;
  2557	} else {
  2558		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
  2559		if (num_possible_cpus() > 1)
  2560			online_mask = cpumask_bits(cpu_online_mask);
  2561		nr_ids = nr_cpu_ids;
  2562	}
  2563
  2564	if (maps_sz < L1_CACHE_BYTES)
  2565		maps_sz = L1_CACHE_BYTES;
  2566
  2567	/* The old dev_maps could be larger or smaller than the one we're
  2568	 * setting up now, as dev->num_tc or nr_ids could have been updated in
  2569	 * between. We could try to be smart, but let's be safe instead and only
  2570	 * copy foreign traffic classes if the two map sizes match.
  2571	 */
  2572	if (dev_maps &&
  2573	    dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
  2574		copy = true;
  2575
  2576	/* allocate memory for queue storage */
  2577	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
  2578	     j < nr_ids;) {
  2579		if (!new_dev_maps) {
  2580			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
  2581			if (!new_dev_maps) {
  2582				mutex_unlock(&xps_map_mutex);
  2583				return -ENOMEM;
  2584			}
  2585
  2586			new_dev_maps->nr_ids = nr_ids;
  2587			new_dev_maps->num_tc = num_tc;
  2588		}
  2589
  2590		tci = j * num_tc + tc;
  2591		map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
  2592
  2593		map = expand_xps_map(map, j, index, type == XPS_RXQS);
  2594		if (!map)
  2595			goto error;
  2596
  2597		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
  2598	}
  2599
  2600	if (!new_dev_maps)
  2601		goto out_no_new_maps;
  2602
  2603	if (!dev_maps) {
  2604		/* Increment static keys at most once per type */
  2605		static_key_slow_inc_cpuslocked(&xps_needed);
  2606		if (type == XPS_RXQS)
  2607			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
  2608	}
  2609
	/* Second pass over all ids: record @index in the map of every
	 * selected id and, when copying, carry the remaining map pointers of
	 * the old dev_maps over into new_dev_maps.
	 */
  2610	for (j = 0; j < nr_ids; j++) {
  2611		bool skip_tc = false;
  2612
  2613		tci = j * num_tc + tc;
  2614		if (netif_attr_test_mask(j, mask, nr_ids) &&
  2615		    netif_attr_test_online(j, online_mask, nr_ids)) {
  2616			/* add tx-queue to CPU/rx-queue maps */
  2617			int pos = 0;
  2618
  2619			skip_tc = true;
  2620
  2621			map = xmap_dereference(new_dev_maps->attr_map[tci]);
  2622			while ((pos < map->len) && (map->queues[pos] != index))
  2623				pos++;
  2624
  2625			if (pos == map->len)
  2626				map->queues[map->len++] = index;
			/* numa_node_id: -2 = no CPU seen yet, -1 = the
			 * selected CPUs do not share one node
			 */
  2627#ifdef CONFIG_NUMA
  2628			if (type == XPS_CPUS) {
  2629				if (numa_node_id == -2)
  2630					numa_node_id = cpu_to_node(j);
  2631				else if (numa_node_id != cpu_to_node(j))
  2632					numa_node_id = -1;
  2633			}
  2634#endif
  2635		}
  2636
  2637		if (copy)
  2638			xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
  2639					  skip_tc);
  2640	}
  2641
	/* Publish the new maps */
  2642	rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
  2643
  2644	/* Cleanup old maps */
  2645	if (!dev_maps)
  2646		goto out_no_old_maps;
  2647
  2648	for (j = 0; j < dev_maps->nr_ids; j++) {
  2649		for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
  2650			map = xmap_dereference(dev_maps->attr_map[tci]);
  2651			if (!map)
  2652				continue;
  2653
  2654			if (copy) {
  2655				new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
  2656				if (map == new_map)
  2657					continue;
  2658			}
  2659
  2660			RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
  2661			kfree_rcu(map, rcu);
  2662		}
  2663	}
  2664
  2665	old_dev_maps = dev_maps;
  2666
  2667out_no_old_maps:
  2668	dev_maps = new_dev_maps;
  2669	active = true;
  2670
  2671out_no_new_maps:
  2672	if (type == XPS_CPUS)
  2673		/* update Tx queue numa node */
  2674		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
  2675					     (numa_node_id >= 0) ?
  2676					     numa_node_id : NUMA_NO_NODE);
  2677
  2678	if (!dev_maps)
  2679		goto out_no_maps;
  2680
  2681	/* removes tx-queue from unused CPUs/rx-queues */
  2682	for (j = 0; j < dev_maps->nr_ids; j++) {
  2683		tci = j * dev_maps->num_tc;
  2684
  2685		for (i = 0; i < dev_maps->num_tc; i++, tci++) {
  2686			if (i == tc &&
  2687			    netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
  2688			    netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
  2689				continue;
  2690
  2691			active |= remove_xps_queue(dev_maps,
  2692						   copy ? old_dev_maps : NULL,
  2693						   tci, index);
  2694		}
  2695	}
  2696
  2697	if (old_dev_maps)
  2698		kfree_rcu(old_dev_maps, rcu);
  2699
  2700	/* free map if not active */
  2701	if (!active)
  2702		reset_xps_maps(dev, dev_maps, type);
  2703
  2704out_no_maps:
  2705	mutex_unlock(&xps_map_mutex);
  2706
  2707	return 0;
  2708error:
  2709	/* remove any maps that we added */
  2710	for (j = 0; j < nr_ids; j++) {
  2711		for (i = num_tc, tci = j * num_tc; i--; tci++) {
  2712			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
  2713			map = copy ?
  2714			      xmap_dereference(dev_maps->attr_map[tci]) :
  2715			      NULL;
  2716			if (new_map && new_map != map)
  2717				kfree(new_map);
  2718		}
  2719	}
  2720
  2721	mutex_unlock(&xps_map_mutex);
  2722
  2723	kfree(new_dev_maps);
  2724	return -ENOMEM;
  2725}
  2726EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2727
 2728int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2729			u16 index)
 2730{
 2731	int ret;
 2732
 2733	cpus_read_lock();
 2734	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
 2735	cpus_read_unlock();
 2736
 2737	return ret;
 2738}
 2739EXPORT_SYMBOL(netif_set_xps_queue);
 2740
 2741#endif
 2742static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2743{
 2744	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2745
 2746	/* Unbind any subordinate channels */
 2747	while (txq-- != &dev->_tx[0]) {
 2748		if (txq->sb_dev)
 2749			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2750	}
 2751}
 2752
 2753void netdev_reset_tc(struct net_device *dev)
 2754{
 2755#ifdef CONFIG_XPS
 2756	netif_reset_xps_queues_gt(dev, 0);
 2757#endif
 2758	netdev_unbind_all_sb_channels(dev);
 2759
 2760	/* Reset TC configuration of device */
 2761	dev->num_tc = 0;
 2762	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 2763	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
 2764}
 2765EXPORT_SYMBOL(netdev_reset_tc);
 2766
 2767int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2768{
 2769	if (tc >= dev->num_tc)
 2770		return -EINVAL;
 2771
 2772#ifdef CONFIG_XPS
 2773	netif_reset_xps_queues(dev, offset, count);
 2774#endif
 2775	dev->tc_to_txq[tc].count = count;
 2776	dev->tc_to_txq[tc].offset = offset;
 2777	return 0;
 2778}
 2779EXPORT_SYMBOL(netdev_set_tc_queue);
 2780
 2781int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 2782{
 2783	if (num_tc > TC_MAX_QUEUE)
 2784		return -EINVAL;
 2785
 2786#ifdef CONFIG_XPS
 2787	netif_reset_xps_queues_gt(dev, 0);
 2788#endif
 2789	netdev_unbind_all_sb_channels(dev);
 2790
 2791	dev->num_tc = num_tc;
 2792	return 0;
 2793}
 2794EXPORT_SYMBOL(netdev_set_num_tc);
 2795
 2796void netdev_unbind_sb_channel(struct net_device *dev,
 2797			      struct net_device *sb_dev)
 2798{
 2799	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2800
 2801#ifdef CONFIG_XPS
 2802	netif_reset_xps_queues_gt(sb_dev, 0);
 2803#endif
 2804	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2805	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2806
 2807	while (txq-- != &dev->_tx[0]) {
 2808		if (txq->sb_dev == sb_dev)
 2809			txq->sb_dev = NULL;
 2810	}
 2811}
 2812EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2813
 2814int netdev_bind_sb_channel_queue(struct net_device *dev,
 2815				 struct net_device *sb_dev,
 2816				 u8 tc, u16 count, u16 offset)
 2817{
 2818	/* Make certain the sb_dev and dev are already configured */
 2819	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2820		return -EINVAL;
 2821
 2822	/* We cannot hand out queues we don't have */
 2823	if ((offset + count) > dev->real_num_tx_queues)
 2824		return -EINVAL;
 2825
 2826	/* Record the mapping */
 2827	sb_dev->tc_to_txq[tc].count = count;
 2828	sb_dev->tc_to_txq[tc].offset = offset;
 2829
 2830	/* Provide a way for Tx queue to find the tc_to_txq map or
 2831	 * XPS map for itself.
 2832	 */
 2833	while (count--)
 2834		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2835
 2836	return 0;
 2837}
 2838EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2839
 2840int netdev_set_sb_channel(struct net_device *dev, u16 channel)
 2841{
 2842	/* Do not use a multiqueue device to represent a subordinate channel */
 2843	if (netif_is_multiqueue(dev))
 2844		return -ENODEV;
 2845
 2846	/* We allow channels 1 - 32767 to be used for subordinate channels.
 2847	 * Channel 0 is meant to be "native" mode and used only to represent
 2848	 * the main root device. We allow writing 0 to reset the device back
 2849	 * to normal mode after being used as a subordinate channel.
 2850	 */
 2851	if (channel > S16_MAX)
 2852		return -EINVAL;
 2853
 2854	dev->num_tc = -channel;
 2855
 2856	return 0;
 2857}
 2858EXPORT_SYMBOL(netdev_set_sb_channel);
 2859
 2860/*
 2861 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2862 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 2863 */
 2864int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2865{
 2866	bool disabling;
 2867	int rc;
 2868
 2869	disabling = txq < dev->real_num_tx_queues;
 2870
 2871	if (txq < 1 || txq > dev->num_tx_queues)
 2872		return -EINVAL;
 2873
 2874	if (dev->reg_state == NETREG_REGISTERED ||
 2875	    dev->reg_state == NETREG_UNREGISTERING) {
 2876		ASSERT_RTNL();
 2877
 2878		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 2879						  txq);
 2880		if (rc)
 2881			return rc;
 2882
 2883		if (dev->num_tc)
 2884			netif_setup_tc(dev, txq);
 2885
 2886		dev_qdisc_change_real_num_tx(dev, txq);
 2887
 2888		dev->real_num_tx_queues = txq;
 2889
 2890		if (disabling) {
 2891			synchronize_net();
 2892			qdisc_reset_all_tx_gt(dev, txq);
 2893#ifdef CONFIG_XPS
 2894			netif_reset_xps_queues_gt(dev, txq);
 2895#endif
 2896		}
 2897	} else {
 2898		dev->real_num_tx_queues = txq;
 2899	}
 2900
 
 2901	return 0;
 2902}
 2903EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 2904
 2905#ifdef CONFIG_SYSFS
 2906/**
 2907 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 2908 *	@dev: Network device
 2909 *	@rxq: Actual number of RX queues
 2910 *
 2911 *	This must be called either with the rtnl_lock held or before
 2912 *	registration of the net device.  Returns 0 on success, or a
 2913 *	negative error code.  If called before registration, it always
 2914 *	succeeds.
 2915 */
 2916int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 2917{
 2918	int rc;
 2919
 2920	if (rxq < 1 || rxq > dev->num_rx_queues)
 2921		return -EINVAL;
 2922
 2923	if (dev->reg_state == NETREG_REGISTERED) {
 2924		ASSERT_RTNL();
 2925
 2926		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 2927						  rxq);
 2928		if (rc)
 2929			return rc;
 2930	}
 2931
 2932	dev->real_num_rx_queues = rxq;
 2933	return 0;
 2934}
 2935EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 2936#endif
 2937
 2938/**
 2939 *	netif_set_real_num_queues - set actual number of RX and TX queues used
 2940 *	@dev: Network device
 2941 *	@txq: Actual number of TX queues
 2942 *	@rxq: Actual number of RX queues
 2943 *
 2944 *	Set the real number of both TX and RX queues.
 2945 *	Does nothing if the number of queues is already correct.
 2946 */
 2947int netif_set_real_num_queues(struct net_device *dev,
 2948			      unsigned int txq, unsigned int rxq)
 2949{
 2950	unsigned int old_rxq = dev->real_num_rx_queues;
 2951	int err;
 2952
 2953	if (txq < 1 || txq > dev->num_tx_queues ||
 2954	    rxq < 1 || rxq > dev->num_rx_queues)
 2955		return -EINVAL;
 2956
 2957	/* Start from increases, so the error path only does decreases -
 2958	 * decreases can't fail.
 2959	 */
 2960	if (rxq > dev->real_num_rx_queues) {
 2961		err = netif_set_real_num_rx_queues(dev, rxq);
 2962		if (err)
 2963			return err;
 2964	}
 2965	if (txq > dev->real_num_tx_queues) {
 2966		err = netif_set_real_num_tx_queues(dev, txq);
 2967		if (err)
 2968			goto undo_rx;
 2969	}
 2970	if (rxq < dev->real_num_rx_queues)
 2971		WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
 2972	if (txq < dev->real_num_tx_queues)
 2973		WARN_ON(netif_set_real_num_tx_queues(dev, txq));
 2974
 2975	return 0;
 2976undo_rx:
 2977	WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
 2978	return err;
 2979}
 2980EXPORT_SYMBOL(netif_set_real_num_queues);
 2981
 2982/**
 2983 * netif_set_tso_max_size() - set the max size of TSO frames supported
 2984 * @dev:	netdev to update
 2985 * @size:	max skb->len of a TSO frame
 2986 *
 2987 * Set the limit on the size of TSO super-frames the device can handle.
 2988 * Unless explicitly set the stack will assume the value of
 2989 * %GSO_LEGACY_MAX_SIZE.
 2990 */
 2991void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
 2992{
 2993	dev->tso_max_size = min(GSO_MAX_SIZE, size);
 2994	if (size < READ_ONCE(dev->gso_max_size))
 2995		netif_set_gso_max_size(dev, size);
 2996}
 2997EXPORT_SYMBOL(netif_set_tso_max_size);
 2998
 2999/**
 3000 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
 3001 * @dev:	netdev to update
 3002 * @segs:	max number of TCP segments
 3003 *
 3004 * Set the limit on the number of TCP segments the device can generate from
 3005 * a single TSO super-frame.
 3006 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
 3007 */
 3008void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
 3009{
 3010	dev->tso_max_segs = segs;
 3011	if (segs < READ_ONCE(dev->gso_max_segs))
 3012		netif_set_gso_max_segs(dev, segs);
 3013}
 3014EXPORT_SYMBOL(netif_set_tso_max_segs);
 3015
 3016/**
 3017 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
 3018 * @to:		netdev to update
 3019 * @from:	netdev from which to copy the limits
 3020 */
 3021void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
 3022{
 3023	netif_set_tso_max_size(to, from->tso_max_size);
 3024	netif_set_tso_max_segs(to, from->tso_max_segs);
 3025}
 3026EXPORT_SYMBOL(netif_inherit_tso_max);
 3027
 3028/**
 3029 * netif_get_num_default_rss_queues - default number of RSS queues
 3030 *
 3031 * Default value is the number of physical cores if there are only 1 or 2, or
 3032 * divided by 2 if there are more.
 3033 */
 3034int netif_get_num_default_rss_queues(void)
 3035{
 3036	cpumask_var_t cpus;
 3037	int cpu, count = 0;
 3038
 3039	if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
 3040		return 1;
 3041
 3042	cpumask_copy(cpus, cpu_online_mask);
 3043	for_each_cpu(cpu, cpus) {
 3044		++count;
 3045		cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
 3046	}
 3047	free_cpumask_var(cpus);
 3048
 3049	return count > 2 ? DIV_ROUND_UP(count, 2) : count;
 3050}
 3051EXPORT_SYMBOL(netif_get_num_default_rss_queues);
 3052
 3053static void __netif_reschedule(struct Qdisc *q)
 3054{
 3055	struct softnet_data *sd;
 3056	unsigned long flags;
 3057
 3058	local_irq_save(flags);
 3059	sd = this_cpu_ptr(&softnet_data);
 3060	q->next_sched = NULL;
 3061	*sd->output_queue_tailp = q;
 3062	sd->output_queue_tailp = &q->next_sched;
 3063	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3064	local_irq_restore(flags);
 3065}
 3066
 3067void __netif_schedule(struct Qdisc *q)
 3068{
 3069	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 3070		__netif_reschedule(q);
 3071}
 3072EXPORT_SYMBOL(__netif_schedule);
 3073
 3074struct dev_kfree_skb_cb {
 3075	enum skb_free_reason reason;
 3076};
 3077
 3078static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 3079{
 3080	return (struct dev_kfree_skb_cb *)skb->cb;
 3081}
 
 3082
 3083void netif_schedule_queue(struct netdev_queue *txq)
 3084{
 3085	rcu_read_lock();
 3086	if (!netif_xmit_stopped(txq)) {
 3087		struct Qdisc *q = rcu_dereference(txq->qdisc);
 3088
 3089		__netif_schedule(q);
 3090	}
 3091	rcu_read_unlock();
 3092}
 3093EXPORT_SYMBOL(netif_schedule_queue);
 3094
 3095void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 3096{
 3097	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 3098		struct Qdisc *q;
 3099
 3100		rcu_read_lock();
 3101		q = rcu_dereference(dev_queue->qdisc);
 3102		__netif_schedule(q);
 3103		rcu_read_unlock();
 3104	}
 3105}
 3106EXPORT_SYMBOL(netif_tx_wake_queue);
 3107
 3108void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
 3109{
 3110	unsigned long flags;
 3111
 3112	if (unlikely(!skb))
 3113		return;
 3114
 3115	if (likely(refcount_read(&skb->users) == 1)) {
 3116		smp_rmb();
 3117		refcount_set(&skb->users, 0);
 3118	} else if (likely(!refcount_dec_and_test(&skb->users))) {
 3119		return;
 3120	}
 3121	get_kfree_skb_cb(skb)->reason = reason;
 3122	local_irq_save(flags);
 3123	skb->next = __this_cpu_read(softnet_data.completion_queue);
 3124	__this_cpu_write(softnet_data.completion_queue, skb);
 3125	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3126	local_irq_restore(flags);
 3127}
 3128EXPORT_SYMBOL(__dev_kfree_skb_irq);
 3129
 3130void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
 3131{
 3132	if (in_hardirq() || irqs_disabled())
 3133		__dev_kfree_skb_irq(skb, reason);
 3134	else
 3135		dev_kfree_skb(skb);
 3136}
 3137EXPORT_SYMBOL(__dev_kfree_skb_any);
 3138
 3139
 3140/**
 3141 * netif_device_detach - mark device as removed
 3142 * @dev: network device
 3143 *
 3144 * Mark device as removed from system and therefore no longer available.
 3145 */
 3146void netif_device_detach(struct net_device *dev)
 3147{
 3148	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3149	    netif_running(dev)) {
 3150		netif_tx_stop_all_queues(dev);
 3151	}
 3152}
 3153EXPORT_SYMBOL(netif_device_detach);
 3154
 3155/**
 3156 * netif_device_attach - mark device as attached
 3157 * @dev: network device
 3158 *
 3159 * Mark device as attached from system and restart if needed.
 3160 */
 3161void netif_device_attach(struct net_device *dev)
 3162{
 3163	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3164	    netif_running(dev)) {
 3165		netif_tx_wake_all_queues(dev);
 3166		__netdev_watchdog_up(dev);
 3167	}
 3168}
 3169EXPORT_SYMBOL(netif_device_attach);
 3170
 3171/*
 3172 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
 3173 * to be used as a distribution range.
 3174 */
 3175static u16 skb_tx_hash(const struct net_device *dev,
 3176		       const struct net_device *sb_dev,
 3177		       struct sk_buff *skb)
 3178{
 3179	u32 hash;
 3180	u16 qoffset = 0;
 3181	u16 qcount = dev->real_num_tx_queues;
 3182
 3183	if (dev->num_tc) {
 3184		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 3185
 3186		qoffset = sb_dev->tc_to_txq[tc].offset;
 3187		qcount = sb_dev->tc_to_txq[tc].count;
 3188		if (unlikely(!qcount)) {
 3189			net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
 3190					     sb_dev->name, qoffset, tc);
 3191			qoffset = 0;
 3192			qcount = dev->real_num_tx_queues;
 3193		}
 3194	}
 3195
 3196	if (skb_rx_queue_recorded(skb)) {
 3197		hash = skb_get_rx_queue(skb);
 3198		if (hash >= qoffset)
 3199			hash -= qoffset;
 3200		while (unlikely(hash >= qcount))
 3201			hash -= qcount;
 3202		return hash + qoffset;
 3203	}
 3204
 3205	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 3206}
 3207
 3208static void skb_warn_bad_offload(const struct sk_buff *skb)
 3209{
 3210	static const netdev_features_t null_features;
 3211	struct net_device *dev = skb->dev;
 3212	const char *name = "";
 3213
 3214	if (!net_ratelimit())
 3215		return;
 3216
 3217	if (dev) {
 3218		if (dev->dev.parent)
 3219			name = dev_driver_string(dev->dev.parent);
 3220		else
 3221			name = netdev_name(dev);
 3222	}
 3223	skb_dump(KERN_WARNING, skb, false);
 3224	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 3225	     name, dev ? &dev->features : &null_features,
 3226	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 3227}
 3228
 3229/*
 3230 * Invalidate hardware checksum when packet is to be mangled, and
 3231 * complete checksum manually on outgoing path.
 3232 */
 3233int skb_checksum_help(struct sk_buff *skb)
 3234{
 3235	__wsum csum;
 3236	int ret = 0, offset;
 3237
 3238	if (skb->ip_summed == CHECKSUM_COMPLETE)
 3239		goto out_set_summed;
 3240
 3241	if (unlikely(skb_is_gso(skb))) {
 3242		skb_warn_bad_offload(skb);
 3243		return -EINVAL;
 3244	}
 3245
 3246	/* Before computing a checksum, we should make sure no frag could
 3247	 * be modified by an external entity : checksum could be wrong.
 3248	 */
 3249	if (skb_has_shared_frag(skb)) {
 3250		ret = __skb_linearize(skb);
 3251		if (ret)
 3252			goto out;
 3253	}
 3254
 3255	offset = skb_checksum_start_offset(skb);
 3256	ret = -EINVAL;
 3257	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 3258		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
 3259		goto out;
 3260	}
 3261	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 3262
 3263	offset += skb->csum_offset;
 3264	if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) {
 3265		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
 3266		goto out;
 3267	}
 3268	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
 3269	if (ret)
 3270		goto out;
 3271
 3272	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 3273out_set_summed:
 3274	skb->ip_summed = CHECKSUM_NONE;
 3275out:
 3276	return ret;
 3277}
 3278EXPORT_SYMBOL(skb_checksum_help);
 3279
 3280int skb_crc32c_csum_help(struct sk_buff *skb)
 3281{
 3282	__le32 crc32c_csum;
 3283	int ret = 0, offset, start;
 3284
 3285	if (skb->ip_summed != CHECKSUM_PARTIAL)
 3286		goto out;
 3287
 3288	if (unlikely(skb_is_gso(skb)))
 3289		goto out;
 3290
 3291	/* Before computing a checksum, we should make sure no frag could
 3292	 * be modified by an external entity : checksum could be wrong.
 3293	 */
 3294	if (unlikely(skb_has_shared_frag(skb))) {
 3295		ret = __skb_linearize(skb);
 3296		if (ret)
 3297			goto out;
 3298	}
 3299	start = skb_checksum_start_offset(skb);
 3300	offset = start + offsetof(struct sctphdr, checksum);
 3301	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 3302		ret = -EINVAL;
 3303		goto out;
 3304	}
 3305
 3306	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
 3307	if (ret)
 3308		goto out;
 3309
 3310	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 3311						  skb->len - start, ~(__u32)0,
 3312						  crc32c_csum_stub));
 3313	*(__le32 *)(skb->data + offset) = crc32c_csum;
 3314	skb->ip_summed = CHECKSUM_NONE;
 3315	skb->csum_not_inet = 0;
 3316out:
 3317	return ret;
 3318}
 3319
 3320__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 3321{
 3322	__be16 type = skb->protocol;
 3323
 3324	/* Tunnel gso handlers can set protocol to ethernet. */
 3325	if (type == htons(ETH_P_TEB)) {
 3326		struct ethhdr *eth;
 3327
 3328		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 3329			return 0;
 3330
 3331		eth = (struct ethhdr *)skb->data;
 3332		type = eth->h_proto;
 3333	}
 3334
 3335	return __vlan_get_protocol(skb, type, depth);
 3336}
 3337
 3338/* openvswitch calls this on rx path, so we need a different check.
 3339 */
 3340static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 3341{
 3342	if (tx_path)
 3343		return skb->ip_summed != CHECKSUM_PARTIAL &&
 3344		       skb->ip_summed != CHECKSUM_UNNECESSARY;
 3345
 3346	return skb->ip_summed == CHECKSUM_NONE;
 3347}
 3348
 3349/**
 3350 *	__skb_gso_segment - Perform segmentation on skb.
 3351 *	@skb: buffer to segment
 3352 *	@features: features for the output path (see dev->features)
 3353 *	@tx_path: whether it is called in TX path
 3354 *
 3355 *	This function segments the given skb and returns a list of segments.
 3356 *
 3357 *	It may return NULL if the skb requires no segmentation.  This is
 3358 *	only possible when GSO is used for verifying header integrity.
 3359 *
 3360 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
 3361 */
 3362struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 3363				  netdev_features_t features, bool tx_path)
 3364{
 3365	struct sk_buff *segs;
 
 
 
 
 3366
 3367	if (unlikely(skb_needs_check(skb, tx_path))) {
 3368		int err;
 3369
 3370		/* We're going to init ->check field in TCP or UDP header */
 3371		err = skb_cow_head(skb, 0);
 3372		if (err < 0)
 3373			return ERR_PTR(err);
 3374	}
 3375
 3376	/* Only report GSO partial support if it will enable us to
 3377	 * support segmentation on this frame without needing additional
 3378	 * work.
 3379	 */
 3380	if (features & NETIF_F_GSO_PARTIAL) {
 3381		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
 3382		struct net_device *dev = skb->dev;
 3383
 3384		partial_features |= dev->features & dev->gso_partial_features;
 3385		if (!skb_gso_ok(skb, features | partial_features))
 3386			features &= ~NETIF_F_GSO_PARTIAL;
 3387	}
 3388
 3389	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
 3390		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 
 3391
 3392	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
 3393	SKB_GSO_CB(skb)->encap_level = 0;
 3394
 3395	skb_reset_mac_header(skb);
 3396	skb_reset_mac_len(skb);
 
 
 3397
 3398	segs = skb_mac_gso_segment(skb, features);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3399
 3400	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 3401		skb_warn_bad_offload(skb);
 3402
 3403	return segs;
 3404}
 3405EXPORT_SYMBOL(__skb_gso_segment);
 3406
 3407/* Take action when hardware reception checksum errors are detected. */
 3408#ifdef CONFIG_BUG
 3409static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 3410{
 3411	netdev_err(dev, "hw csum failure\n");
 3412	skb_dump(KERN_ERR, skb, true);
 3413	dump_stack();
 3414}
 3415
 3416void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 3417{
 3418	DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
 3419}
 3420EXPORT_SYMBOL(netdev_rx_csum_fault);
 3421#endif
 3422
 3423/* XXX: check that highmem exists at all on the given machine. */
 
 
 
 
 3424static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 3425{
 3426#ifdef CONFIG_HIGHMEM
 3427	int i;
 3428
 3429	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3430		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3431			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
 
 
 
 3432
 3433			if (PageHighMem(skb_frag_page(frag)))
 
 
 
 
 
 
 
 
 3434				return 1;
 3435		}
 3436	}
 3437#endif
 3438	return 0;
 3439}
 3440
 3441/* If MPLS offload request, verify we are testing hardware MPLS features
 3442 * instead of standard features for the netdev.
 3443 */
 3444#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3445static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3446					   netdev_features_t features,
 3447					   __be16 type)
 3448{
 3449	if (eth_p_mpls(type))
 3450		features &= skb->dev->mpls_features;
 3451
 3452	return features;
 3453}
 3454#else
 3455static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3456					   netdev_features_t features,
 3457					   __be16 type)
 3458{
 3459	return features;
 3460}
 3461#endif
 3462
 3463static netdev_features_t harmonize_features(struct sk_buff *skb,
 3464	netdev_features_t features)
 3465{
 3466	__be16 type;
 3467
 3468	type = skb_network_protocol(skb, NULL);
 3469	features = net_mpls_features(skb, features, type);
 3470
 3471	if (skb->ip_summed != CHECKSUM_NONE &&
 3472	    !can_checksum_protocol(features, type)) {
 3473		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3474	}
 3475	if (illegal_highdma(skb->dev, skb))
 3476		features &= ~NETIF_F_SG;
 3477
 3478	return features;
 
 
 3479}
 3480
 3481netdev_features_t passthru_features_check(struct sk_buff *skb,
 3482					  struct net_device *dev,
 3483					  netdev_features_t features)
 
 
 
 
 
 
 3484{
 3485	return features;
 3486}
 3487EXPORT_SYMBOL(passthru_features_check);
 3488
 3489static netdev_features_t dflt_features_check(struct sk_buff *skb,
 3490					     struct net_device *dev,
 3491					     netdev_features_t features)
 3492{
 3493	return vlan_features_check(skb, features);
 3494}
 3495
 3496static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3497					    struct net_device *dev,
 3498					    netdev_features_t features)
 3499{
 3500	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3501
 3502	if (gso_segs > READ_ONCE(dev->gso_max_segs))
 3503		return features & ~NETIF_F_GSO_MASK;
 3504
 3505	if (!skb_shinfo(skb)->gso_type) {
 3506		skb_warn_bad_offload(skb);
 3507		return features & ~NETIF_F_GSO_MASK;
 3508	}
 3509
 3510	/* Support for GSO partial features requires software
 3511	 * intervention before we can actually process the packets
 3512	 * so we need to strip support for any partial features now
 3513	 * and we can pull them back in after we have partially
 3514	 * segmented the frame.
 3515	 */
 3516	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3517		features &= ~dev->gso_partial_features;
 3518
 3519	/* Make sure to clear the IPv4 ID mangling feature if the
 3520	 * IPv4 header has the potential to be fragmented.
 3521	 */
 3522	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3523		struct iphdr *iph = skb->encapsulation ?
 3524				    inner_ip_hdr(skb) : ip_hdr(skb);
 
 
 
 
 3525
 3526		if (!(iph->frag_off & htons(IP_DF)))
 3527			features &= ~NETIF_F_TSO_MANGLEID;
 
 
 
 
 
 
 3528	}
 3529
 3530	return features;
 3531}
 3532
 3533netdev_features_t netif_skb_features(struct sk_buff *skb)
 3534{
 3535	struct net_device *dev = skb->dev;
 3536	netdev_features_t features = dev->features;
 
 
 
 3537
 3538	if (skb_is_gso(skb))
 3539		features = gso_features_check(skb, dev, features);
 
 
 
 
 3540
 3541	/* If encapsulation offload request, verify we are testing
 3542	 * hardware encapsulation features instead of standard
 3543	 * features for the netdev
 3544	 */
 3545	if (skb->encapsulation)
 3546		features &= dev->hw_enc_features;
 3547
 3548	if (skb_vlan_tagged(skb))
 3549		features = netdev_intersect_features(features,
 3550						     dev->vlan_features |
 3551						     NETIF_F_HW_VLAN_CTAG_TX |
 3552						     NETIF_F_HW_VLAN_STAG_TX);
 3553
 3554	if (dev->netdev_ops->ndo_features_check)
 3555		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3556								features);
 3557	else
 3558		features &= dflt_features_check(skb, dev, features);
 3559
 3560	return harmonize_features(skb, features);
 
 
 
 
 
 
 3561}
 3562EXPORT_SYMBOL(netif_skb_features);
 3563
 3564static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3565		    struct netdev_queue *txq, bool more)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3566{
 3567	unsigned int len;
 3568	int rc;
 
 3569
 3570	if (dev_nit_active(dev))
 3571		dev_queue_xmit_nit(skb, dev);
 3572
 3573	len = skb->len;
 3574	trace_net_dev_start_xmit(skb, dev);
 3575	rc = netdev_start_xmit(skb, dev, txq, more);
 3576	trace_net_dev_xmit(skb, rc, dev, len);
 
 
 3577
 3578	return rc;
 3579}
 3580
 3581struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3582				    struct netdev_queue *txq, int *ret)
 3583{
 3584	struct sk_buff *skb = first;
 3585	int rc = NETDEV_TX_OK;
 3586
 3587	while (skb) {
 3588		struct sk_buff *next = skb->next;
 
 
 
 3589
 3590		skb_mark_not_on_list(skb);
 3591		rc = xmit_one(skb, dev, txq, next != NULL);
 3592		if (unlikely(!dev_xmit_complete(rc))) {
 3593			skb->next = next;
 3594			goto out;
 3595		}
 3596
 3597		skb = next;
 3598		if (netif_tx_queue_stopped(txq) && skb) {
 3599			rc = NETDEV_TX_BUSY;
 3600			break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3601		}
 
 
 
 
 
 
 
 3602	}
 3603
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3604out:
 3605	*ret = rc;
 3606	return skb;
 3607}
 3608
 3609static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3610					  netdev_features_t features)
 3611{
 3612	if (skb_vlan_tag_present(skb) &&
 3613	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3614		skb = __vlan_hwaccel_push_inside(skb);
 3615	return skb;
 3616}
 3617
 3618int skb_csum_hwoffload_help(struct sk_buff *skb,
 3619			    const netdev_features_t features)
 
 
 
 
 3620{
 3621	if (unlikely(skb_csum_is_sctp(skb)))
 3622		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3623			skb_crc32c_csum_help(skb);
 3624
 3625	if (features & NETIF_F_HW_CSUM)
 3626		return 0;
 
 
 
 
 3627
 3628	if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
 3629		switch (skb->csum_offset) {
 3630		case offsetof(struct tcphdr, check):
 3631		case offsetof(struct udphdr, check):
 3632			return 0;
 3633		}
 3634	}
 3635
 3636	return skb_checksum_help(skb);
 
 
 
 
 
 
 3637}
 3638EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3639
 3640static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 3641{
 3642	netdev_features_t features;
 3643
 3644	features = netif_skb_features(skb);
 3645	skb = validate_xmit_vlan(skb, features);
 3646	if (unlikely(!skb))
 3647		goto out_null;
 3648
 3649	skb = sk_validate_xmit_skb(skb, dev);
 3650	if (unlikely(!skb))
 3651		goto out_null;
 3652
 3653	if (netif_needs_gso(skb, features)) {
 3654		struct sk_buff *segs;
 3655
 3656		segs = skb_gso_segment(skb, features);
 3657		if (IS_ERR(segs)) {
 3658			goto out_kfree_skb;
 3659		} else if (segs) {
 3660			consume_skb(skb);
 3661			skb = segs;
 3662		}
 3663	} else {
 3664		if (skb_needs_linearize(skb, features) &&
 3665		    __skb_linearize(skb))
 3666			goto out_kfree_skb;
 3667
 3668		/* If packet is not checksummed and device does not
 3669		 * support checksumming for this protocol, complete
 3670		 * checksumming here.
 3671		 */
 3672		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3673			if (skb->encapsulation)
 3674				skb_set_inner_transport_header(skb,
 3675							       skb_checksum_start_offset(skb));
 3676			else
 3677				skb_set_transport_header(skb,
 3678							 skb_checksum_start_offset(skb));
 3679			if (skb_csum_hwoffload_help(skb, features))
 3680				goto out_kfree_skb;
 3681		}
 3682	}
 3683
 3684	skb = validate_xmit_xfrm(skb, features, again);
 3685
 3686	return skb;
 3687
 3688out_kfree_skb:
 3689	kfree_skb(skb);
 3690out_null:
 3691	dev_core_stats_tx_dropped_inc(dev);
 3692	return NULL;
 3693}
 3694
 3695struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 3696{
 3697	struct sk_buff *next, *head = NULL, *tail;
 
 
 
 3698
 3699	for (; skb != NULL; skb = next) {
 3700		next = skb->next;
 3701		skb_mark_not_on_list(skb);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3702
 3703		/* in case skb wont be segmented, point to itself */
 3704		skb->prev = skb;
 3705
 3706		skb = validate_xmit_skb(skb, dev, again);
 3707		if (!skb)
 3708			continue;
 3709
 3710		if (!head)
 3711			head = skb;
 3712		else
 3713			tail->next = skb;
 3714		/* If skb was segmented, skb->prev points to
 3715		 * the last segment. If not, it still contains skb.
 3716		 */
 3717		tail = skb->prev;
 3718	}
 3719	return head;
 3720}
 3721EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3722
 3723static void qdisc_pkt_len_init(struct sk_buff *skb)
 
 3724{
 3725	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 
 3726
 3727	qdisc_skb_cb(skb)->pkt_len = skb->len;
 
 
 
 
 
 
 
 3728
 3729	/* To get more precise estimation of bytes sent on wire,
 3730	 * we add to pkt_len the headers size of all segments
 3731	 */
 3732	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3733		unsigned int hdr_len;
 3734		u16 gso_segs = shinfo->gso_segs;
 3735
 3736		/* mac layer + network layer */
 3737		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
 3738
 3739		/* + transport layer */
 3740		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3741			const struct tcphdr *th;
 3742			struct tcphdr _tcphdr;
 3743
 3744			th = skb_header_pointer(skb, skb_transport_offset(skb),
 3745						sizeof(_tcphdr), &_tcphdr);
 3746			if (likely(th))
 3747				hdr_len += __tcp_hdrlen(th);
 3748		} else {
 3749			struct udphdr _udphdr;
 3750
 3751			if (skb_header_pointer(skb, skb_transport_offset(skb),
 3752					       sizeof(_udphdr), &_udphdr))
 3753				hdr_len += sizeof(struct udphdr);
 3754		}
 3755
 3756		if (shinfo->gso_type & SKB_GSO_DODGY)
 3757			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
 3758						shinfo->gso_size);
 3759
 3760		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 3761	}
 3762}
 3763
 3764static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
 3765			     struct sk_buff **to_free,
 3766			     struct netdev_queue *txq)
 3767{
 3768	int rc;
 3769
 3770	rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
 3771	if (rc == NET_XMIT_SUCCESS)
 3772		trace_qdisc_enqueue(q, txq, skb);
 3773	return rc;
 3774}
 3775
 /*
  * Submit @skb to qdisc @q, which is attached to queue @txq of @dev.
  *
  * Two paths exist:
  *  - TCQ_F_NOLOCK qdiscs: the root lock is never taken; the skb is either
  *    transmitted directly (bypass) or enqueued and the qdisc is run.
  *  - all other qdiscs: work is done under the qdisc root lock, optionally
  *    serialized beforehand on q->busylock when the qdisc looks contended.
  *
  * Returns NET_XMIT_SUCCESS when the skb went out through the bypass path,
  * NET_XMIT_DROP when the qdisc is deactivated, otherwise the verdict of
  * dev_qdisc_enqueue(). Any skbs collected on the local to_free list are
  * released with SKB_DROP_REASON_QDISC_DROP before returning.
  */
 3776static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 3777				 struct net_device *dev,
 3778				 struct netdev_queue *txq)
 3779{
 3780	spinlock_t *root_lock = qdisc_lock(q);
 3781	struct sk_buff *to_free = NULL;
 3782	bool contended;
 3783	int rc;
 3784
 
 3785	qdisc_calculate_pkt_len(skb, q);
 3786
 3787	if (q->flags & TCQ_F_NOLOCK) {
 3788		if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
 3789		    qdisc_run_begin(q)) {
 3790			/* Retest nolock_qdisc_is_empty() within the protection
 3791			 * of q->seqlock to protect from racing with requeuing.
 3792			 */
 3793			if (unlikely(!nolock_qdisc_is_empty(q))) {
 3794				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 3795				__qdisc_run(q);
 3796				qdisc_run_end(q);
 3797
 3798				goto no_lock_out;
 3799			}
 3800
 			/* Still empty: account the skb and send it directly,
 			 * without a root lock (NULL is passed for it).
 			 */
 3801			qdisc_bstats_cpu_update(q, skb);
 3802			if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
 3803			    !nolock_qdisc_is_empty(q))
 3804				__qdisc_run(q);
 3805
 3806			qdisc_run_end(q);
 3807			return NET_XMIT_SUCCESS;
 3808		}
 3809
 		/* Bypass not possible: enqueue and let the qdisc run. */
 3810		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 3811		qdisc_run(q);
 3812
 3813no_lock_out:
 3814		if (unlikely(to_free))
 3815			kfree_skb_list_reason(to_free,
 3816					      SKB_DROP_REASON_QDISC_DROP);
 3817		return rc;
 3818	}
 3819
 3820	/*
 3821	 * Heuristic to force contended enqueues to serialize on a
 3822	 * separate lock before trying to get qdisc main lock.
 3823	 * This permits qdisc->running owner to get the lock more
 3824	 * often and dequeue packets faster.
 3825	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
 3826	 * and then other tasks will only enqueue packets. The packets will be
 3827	 * sent after the qdisc owner is scheduled again. To prevent this
 3828	 * scenario the task always serialize on the lock.
 3829	 */
 3830	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
 3831	if (unlikely(contended))
 3832		spin_lock(&q->busylock);
 3833
 3834	spin_lock(root_lock);
 3835	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 3836		__qdisc_drop(skb, &to_free);
 3837		rc = NET_XMIT_DROP;
 3838	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
 3839		   qdisc_run_begin(q)) {
 3840		/*
 3841		 * This is a work-conserving queue; there are no old skbs
 3842		 * waiting to be sent out; and the qdisc is not running -
 3843		 * xmit the skb directly.
 3844		 */
 
 
 3845
 3846		qdisc_bstats_update(q, skb);
 3847
 3848		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
 			/* busylock is released before running the qdisc;
 			 * clearing 'contended' prevents a second unlock at
 			 * the end of the function.
 			 */
 3849			if (unlikely(contended)) {
 3850				spin_unlock(&q->busylock);
 3851				contended = false;
 3852			}
 3853			__qdisc_run(q);
 3854		}
 
 3855
 3856		qdisc_run_end(q);
 3857		rc = NET_XMIT_SUCCESS;
 3858	} else {
 3859		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 
 3860		if (qdisc_run_begin(q)) {
 3861			if (unlikely(contended)) {
 3862				spin_unlock(&q->busylock);
 3863				contended = false;
 3864			}
 3865			__qdisc_run(q);
 3866			qdisc_run_end(q);
 3867		}
 3868	}
 3869	spin_unlock(root_lock);
 	/* Dropped skbs are freed only after the root lock is released. */
 3870	if (unlikely(to_free))
 3871		kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP);
 3872	if (unlikely(contended))
 3873		spin_unlock(&q->busylock);
 3874	return rc;
 3875}
 3876
 3877#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 3878static void skb_update_prio(struct sk_buff *skb)
 3879{
 3880	const struct netprio_map *map;
 3881	const struct sock *sk;
 3882	unsigned int prioidx;
 3883
 3884	if (skb->priority)
 3885		return;
 3886	map = rcu_dereference_bh(skb->dev->priomap);
 3887	if (!map)
 3888		return;
 3889	sk = skb_to_full_sk(skb);
 3890	if (!sk)
 3891		return;
 3892
 3893	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 3894
 3895	if (prioidx < map->priomap_len)
 3896		skb->priority = map->priomap[prioidx];
 3897}
 3898#else
 3899#define skb_update_prio(skb)
 3900#endif
 3901
 
 
 
 3902/**
 3903 *	dev_loopback_xmit - loop back @skb
 3904 *	@net: network namespace this loopback is happening in
 3905 *	@sk:  sk needed to be a netfilter okfn
 3906 *	@skb: buffer to transmit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3907 */
 3908int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 3909{
 3910	skb_reset_mac_header(skb);
 3911	__skb_pull(skb, skb_network_offset(skb));
 3912	skb->pkt_type = PACKET_LOOPBACK;
 3913	if (skb->ip_summed == CHECKSUM_NONE)
 3914		skb->ip_summed = CHECKSUM_UNNECESSARY;
 3915	DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
 3916	skb_dst_force(skb);
 3917	netif_rx(skb);
 3918	return 0;
 3919}
 3920EXPORT_SYMBOL(dev_loopback_xmit);
 3921
 3922#ifdef CONFIG_NET_EGRESS
 /*
  * Run the egress classifier attached to @dev (if any) on @skb.
  *
  * Returns @skb when transmission should continue. Returns NULL when the
  * classifier verdict disposed of the packet; in that case *@ret holds the
  * NET_XMIT_* code to report. Without CONFIG_NET_CLS_ACT, or without an
  * egress mini qdisc, @skb is returned untouched.
  */
 static struct sk_buff *
 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc *egress_q = rcu_dereference_bh(dev->miniq_egress);
 	struct tcf_result res;
 	int verdict;

 	if (!egress_q)
 		return skb;

 	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 	tc_skb_cb(skb)->mru = 0;
 	tc_skb_cb(skb)->post_ct = false;
 	mini_qdisc_bstats_cpu_update(egress_q, skb);

 	verdict = tcf_classify(skb, egress_q->block, egress_q->filter_list,
 			       &res, false);
 	switch (verdict) {
 	case TC_ACT_SHOT:
 		/* Dropped by policy: account it and free with a reason. */
 		mini_qdisc_qstats_cpu_drop(egress_q);
 		*ret = NET_XMIT_DROP;
 		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
 		return NULL;
 	case TC_ACT_STOLEN:
 	case TC_ACT_QUEUED:
 	case TC_ACT_TRAP:
 		*ret = NET_XMIT_SUCCESS;
 		consume_skb(skb);
 		return NULL;
 	case TC_ACT_REDIRECT:
 		/* No need to push/pop skb's mac_header here on egress! */
 		skb_do_redirect(skb);
 		*ret = NET_XMIT_SUCCESS;
 		return NULL;
 	case TC_ACT_OK:
 	case TC_ACT_RECLASSIFY:
 		skb->tc_index = TC_H_MIN(res.classid);
 		break;
 	default:
 		break;
 	}
 #endif /* CONFIG_NET_CLS_ACT */

 	return skb;
 }
 3966
 /*
  * Return the TX queue of @dev selected by the queue mapping already stored
  * in @skb, after capping it with netdev_cap_txqueue().
  */
 static struct netdev_queue *
 netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
 {
 	int recorded = skb_get_queue_mapping(skb);
 	unsigned int capped = netdev_cap_txqueue(dev, recorded);

 	return netdev_get_tx_queue(dev, capped);
 }
 3974
 3975static bool netdev_xmit_txqueue_skipped(void)
 3976{
 3977	return __this_cpu_read(softnet_data.xmit.skip_txqueue);
 3978}
 3979
 /*
  * Set (@skip true) or clear (@skip false) this CPU's
  * softnet_data.xmit.skip_txqueue flag. __dev_queue_xmit() clears it before
  * running the egress hook and, if it is set afterwards, takes the TX queue
  * from the skb's queue mapping instead of calling netdev_core_pick_tx().
  */
 3980void netdev_xmit_skip_txqueue(bool skip)
 3981{
 3982	__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
 3983}
 3984EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
 3985#endif /* CONFIG_NET_EGRESS */
 3986
 3987#ifdef CONFIG_XPS
 3988static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 3989			       struct xps_dev_maps *dev_maps, unsigned int tci)
 3990{
 3991	int tc = netdev_get_prio_tc_map(dev, skb->priority);
 3992	struct xps_map *map;
 3993	int queue_index = -1;
 3994
 3995	if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
 3996		return queue_index;
 3997
 3998	tci *= dev_maps->num_tc;
 3999	tci += tc;
 4000
 4001	map = rcu_dereference(dev_maps->attr_map[tci]);
 4002	if (map) {
 4003		if (map->len == 1)
 4004			queue_index = map->queues[0];
 4005		else
 4006			queue_index = map->queues[reciprocal_scale(
 4007						skb_get_hash(skb), map->len)];
 4008		if (unlikely(queue_index >= dev->real_num_tx_queues))
 4009			queue_index = -1;
 4010	}
 4011	return queue_index;
 4012}
 4013#endif
 4014
 /*
  * Choose a TX queue for @skb from the XPS maps of @sb_dev.
  *
  * When xps_rxqs_needed is enabled, the RX-queue based map is tried first
  * using the socket's recorded RX queue. If that yields nothing, the CPU
  * based map is tried with skb->sender_cpu - 1 as the row index.
  *
  * Returns the queue index, or -1 when XPS is compiled out, xps_needed is
  * disabled, or neither map provides a queue.
  */
 static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 			 struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
 	struct xps_dev_maps *maps;
 	struct sock *sk = skb->sk;
 	int picked = -1;

 	if (!static_key_false(&xps_needed))
 		return -1;

 	rcu_read_lock();

 	if (static_key_false(&xps_rxqs_needed)) {
 		maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
 		if (maps) {
 			int rxq = sk_rx_queue_get(sk);

 			if (rxq >= 0)
 				picked = __get_xps_queue_idx(dev, skb, maps,
 							     rxq);
 		}
 	}

 	if (picked < 0) {
 		maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
 		if (maps) {
 			unsigned int cpu_row = skb->sender_cpu - 1;

 			picked = __get_xps_queue_idx(dev, skb, maps, cpu_row);
 		}
 	}

 	rcu_read_unlock();

 	return picked;
 #else
 	return -1;
 #endif
 }
 4056
 /*
  * Queue selector that always picks TX queue 0. None of the arguments are
  * examined.
  */
 4057u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 4058		     struct net_device *sb_dev)
 4059{
 4060	return 0;
 4061}
 4062EXPORT_SYMBOL(dev_pick_tx_zero);
 4063
 4064u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 4065		       struct net_device *sb_dev)
 4066{
 4067	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 4068}
 4069EXPORT_SYMBOL(dev_pick_tx_cpu_id);
 4070
 4071u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 4072		     struct net_device *sb_dev)
 4073{
 4074	struct sock *sk = skb->sk;
 4075	int queue_index = sk_tx_queue_get(sk);
 4076
 4077	sb_dev = sb_dev ? : dev;
 4078
 4079	if (queue_index < 0 || skb->ooo_okay ||
 4080	    queue_index >= dev->real_num_tx_queues) {
 4081		int new_index = get_xps_queue(dev, sb_dev, skb);
 4082
 4083		if (new_index < 0)
 4084			new_index = skb_tx_hash(dev, sb_dev, skb);
 4085
 4086		if (queue_index != new_index && sk &&
 4087		    sk_fullsock(sk) &&
 4088		    rcu_access_pointer(sk->sk_dst_cache))
 4089			sk_tx_queue_set(sk, new_index);
 4090
 4091		queue_index = new_index;
 4092	}
 4093
 4094	return queue_index;
 4095}
 4096EXPORT_SYMBOL(netdev_pick_tx);
 4097
 4098struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 4099					 struct sk_buff *skb,
 4100					 struct net_device *sb_dev)
 4101{
 4102	int queue_index = 0;
 4103
 4104#ifdef CONFIG_XPS
 4105	u32 sender_cpu = skb->sender_cpu - 1;
 4106
 4107	if (sender_cpu >= (u32)NR_CPUS)
 4108		skb->sender_cpu = raw_smp_processor_id() + 1;
 4109#endif
 4110
 4111	if (dev->real_num_tx_queues != 1) {
 4112		const struct net_device_ops *ops = dev->netdev_ops;
 4113
 4114		if (ops->ndo_select_queue)
 4115			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 4116		else
 4117			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 4118
 4119		queue_index = netdev_cap_txqueue(dev, queue_index);
 4120	}
 4121
 4122	skb_set_queue_mapping(skb, queue_index);
 4123	return netdev_get_tx_queue(dev, queue_index);
 4124}
 4125
 4126/**
 4127 * __dev_queue_xmit() - transmit a buffer
 4128 * @skb:	buffer to transmit
 4129 * @sb_dev:	subordinate device used for L2 forwarding offload
 4130 *
 4131 * Queue a buffer for transmission to a network device. The caller must
 4132 * have set the device and priority and built the buffer before calling
 4133 * this function. The function can be called from an interrupt.
 4134 *
 4135 * When calling this method, interrupts MUST be enabled. This is because
 4136 * the BH enable code must have IRQs enabled so that it will not deadlock.
 4137 *
 4138 * Regardless of the return value, the skb is consumed, so it is currently
 4139 * difficult to retry a send to this method. (You can bump the ref count
 4140 * before sending to hold a reference for retry if you are careful.)
 4141 *
 4142 * Return:
 4143 * * 0				- buffer successfully transmitted
 4144 * * positive qdisc return code	- NET_XMIT_DROP etc.
 4145 * * negative errno		- other errors
 4146 */
 4147int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 4148{
 4149	struct net_device *dev = skb->dev;
 4150	struct netdev_queue *txq = NULL;
 4151	struct Qdisc *q;
 4152	int rc = -ENOMEM;
 4153	bool again = false;
 4154
 4155	skb_reset_mac_header(skb);
 4156	skb_assert_len(skb);
 4157
 4158	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
 4159		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
 4160
 4161	/* Disable soft irqs for various locks below. Also
 4162	 * stops preemption for RCU.
 4163	 */
 4164	rcu_read_lock_bh();
 4165
 4166	skb_update_prio(skb);
 4167
 4168	qdisc_pkt_len_init(skb);
 
 
 4169#ifdef CONFIG_NET_CLS_ACT
 4170	skb->tc_at_ingress = 0;
 4171#endif
 4172#ifdef CONFIG_NET_EGRESS
 4173	if (static_branch_unlikely(&egress_needed_key)) {
 4174		if (nf_hook_egress_active()) {
 4175			skb = nf_hook_egress(skb, &rc, dev);
 4176			if (!skb)
 4177				goto out;
 4178		}
 4179
 		/* Start from a cleared skip flag; the egress classifier run
 		 * below may set it again via netdev_xmit_skip_txqueue().
 		 */
 4180		netdev_xmit_skip_txqueue(false);
 4181
 4182		nf_skip_egress(skb, true);
 4183		skb = sch_handle_egress(skb, &rc, dev);
 4184		if (!skb)
 4185			goto out;
 4186		nf_skip_egress(skb, false);
 4187
 		/* Flag set: use the queue mapping already stored in the skb
 		 * instead of selecting a queue below.
 		 */
 4188		if (netdev_xmit_txqueue_skipped())
 4189			txq = netdev_tx_queue_mapping(dev, skb);
 4190	}
 4191#endif
 4192	/* If device/qdisc don't need skb->dst, release it right now while
 4193	 * it's hot in this cpu cache.
 4194	 */
 4195	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
 4196		skb_dst_drop(skb);
 4197	else
 4198		skb_dst_force(skb);
 4199
 4200	if (!txq)
 4201		txq = netdev_core_pick_tx(dev, skb, sb_dev);
 4202
 4203	q = rcu_dereference_bh(txq->qdisc);
 4204
 4205	trace_net_dev_queue(skb);
 	/* A qdisc with an enqueue handler takes over from here. */
 4206	if (q->enqueue) {
 4207		rc = __dev_xmit_skb(skb, q, dev, txq);
 4208		goto out;
 4209	}
 4210
 4211	/* The device has no queue. Common case for software devices:
 4212	 * loopback, all the sorts of tunnels...
 4213
 4214	 * Really, it is unlikely that netif_tx_lock protection is necessary
 4215	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
 4216	 * counters.)
 4217	 * However, it is possible, that they rely on protection
 4218	 * made by us here.
 4219
 4220	 * Check this and shot the lock. It is not prone from deadlocks.
 4221	 *Either shot noqueue qdisc, it is even simpler 8)
 4222	 */
 4223	if (dev->flags & IFF_UP) {
 4224		int cpu = smp_processor_id(); /* ok because BHs are off */
 4225
 4226		/* Other cpus might concurrently change txq->xmit_lock_owner
 4227		 * to -1 or to their cpu id, but not to our id.
 4228		 */
 4229		if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
 4230			if (dev_xmit_recursion())
 4231				goto recursion_alert;
 4232
 			/* On NULL the skb is gone already; rc still holds
 			 * its initial -ENOMEM.
 			 */
 4233			skb = validate_xmit_skb(skb, dev, &again);
 4234			if (!skb)
 4235				goto out;
 4236
 4237			HARD_TX_LOCK(dev, txq, cpu);
 4238
 4239			if (!netif_xmit_stopped(txq)) {
 4240				dev_xmit_recursion_inc();
 4241				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
 4242				dev_xmit_recursion_dec();
 4243				if (dev_xmit_complete(rc)) {
 4244					HARD_TX_UNLOCK(dev, txq);
 4245					goto out;
 4246				}
 4247			}
 4248			HARD_TX_UNLOCK(dev, txq);
 4249			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
 4250					     dev->name);
 4251		} else {
 4252			/* Recursion is detected! It is possible,
 4253			 * unfortunately
 4254			 */
 4255recursion_alert:
 4256			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
 4257					     dev->name);
 4258		}
 4259	}
 4260
 	/* Reached when the device is not up, the transmit did not complete,
 	 * or recursion was detected: count the drop and free the skb.
 	 */
 4261	rc = -ENETDOWN;
 4262	rcu_read_unlock_bh();
 4263
 4264	dev_core_stats_tx_dropped_inc(dev);
 4265	kfree_skb_list(skb);
 4266	return rc;
 4267out:
 4268	rcu_read_unlock_bh();
 4269	return rc;
 4270}
 4271EXPORT_SYMBOL(__dev_queue_xmit);
 4272
 4273int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 4274{
 4275	struct net_device *dev = skb->dev;
 4276	struct sk_buff *orig_skb = skb;
 4277	struct netdev_queue *txq;
 4278	int ret = NETDEV_TX_BUSY;
 4279	bool again = false;
 4280
 4281	if (unlikely(!netif_running(dev) ||
 4282		     !netif_carrier_ok(dev)))
 4283		goto drop;
 4284
 4285	skb = validate_xmit_skb_list(skb, dev, &again);
 4286	if (skb != orig_skb)
 4287		goto drop;
 4288
 4289	skb_set_queue_mapping(skb, queue_id);
 4290	txq = skb_get_tx_queue(dev, skb);
 4291
 4292	local_bh_disable();
 4293
 4294	dev_xmit_recursion_inc();
 4295	HARD_TX_LOCK(dev, txq, smp_processor_id());
 4296	if (!netif_xmit_frozen_or_drv_stopped(txq))
 4297		ret = netdev_start_xmit(skb, dev, txq, false);
 4298	HARD_TX_UNLOCK(dev, txq);
 4299	dev_xmit_recursion_dec();
 4300
 4301	local_bh_enable();
 4302	return ret;
 4303drop:
 4304	dev_core_stats_tx_dropped_inc(dev);
 4305	kfree_skb_list(skb);
 4306	return NET_XMIT_DROP;
 4307}
 4308EXPORT_SYMBOL(__dev_direct_xmit);
 4309
 4310/*************************************************************************
 4311 *			Receiver routines
 4312 *************************************************************************/
 4313
 /* Limit on the per-CPU input_pkt_queue length; enqueue_to_backlog() drops
  * packets once the queue is longer than this.
  */
 4314int netdev_max_backlog __read_mostly = 1000;
 4315EXPORT_SYMBOL(netdev_max_backlog);
 4316
 4317int netdev_tstamp_prequeue __read_mostly = 1;
 4318unsigned int sysctl_skb_defer_max __read_mostly = 64;
 4319int netdev_budget __read_mostly = 300;
 4320/* Must be at least 2 jiffies to guarantee 1 jiffy timeout */
 4321unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
 4322int weight_p __read_mostly = 64;           /* old backlog weight */
 4323int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 4324int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 4325int dev_rx_weight __read_mostly = 64;
 4326int dev_tx_weight __read_mostly = 64;
 4327
 4328/* Called with irq disabled */
 4329static inline void ____napi_schedule(struct softnet_data *sd,
 4330				     struct napi_struct *napi)
 4331{
 4332	struct task_struct *thread;
 
 
 4333
 4334	lockdep_assert_irqs_disabled();
 
 
 
 
 
 
 
 
 
 4335
 4336	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
 4337		/* Paired with smp_mb__before_atomic() in
 4338		 * napi_enable()/dev_set_threaded().
 4339		 * Use READ_ONCE() to guarantee a complete
 4340		 * read on napi->thread. Only call
 4341		 * wake_up_process() when it's not NULL.
 4342		 */
 4343		thread = READ_ONCE(napi->thread);
 4344		if (thread) {
 4345			/* Avoid doing set_bit() if the thread is in
 4346			 * INTERRUPTIBLE state, cause napi_thread_wait()
 4347			 * makes sure to proceed with napi polling
 4348			 * if the thread is explicitly woken from here.
 4349			 */
 4350			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
 4351				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
 4352			wake_up_process(thread);
 4353			return;
 4354		}
 4355	}
 4356
 4357	list_add_tail(&napi->poll_list, &sd->poll_list);
 4358	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 
 
 
 
 
 
 
 
 4359}
 
 4360
 4361#ifdef CONFIG_RPS
 4362
 4363/* One global table that all flow-based protocols share. */
 4364struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 4365EXPORT_SYMBOL(rps_sock_flow_table);
 /* Mask applied to a sock flow table entry to extract the CPU number;
  * the remaining bits are compared against the flow hash in get_rps_cpu().
  */
 4366u32 rps_cpu_mask __read_mostly;
 4367EXPORT_SYMBOL(rps_cpu_mask);
 4368
 /* Static keys; presumably enabled while RPS / RFS is configured on some
  * queue -- their users are outside this part of the file.
  */
 4369struct static_key_false rps_needed __read_mostly;
 4370EXPORT_SYMBOL(rps_needed);
 4371struct static_key_false rfs_needed __read_mostly;
 4372EXPORT_SYMBOL(rfs_needed);
 4373
 /*
  * Point the flow described by @rflow at @next_cpu.
  *
  * When @next_cpu is a valid CPU id, rflow->last_qtail is set to that CPU's
  * current input_queue_head. With CONFIG_RFS_ACCEL the driver is also asked
  * (ndo_rx_flow_steer) to steer the flow to the RX queue that
  * cpu_rmap_lookup_index() returns for @next_cpu; on success the flow entry
  * of that queue's table replaces @rflow.
  *
  * Returns the flow entry that now tracks the flow (either @rflow or the
  * entry in the newly selected queue's flow table).
  */
 4374static struct rps_dev_flow *
 4375set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4376	    struct rps_dev_flow *rflow, u16 next_cpu)
 4377{
 4378	if (next_cpu < nr_cpu_ids) {
 4379#ifdef CONFIG_RFS_ACCEL
 4380		struct netdev_rx_queue *rxqueue;
 4381		struct rps_dev_flow_table *flow_table;
 4382		struct rps_dev_flow *old_rflow;
 4383		u32 flow_id;
 4384		u16 rxq_index;
 4385		int rc;
 4386
 4387		/* Should we steer this flow to a different hardware queue? */
 4388		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
 4389		    !(dev->features & NETIF_F_NTUPLE))
 4390			goto out;
 4391		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
 		/* Already arriving on the wanted queue: nothing to steer. */
 4392		if (rxq_index == skb_get_rx_queue(skb))
 4393			goto out;
 4394
 4395		rxqueue = dev->_rx + rxq_index;
 4396		flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4397		if (!flow_table)
 4398			goto out;
 4399		flow_id = skb_get_hash(skb) & flow_table->mask;
 4400		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
 4401							rxq_index, flow_id);
 4402		if (rc < 0)
 4403			goto out;
 		/* A non-negative return value is stored as the filter id in
 		 * the new queue's flow entry; if the old entry carried the
 		 * same id, it is reset to RPS_NO_FILTER.
 		 */
 4404		old_rflow = rflow;
 4405		rflow = &flow_table->flows[flow_id];
 4406		rflow->filter = rc;
 4407		if (old_rflow->filter == rflow->filter)
 4408			old_rflow->filter = RPS_NO_FILTER;
 4409	out:
 4410#endif
 4411		rflow->last_qtail =
 4412			per_cpu(softnet_data, next_cpu).input_queue_head;
 4413	}
 4414
 4415	rflow->cpu = next_cpu;
 4416	return rflow;
 4417}
 4418
 4419/*
 4420 * get_rps_cpu is called from netif_receive_skb and returns the target
 4421 * CPU from the RPS map of the receiving queue for a given skb.
 4422 * rcu_read_lock must be held on entry.
 4423 *
 * Returns -1 when no CPU could be selected (RX queue index out of range,
 * neither a flow table nor an RPS map on the queue, a zero skb hash, or no
 * online candidate CPU). *@rflowp is written only when the CPU was chosen
 * through the flow tables.
 */
 4424static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4425		       struct rps_dev_flow **rflowp)
 4426{
 4427	const struct rps_sock_flow_table *sock_flow_table;
 4428	struct netdev_rx_queue *rxqueue = dev->_rx;
 4429	struct rps_dev_flow_table *flow_table;
 4430	struct rps_map *map;
 4431	int cpu = -1;
 4432	u32 tcpu;
 4433	u32 hash;
 4434
 4435	if (skb_rx_queue_recorded(skb)) {
 4436		u16 index = skb_get_rx_queue(skb);
 4437
 4438		if (unlikely(index >= dev->real_num_rx_queues)) {
 4439			WARN_ONCE(dev->real_num_rx_queues > 1,
 4440				  "%s received packet on queue %u, but number "
 4441				  "of RX queues is %u\n",
 4442				  dev->name, index, dev->real_num_rx_queues);
 4443			goto done;
 4444		}
 4445		rxqueue += index;
 4446	}
 4447
 4448	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
 4449
 4450	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4451	map = rcu_dereference(rxqueue->rps_map);
 4452	if (!flow_table && !map)
 
 
 
 
 
 
 
 
 4453		goto done;
 
 4454
 4455	skb_reset_network_header(skb);
 4456	hash = skb_get_hash(skb);
 4457	if (!hash)
 4458		goto done;
 4459
 
 4460	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 4461	if (flow_table && sock_flow_table) {
 
 4462		struct rps_dev_flow *rflow;
 4463		u32 next_cpu;
 4464		u32 ident;
 4465
 4466		/* First check into global flow table if there is a match */
 4467		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
 		/* The entry matches only if all bits outside rps_cpu_mask
 		 * equal the corresponding hash bits.
 		 */
 4468		if ((ident ^ hash) & ~rps_cpu_mask)
 4469			goto try_rps;
 4470
 4471		next_cpu = ident & rps_cpu_mask;
 4472
 4473		/* OK, now we know there is a match,
 4474		 * we can look at the local (per receive queue) flow table
 4475		 */
 4476		rflow = &flow_table->flows[hash & flow_table->mask];
 4477		tcpu = rflow->cpu;
 4478
 4479		/*
 4480		 * If the desired CPU (where last recvmsg was done) is
 4481		 * different from current CPU (one in the rx-queue flow
 4482		 * table entry), switch if one of the following holds:
 4483		 *   - Current CPU is unset (>= nr_cpu_ids).
 4484		 *   - Current CPU is offline.
 4485		 *   - The current CPU's queue tail has advanced beyond the
 4486		 *     last packet that was enqueued using this table entry.
 4487		 *     This guarantees that all previous packets for the flow
 4488		 *     have been dequeued, thus preserving in order delivery.
 4489		 */
 4490		if (unlikely(tcpu != next_cpu) &&
 4491		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 4492		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
 4493		      rflow->last_qtail)) >= 0)) {
 4494			tcpu = next_cpu;
 4495			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 4496		}
 4497
 4498		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
 4499			*rflowp = rflow;
 4500			cpu = tcpu;
 4501			goto done;
 4502		}
 4503	}
 4504
 4505try_rps:
 
 4506
 	/* Fall back to the queue's RPS map, indexed by the scaled hash. */
 4507	if (map) {
 4508		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 4509		if (cpu_online(tcpu)) {
 4510			cpu = tcpu;
 4511			goto done;
 4512		}
 4513	}
 4514
 4515done:
 4516	return cpu;
 4517}
 4518
 4519#ifdef CONFIG_RFS_ACCEL
 4520
 4521/**
 4522 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4523 * @dev: Device on which the filter was set
 4524 * @rxq_index: RX queue index
 4525 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4526 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4527 *
 4528 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4529 * this function for each installed filter and remove the filters for
 4530 * which it returns %true.
 4531 */
 4532bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4533			 u32 flow_id, u16 filter_id)
 4534{
 4535	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4536	struct rps_dev_flow_table *flow_table;
 4537	struct rps_dev_flow *rflow;
 4538	bool expire = true;
 4539	unsigned int cpu;
 4540
 4541	rcu_read_lock();
 4542	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4543	if (flow_table && flow_id <= flow_table->mask) {
 4544		rflow = &flow_table->flows[flow_id];
 4545		cpu = READ_ONCE(rflow->cpu);
 4546		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 4547		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 4548			   rflow->last_qtail) <
 4549		     (int)(10 * flow_table->mask)))
 4550			expire = false;
 4551	}
 4552	rcu_read_unlock();
 4553	return expire;
 4554}
 4555EXPORT_SYMBOL(rps_may_expire_flow);
 4556
 4557#endif /* CONFIG_RFS_ACCEL */
 4558
 4559/* Called from hardirq (IPI) context */
 4560static void rps_trigger_softirq(void *data)
 4561{
 4562	struct softnet_data *sd = data;
 4563
 4564	____napi_schedule(sd, &sd->backlog);
 4565	sd->received_rps++;
 4566}
 4567
 4568#endif /* CONFIG_RPS */
 4569
 4570/* Called from hardirq (IPI) context */
 4571static void trigger_rx_softirq(void *data)
 4572{
 4573	struct softnet_data *sd = data;
 4574
 4575	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4576	smp_store_release(&sd->defer_ipi_scheduled, 0);
 4577}
 4578
 4579/*
 4580 * Check if this softnet_data structure is another cpu one
 4581 * If yes, queue it to our IPI list and return 1
 4582 * If no, return 0
 4583 */
 4584static int napi_schedule_rps(struct softnet_data *sd)
 4585{
 4586	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 
 4587
 4588#ifdef CONFIG_RPS
 4589	if (sd != mysd) {
 4590		sd->rps_ipi_next = mysd->rps_ipi_list;
 4591		mysd->rps_ipi_list = sd;
 4592
 4593		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4594		return 1;
 4595	}
 4596#endif /* CONFIG_RPS */
 4597	__napi_schedule_irqoff(&mysd->backlog);
 4598	return 0;
 4599}
 4600
 4601#ifdef CONFIG_NET_FLOW_LIMIT
 4602int netdev_flow_limit_table_len __read_mostly = (1 << 12);
 4603#endif
 4604
 4605static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4606{
 4607#ifdef CONFIG_NET_FLOW_LIMIT
 4608	struct sd_flow_limit *fl;
 4609	struct softnet_data *sd;
 4610	unsigned int old_flow, new_flow;
 4611
 4612	if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
 4613		return false;
 4614
 4615	sd = this_cpu_ptr(&softnet_data);
 4616
 4617	rcu_read_lock();
 4618	fl = rcu_dereference(sd->flow_limit);
 4619	if (fl) {
 4620		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4621		old_flow = fl->history[fl->history_head];
 4622		fl->history[fl->history_head] = new_flow;
 4623
 4624		fl->history_head++;
 4625		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4626
 4627		if (likely(fl->buckets[old_flow]))
 4628			fl->buckets[old_flow]--;
 4629
 4630		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4631			fl->count++;
 4632			rcu_read_unlock();
 4633			return true;
 4634		}
 4635	}
 4636	rcu_read_unlock();
 4637#endif
 4638	return false;
 4639}
 4640
 4641/*
 4642 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 4643 * queue (may be a remote CPU queue).
 4644 *
 * @cpu selects the softnet_data whose input_pkt_queue receives @skb.
 * @qtail is passed on to input_queue_tail_incr_save() on success.
 *
 * Returns NET_RX_SUCCESS when the skb was queued. Returns NET_RX_DROP,
 * after freeing the skb and bumping the drop counters, when the device is
 * not running, the queue is longer than netdev_max_backlog, or
 * skb_flow_limit() rejects the skb.
 */
 4645static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 4646			      unsigned int *qtail)
 4647{
 4648	enum skb_drop_reason reason;
 4649	struct softnet_data *sd;
 4650	unsigned long flags;
 4651	unsigned int qlen;
 4652
 4653	reason = SKB_DROP_REASON_NOT_SPECIFIED;
 4654	sd = &per_cpu(softnet_data, cpu);
 4655
 4656	rps_lock_irqsave(sd, &flags);
 4657	if (!netif_running(skb->dev))
 4658		goto drop;
 4659	qlen = skb_queue_len(&sd->input_pkt_queue);
 4660	if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
 		/* Non-empty queue: just append. The empty-queue case below
 		 * jumps back to 'enqueue' after its NAPI scheduling step.
 		 */
 4661		if (qlen) {
 4662enqueue:
 4663			__skb_queue_tail(&sd->input_pkt_queue, skb);
 4664			input_queue_tail_incr_save(sd, qtail);
 4665			rps_unlock_irq_restore(sd, &flags);
 
 4666			return NET_RX_SUCCESS;
 4667		}
 4668
 4669		/* Schedule NAPI for backlog device
 4670		 * We can use non atomic operation since we own the queue lock
 4671		 */
 4672		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
 4673			napi_schedule_rps(sd);
 
 
 4674		goto enqueue;
 4675	}
 4676	reason = SKB_DROP_REASON_CPU_BACKLOG;
 4677
 4678drop:
 4679	sd->dropped++;
 4680	rps_unlock_irq_restore(sd, &flags);
 
 
 4681
 4682	dev_core_stats_rx_dropped_inc(skb->dev);
 4683	kfree_skb_reason(skb, reason);
 4684	return NET_RX_DROP;
 4685}
 4686
 4687static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4688{
 4689	struct net_device *dev = skb->dev;
 4690	struct netdev_rx_queue *rxqueue;
 4691
 4692	rxqueue = dev->_rx;
 4693
 4694	if (skb_rx_queue_recorded(skb)) {
 4695		u16 index = skb_get_rx_queue(skb);
 4696
 4697		if (unlikely(index >= dev->real_num_rx_queues)) {
 4698			WARN_ONCE(dev->real_num_rx_queues > 1,
 4699				  "%s received packet on queue %u, but number "
 4700				  "of RX queues is %u\n",
 4701				  dev->name, index, dev->real_num_rx_queues);
 4702
 4703			return rxqueue; /* Return first rxqueue */
 4704		}
 4705		rxqueue += index;
 4706	}
 4707	return rxqueue;
 4708}
 4709
 /*
  * Run @xdp_prog on @skb in generic (skb based) XDP mode.
  *
  * @xdp is initialised to describe the skb's linear data starting at the
  * MAC header, the program is run, and any changes it made to the data
  * start, data end or Ethernet header are mirrored back into @skb.
  *
  * Returns the program's verdict. For XDP_REDIRECT and XDP_TX the MAC
  * header is pushed back so the skb starts at L2; for XDP_PASS any metadata
  * length is recorded on the skb. Freeing the skb is left to the caller.
  */
 4710u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 4711			     struct bpf_prog *xdp_prog)
 4712{
 4713	void *orig_data, *orig_data_end, *hard_start;
 4714	struct netdev_rx_queue *rxqueue;
 4715	bool orig_bcast, orig_host;
 4716	u32 mac_len, frame_sz;
 4717	__be16 orig_eth_type;
 4718	struct ethhdr *eth;
 4719	u32 metalen, act;
 4720	int off;
 4721
 4722	/* The XDP program wants to see the packet starting at the MAC
 4723	 * header.
 4724	 */
 4725	mac_len = skb->data - skb_mac_header(skb);
 4726	hard_start = skb->data - skb_headroom(skb);
 4727
 4728	/* SKB "head" area always have tailroom for skb_shared_info */
 4729	frame_sz = (void *)skb_end_pointer(skb) - hard_start;
 4730	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 4731
 4732	rxqueue = netif_get_rxqueue(skb);
 4733	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
 4734	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
 4735			 skb_headlen(skb) + mac_len, true);
 4736
 	/* Snapshot the state the program may change, so the differences
 	 * can be detected after it has run.
 	 */
 4737	orig_data_end = xdp->data_end;
 4738	orig_data = xdp->data;
 4739	eth = (struct ethhdr *)xdp->data;
 4740	orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
 4741	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
 4742	orig_eth_type = eth->h_proto;
 4743
 4744	act = bpf_prog_run_xdp(xdp_prog, xdp);
 4745
 4746	/* check if bpf_xdp_adjust_head was used */
 4747	off = xdp->data - orig_data;
 4748	if (off) {
 4749		if (off > 0)
 4750			__skb_pull(skb, off);
 4751		else if (off < 0)
 4752			__skb_push(skb, -off);
 4753
 4754		skb->mac_header += off;
 4755		skb_reset_network_header(skb);
 4756	}
 4757
 4758	/* check if bpf_xdp_adjust_tail was used */
 4759	off = xdp->data_end - orig_data_end;
 4760	if (off != 0) {
 4761		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
 4762		skb->len += off; /* positive on grow, negative on shrink */
 4763	}
 4764
 4765	/* check if XDP changed eth hdr such SKB needs update */
 4766	eth = (struct ethhdr *)xdp->data;
 4767	if ((orig_eth_type != eth->h_proto) ||
 4768	    (orig_host != ether_addr_equal_64bits(eth->h_dest,
 4769						  skb->dev->dev_addr)) ||
 4770	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
 		/* Re-derive pkt_type and protocol from the rewritten
 		 * Ethernet header.
 		 */
 4771		__skb_push(skb, ETH_HLEN);
 4772		skb->pkt_type = PACKET_HOST;
 4773		skb->protocol = eth_type_trans(skb, skb->dev);
 4774	}
 4775
 4776	/* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
 4777	 * before calling us again on redirect path. We do not call do_redirect
 4778	 * as we leave that up to the caller.
 4779	 *
 4780	 * Caller is responsible for managing lifetime of skb (i.e. calling
 4781	 * kfree_skb in response to actions it cannot handle/XDP_DROP).
 4782	 */
 4783	switch (act) {
 4784	case XDP_REDIRECT:
 4785	case XDP_TX:
 4786		__skb_push(skb, mac_len);
 4787		break;
 4788	case XDP_PASS:
 4789		metalen = xdp->data - xdp->data_meta;
 4790		if (metalen)
 4791			skb_metadata_set(skb, metalen);
 4792		break;
 4793	}
 4794
 4795	return act;
 4796}
 4797
 4798static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 4799				     struct xdp_buff *xdp,
 4800				     struct bpf_prog *xdp_prog)
 4801{
 4802	u32 act = XDP_DROP;
 4803
 4804	/* Reinjected packets coming from act_mirred or similar should
 4805	 * not get XDP generic processing.
 4806	 */
 4807	if (skb_is_redirected(skb))
 4808		return XDP_PASS;
 4809
 4810	/* XDP packets must be linear and must have sufficient headroom
 4811	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
 4812	 * native XDP provides, thus we need to do it here as well.
 4813	 */
 4814	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 4815	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 4816		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 4817		int troom = skb->tail + skb->data_len - skb->end;
 4818
 4819		/* In case we have to go down the path and also linearize,
 4820		 * then lets do the pskb_expand_head() work just once here.
 4821		 */
 4822		if (pskb_expand_head(skb,
 4823				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 4824				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
 4825			goto do_drop;
 4826		if (skb_linearize(skb))
 4827			goto do_drop;
 4828	}
 4829
 4830	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
 4831	switch (act) {
 4832	case XDP_REDIRECT:
 4833	case XDP_TX:
 4834	case XDP_PASS:
 4835		break;
 4836	default:
 4837		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
 4838		fallthrough;
 4839	case XDP_ABORTED:
 4840		trace_xdp_exception(skb->dev, xdp_prog, act);
 4841		fallthrough;
 4842	case XDP_DROP:
 4843	do_drop:
 4844		kfree_skb(skb);
 4845		break;
 4846	}
 4847
 4848	return act;
 4849}
 4850
 4851/* When doing generic XDP we have to bypass the qdisc layer and the
 4852 * network taps in order to match in-driver-XDP behavior. This also means
 4853 * that XDP packets are able to starve other packets going through a qdisc,
 4854 * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
 4855 * queues, so they do not have this starvation issue.
 4856 */
 4857void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 4858{
 4859	struct net_device *dev = skb->dev;
 4860	struct netdev_queue *txq;
 4861	bool free_skb = true;
 4862	int cpu, rc;
 4863
 4864	txq = netdev_core_pick_tx(dev, skb, NULL);
 4865	cpu = smp_processor_id();
 4866	HARD_TX_LOCK(dev, txq, cpu);
 4867	if (!netif_xmit_frozen_or_drv_stopped(txq)) {
 4868		rc = netdev_start_xmit(skb, dev, txq, 0);
 4869		if (dev_xmit_complete(rc))
 4870			free_skb = false;
 4871	}
 4872	HARD_TX_UNLOCK(dev, txq);
 4873	if (free_skb) {
 4874		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 4875		dev_core_stats_tx_dropped_inc(dev);
 4876		kfree_skb(skb);
 4877	}
 4878}
 4879
 4880static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 4881
 4882int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 4883{
 4884	if (xdp_prog) {
 4885		struct xdp_buff xdp;
 4886		u32 act;
 4887		int err;
 4888
 4889		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 4890		if (act != XDP_PASS) {
 4891			switch (act) {
 4892			case XDP_REDIRECT:
 4893				err = xdp_do_generic_redirect(skb->dev, skb,
 4894							      &xdp, xdp_prog);
 4895				if (err)
 4896					goto out_redir;
 4897				break;
 4898			case XDP_TX:
 4899				generic_xdp_tx(skb, xdp_prog);
 4900				break;
 4901			}
 4902			return XDP_DROP;
 4903		}
 4904	}
 4905	return XDP_PASS;
 4906out_redir:
 4907	kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
 4908	return XDP_DROP;
 4909}
 4910EXPORT_SYMBOL_GPL(do_xdp_generic);
 4911
 4912static int netif_rx_internal(struct sk_buff *skb)
 4913{
 4914	int ret;
 4915
 4916	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
 4917
 4918	trace_netif_rx(skb);
 4919
 4920#ifdef CONFIG_RPS
 4921	if (static_branch_unlikely(&rps_needed)) {
 4922		struct rps_dev_flow voidflow, *rflow = &voidflow;
 4923		int cpu;
 4924
 
 4925		rcu_read_lock();
 4926
 4927		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 4928		if (cpu < 0)
 4929			cpu = smp_processor_id();
 4930
 4931		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 4932
 4933		rcu_read_unlock();
 
 4934	} else
 4935#endif
 4936	{
 4937		unsigned int qtail;
 4938
 4939		ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
 4940	}
 4941	return ret;
 4942}
 
 4943
 4944/**
 4945 *	__netif_rx	-	Slightly optimized version of netif_rx
 4946 *	@skb: buffer to post
 4947 *
 4948 *	This behaves as netif_rx except that it does not disable bottom halves.
 4949 *	As a result this function may only be invoked from the interrupt context
 4950 *	(either hard or soft interrupt).
 4951 */
 4952int __netif_rx(struct sk_buff *skb)
 4953{
 4954	int ret;
 4955
 4956	lockdep_assert_once(hardirq_count() | softirq_count());
 
 
 
 
 4957
 4958	trace_netif_rx_entry(skb);
 4959	ret = netif_rx_internal(skb);
 4960	trace_netif_rx_exit(ret);
 4961	return ret;
 4962}
 4963EXPORT_SYMBOL(__netif_rx);
 4964
 4965/**
 4966 *	netif_rx	-	post buffer to the network code
 4967 *	@skb: buffer to post
 4968 *
 4969 *	This function receives a packet from a device driver and queues it for
 4970 *	the upper (protocol) levels to process via the backlog NAPI device. It
 4971 *	always succeeds. The buffer may be dropped during processing for
 4972 *	congestion control or by the protocol layers.
 4973 *	The network buffer is passed via the backlog NAPI device. Modern NIC
 4974 *	driver should use NAPI and GRO.
 4975 *	This function can used from interrupt and from process context. The
 4976 *	caller from process context must not disable interrupts before invoking
 4977 *	this function.
 4978 *
 4979 *	return values:
 4980 *	NET_RX_SUCCESS	(no congestion)
 4981 *	NET_RX_DROP     (packet was dropped)
 4982 *
 4983 */
 4984int netif_rx(struct sk_buff *skb)
 4985{
 4986	bool need_bh_off = !(hardirq_count() | softirq_count());
 4987	int ret;
 4988
 4989	if (need_bh_off)
 4990		local_bh_disable();
 4991	trace_netif_rx_entry(skb);
 4992	ret = netif_rx_internal(skb);
 4993	trace_netif_rx_exit(ret);
 4994	if (need_bh_off)
 4995		local_bh_enable();
 4996	return ret;
 4997}
 4998EXPORT_SYMBOL(netif_rx);
 4999
/* Per-CPU transmit cleanup and qdisc run.
 *
 * Works on this CPU's softnet_data in three steps:
 *  1. frees every skb parked on sd->completion_queue,
 *  2. runs every qdisc chained on sd->output_queue,
 *  3. calls xfrm_dev_backlog().
 *
 * @h is not used by the body.
 */
 5000static __latent_entropy void net_tx_action(struct softirq_action *h)
 5001{
 5002	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 5003
 5004	if (sd->completion_queue) {
 5005		struct sk_buff *clist;
 5006
		/* Detach the whole list with local interrupts off, then
		 * walk the private copy with interrupts enabled again.
		 */
 5007		local_irq_disable();
 5008		clist = sd->completion_queue;
 5009		sd->completion_queue = NULL;
 5010		local_irq_enable();
 5011
 5012		while (clist) {
 5013			struct sk_buff *skb = clist;
 5014
 5015			clist = clist->next;
 5016
			/* Warns if the skb still has a non-zero user count */
 5017			WARN_ON(refcount_read(&skb->users));
			/* Pick the tracepoint matching the recorded reason */
 5018			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
 5019				trace_consume_skb(skb);
 5020			else
 5021				trace_kfree_skb(skb, net_tx_action,
 5022						SKB_DROP_REASON_NOT_SPECIFIED);
 5023
			/* Only skbs marked SKB_FCLONE_UNAVAILABLE take the
			 * deferred free path.
			 */
 5024			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 5025				__kfree_skb(skb);
 5026			else
 5027				__kfree_skb_defer(skb);
 5028		}
 5029	}
 5030
 5031	if (sd->output_queue) {
 5032		struct Qdisc *head;
 5033
		/* Same detach pattern; the tail pointer is reset so the
		 * queue is empty again.
		 */
 5034		local_irq_disable();
 5035		head = sd->output_queue;
 5036		sd->output_queue = NULL;
 5037		sd->output_queue_tailp = &sd->output_queue;
 5038		local_irq_enable();
 5039
 5040		rcu_read_lock();
 5041
 5042		while (head) {
 5043			struct Qdisc *q = head;
 5044			spinlock_t *root_lock = NULL;
 5045
 5046			head = head->next_sched;
 5047
 5048			/* We need to make sure head->next_sched is read
 5049			 * before clearing __QDISC_STATE_SCHED
 5050			 */
 5051			smp_mb__before_atomic();
 5052
			/* Qdiscs without TCQ_F_NOLOCK run under their qdisc
			 * lock; root_lock stays NULL for the lockless ones.
			 */
 5053			if (!(q->flags & TCQ_F_NOLOCK)) {
 5054				root_lock = qdisc_lock(q);
 5055				spin_lock(root_lock);
 5056			} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
 5057						     &q->state))) {
 5058				/* There is a synchronize_net() between
 5059				 * STATE_DEACTIVATED flag being set and
 5060				 * qdisc_reset()/some_qdisc_is_busy() in
 5061				 * dev_deactivate(), so we can safely bail out
 5062				 * early here to avoid data race between
 5063				 * qdisc_deactivate() and some_qdisc_is_busy()
 5064				 * for lockless qdisc.
 5065				 */
 5066				clear_bit(__QDISC_STATE_SCHED, &q->state);
 5067				continue;
 5068			}
 5069
 5070			clear_bit(__QDISC_STATE_SCHED, &q->state);
 5071			qdisc_run(q);
 5072			if (root_lock)
 5073				spin_unlock(root_lock);
 5074		}
 5075
 5076		rcu_read_unlock();
 5077	}
 5078
 5079	xfrm_dev_backlog(sd);
 5080}
 5081
 5082#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
 5083/* This hook is defined here for ATM LANE.
 * It is only built when both CONFIG_BRIDGE and CONFIG_ATM_LANE are enabled.
 * The pointer is exported (GPL) and no initializer is given here, so whoever
 * provides the implementation assigns it elsewhere.
 */
 5084int (*br_fdb_test_addr_hook)(struct net_device *dev,
 5085			     unsigned char *addr) __read_mostly;
 5086EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 5087#endif
 5088
/* Run the ingress tc classifier attached to skb->dev, if there is one.
 *
 * @skb:      packet being received
 * @pt_prev:  pending packet_type; if set it is delivered first and cleared
 * @ret:      receives the deliver_skb() result and/or the final NET_RX_*
 *            code when the skb does not continue
 * @orig_dev: device passed through to deliver_skb()
 * @another:  set to true when a redirect returned -EAGAIN, telling the
 *            caller to run another receive round on the same skb
 *
 * Returns @skb when processing should continue, or NULL when the classifier
 * verdict ended processing here (dropped, consumed or redirected).
 * Without CONFIG_NET_CLS_ACT this simply returns @skb.
 */
 5089static inline struct sk_buff *
 5090sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 5091		   struct net_device *orig_dev, bool *another)
 5092{
 5093#ifdef CONFIG_NET_CLS_ACT
 5094	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
 5095	struct tcf_result cl_res;
 5096
 5097	/* If there's at least one ingress present somewhere (so
 5098	 * we get here via enabled static key), remaining devices
 5099	 * that are not configured with an ingress qdisc will bail
 5100	 * out here.
 5101	 */
 5102	if (!miniq)
 5103		return skb;
 5104
	/* Flush the pending handler before classifying the skb */
 5105	if (*pt_prev) {
 5106		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 5107		*pt_prev = NULL;
 5108	}
 5109
	/* Reset the per-skb tc state and account the packet */
 5110	qdisc_skb_cb(skb)->pkt_len = skb->len;
 5111	tc_skb_cb(skb)->mru = 0;
 5112	tc_skb_cb(skb)->post_ct = false;
 5113	skb->tc_at_ingress = 1;
 5114	mini_qdisc_bstats_cpu_update(miniq, skb);
 5115
 5116	switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
 5117	case TC_ACT_OK:
 5118	case TC_ACT_RECLASSIFY:
		/* Accepted: remember the minor class id and carry on */
 5119		skb->tc_index = TC_H_MIN(cl_res.classid);
 5120		break;
 5121	case TC_ACT_SHOT:
		/* Dropped by the classifier: count it and free the skb */
 5122		mini_qdisc_qstats_cpu_drop(miniq);
 5123		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
 5124		*ret = NET_RX_DROP;
 5125		return NULL;
 5126	case TC_ACT_STOLEN:
 5127	case TC_ACT_QUEUED:
 5128	case TC_ACT_TRAP:
		/* Released with consume_skb() and reported as success */
 5129		consume_skb(skb);
 5130		*ret = NET_RX_SUCCESS;
 5131		return NULL;
 5132	case TC_ACT_REDIRECT:
 5133		/* skb_mac_header check was done by cls/act_bpf, so
 5134		 * we can safely push the L2 header back before
 5135		 * redirecting to another netdev
 5136		 */
 5137		__skb_push(skb, skb->mac_len);
 5138		if (skb_do_redirect(skb) == -EAGAIN) {
			/* Undo the push and ask the caller for another round */
 5139			__skb_pull(skb, skb->mac_len);
 5140			*another = true;
 5141			break;
 5142		}
 5143		*ret = NET_RX_SUCCESS;
 5144		return NULL;
 5145	case TC_ACT_CONSUMED:
		/* The skb is not freed on this path */
 5146		*ret = NET_RX_SUCCESS;
 5147		return NULL;
 5148	default:
 5149		break;
 5150	}
 5151#endif /* CONFIG_NET_CLS_ACT */
 5152	return skb;
 5153}
 5154
 5155/**
 5156 *	netdev_is_rx_handler_busy - check if receive handler is registered
 5157 *	@dev: device to check
 5158 *
 5159 *	Check if a receive handler is already registered for a given device.
 5160 *	Return true if there one.
 5161 *
 5162 *	The caller must hold the rtnl_mutex.
 5163 */
 5164bool netdev_is_rx_handler_busy(struct net_device *dev)
 5165{
 5166	ASSERT_RTNL();
 5167	return dev && rtnl_dereference(dev->rx_handler);
 5168}
 5169EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 5170
 5171/**
 5172 *	netdev_rx_handler_register - register receive handler
 5173 *	@dev: device to register a handler for
 5174 *	@rx_handler: receive handler to register
 5175 *	@rx_handler_data: data pointer that is used by rx handler
 5176 *
 5177 *	Register a receive handler for a device. This handler will then be
 5178 *	called from __netif_receive_skb. A negative errno code is returned
 5179 *	on a failure.
 5180 *
 5181 *	The caller must hold the rtnl_mutex.
 5182 *
 5183 *	For a general description of rx_handler, see enum rx_handler_result.
 5184 */
 5185int netdev_rx_handler_register(struct net_device *dev,
 5186			       rx_handler_func_t *rx_handler,
 5187			       void *rx_handler_data)
 5188{
 5189	if (netdev_is_rx_handler_busy(dev))
 
 
 5190		return -EBUSY;
 5191
 5192	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 5193		return -EINVAL;
 5194
 5195	/* Note: rx_handler_data must be set before rx_handler */
 5196	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 5197	rcu_assign_pointer(dev->rx_handler, rx_handler);
 5198
 5199	return 0;
 5200}
 5201EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
 5202
 5203/**
 5204 *	netdev_rx_handler_unregister - unregister receive handler
 5205 *	@dev: device to unregister a handler from
 5206 *
 5207 *	Unregister a receive handler from a device.
 5208 *
 5209 *	The caller must hold the rtnl_mutex.
 5210 */
 5211void netdev_rx_handler_unregister(struct net_device *dev)
 5212{
 5213
 5214	ASSERT_RTNL();
 5215	RCU_INIT_POINTER(dev->rx_handler, NULL);
 5216	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
 5217	 * section has a guarantee to see a non NULL rx_handler_data
 5218	 * as well.
 5219	 */
 5220	synchronize_net();
 5221	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 5222}
 5223EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 5224
 5225/*
 5226 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 5227 * the special handling of PFMEMALLOC skbs.
 5228 */
 5229static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 5230{
 5231	switch (skb->protocol) {
 5232	case htons(ETH_P_ARP):
 5233	case htons(ETH_P_IP):
 5234	case htons(ETH_P_IPV6):
 5235	case htons(ETH_P_8021Q):
 5236	case htons(ETH_P_8021AD):
 5237		return true;
 5238	default:
 5239		return false;
 5240	}
 5241}
 5242
 5243static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 5244			     int *ret, struct net_device *orig_dev)
 5245{
 5246	if (nf_hook_ingress_active(skb)) {
 5247		int ingress_retval;
 5248
 5249		if (*pt_prev) {
 5250			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 5251			*pt_prev = NULL;
 5252		}
 5253
 5254		rcu_read_lock();
 5255		ingress_retval = nf_hook_ingress(skb);
 5256		rcu_read_unlock();
 5257		return ingress_retval;
 5258	}
 5259	return 0;
 5260}
 5261
/* Core per-packet receive path.
 *
 * @pskb:       in/out; on entry the skb to process, on exit whatever skb
 *              pointer processing ended with (it may have been replaced by
 *              a helper or be NULL)
 * @pfmemalloc: when true the packet taps are skipped and only protocols
 *              accepted by skb_pfmemalloc_protocol() are delivered
 * @ppt_prev:   out; set to the last matching packet_type, which is NOT
 *              invoked here - the caller has to call it. Left untouched
 *              when there is no such handler.
 *
 * Delivery runs one handler behind: each new match first delivers the
 * previous one via deliver_skb() and then becomes pt_prev itself.
 *
 * Returns a NET_RX_* code: NET_RX_DROP initially and on drop,
 * NET_RX_SUCCESS when an rx_handler consumed the skb, otherwise whatever
 * the last deliver_skb()/sch_handle_ingress() stored in ret.
 */
 5262static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 5263				    struct packet_type **ppt_prev)
 5264{
 5265	struct packet_type *ptype, *pt_prev;
 5266	rx_handler_func_t *rx_handler;
 5267	struct sk_buff *skb = *pskb;
 5268	struct net_device *orig_dev;
 5269	bool deliver_exact = false;
 5270	int ret = NET_RX_DROP;
 5271	__be16 type;
 5272
 5273	net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
 5274
 5275	trace_netif_receive_skb(skb);
 5276
	/* Remember the receiving device; skb->dev may change below */
 5277	orig_dev = skb->dev;
 5278
 5279	skb_reset_network_header(skb);
 5280	if (!skb_transport_header_was_set(skb))
 5281		skb_reset_transport_header(skb);
 5282	skb_reset_mac_len(skb);
 5283
 5284	pt_prev = NULL;
 5285
	/* Re-entry point: jumped to when the ingress classifier asks for
	 * another round, when vlan_do_receive() returns true, or when the
	 * rx_handler returns RX_HANDLER_ANOTHER.
	 */
 5286another_round:
 5287	skb->skb_iif = skb->dev->ifindex;
 5288
 5289	__this_cpu_inc(softnet_data.processed);
 5290
 5291	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 5292		int ret2;
 5293
 5294		migrate_disable();
 5295		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 5296		migrate_enable();
 5297
		/* Any verdict other than XDP_PASS ends processing here */
 5298		if (ret2 != XDP_PASS) {
 5299			ret = NET_RX_DROP;
 5300			goto out;
 5301		}
 5302	}
 5303
 5304	if (eth_type_vlan(skb->protocol)) {
 5305		skb = skb_vlan_untag(skb);
 5306		if (unlikely(!skb))
 5307			goto out;
 5308	}
 5309
 5310	if (skb_skip_tc_classify(skb))
 5311		goto skip_classify;
 5312
 5313	if (pfmemalloc)
 5314		goto skip_taps;
 5315
	/* Taps: first the global ptype_all list, then the per-device one */
 5316	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 5317		if (pt_prev)
 5318			ret = deliver_skb(skb, pt_prev, orig_dev);
 5319		pt_prev = ptype;
 5320	}
 5321
 5322	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
 5323		if (pt_prev)
 5324			ret = deliver_skb(skb, pt_prev, orig_dev);
 5325		pt_prev = ptype;
 5326	}
 5327
 5328skip_taps:
 5329#ifdef CONFIG_NET_INGRESS
 5330	if (static_branch_unlikely(&ingress_needed_key)) {
 5331		bool another = false;
 5332
 5333		nf_skip_egress(skb, true);
 5334		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
 5335					 &another);
 5336		if (another)
 5337			goto another_round;
		/* NULL means the classifier ended processing; ret is set */
 5338		if (!skb)
 5339			goto out;
 5340
 5341		nf_skip_egress(skb, false);
 5342		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
 5343			goto out;
 5344	}
 5345#endif
 5346	skb_reset_redirect(skb);
 5347skip_classify:
 5348	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 5349		goto drop;
 5350
 5351	if (skb_vlan_tag_present(skb)) {
 5352		if (pt_prev) {
 5353			ret = deliver_skb(skb, pt_prev, orig_dev);
 5354			pt_prev = NULL;
 5355		}
 5356		if (vlan_do_receive(&skb))
 5357			goto another_round;
 5358		else if (unlikely(!skb))
 5359			goto out;
 5360	}
 5361
 5362	rx_handler = rcu_dereference(skb->dev->rx_handler);
 5363	if (rx_handler) {
		/* Flush the pending handler before the rx_handler runs */
 5364		if (pt_prev) {
 5365			ret = deliver_skb(skb, pt_prev, orig_dev);
 5366			pt_prev = NULL;
 5367		}
 5368		switch (rx_handler(&skb)) {
 5369		case RX_HANDLER_CONSUMED:
 5370			ret = NET_RX_SUCCESS;
 5371			goto out;
 5372		case RX_HANDLER_ANOTHER:
 5373			goto another_round;
 5374		case RX_HANDLER_EXACT:
 5375			deliver_exact = true;
 5376			break;
 5377		case RX_HANDLER_PASS:
 5378			break;
 5379		default:
 5380			BUG();
 5381		}
 5382	}
 5383
 5384	if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
 5385check_vlan_id:
 5386		if (skb_vlan_tag_get_id(skb)) {
 5387			/* Vlan id is non 0 and vlan_do_receive() above couldn't
 5388			 * find vlan device.
 5389			 */
 5390			skb->pkt_type = PACKET_OTHERHOST;
 5391		} else if (eth_type_vlan(skb->protocol)) {
 5392			/* Outer header is 802.1P with vlan 0, inner header is
 5393			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 5394			 * not find vlan dev for vlan id 0.
 5395			 */
 5396			__vlan_hwaccel_clear_tag(skb);
 5397			skb = skb_vlan_untag(skb);
 5398			if (unlikely(!skb))
 5399				goto out;
 5400			if (vlan_do_receive(&skb))
 5401				/* After stripping off 802.1P header with vlan 0
 5402				 * vlan dev is found for inner header.
 5403				 */
 5404				goto another_round;
 5405			else if (unlikely(!skb))
 5406				goto out;
 5407			else
 5408				/* We have stripped outer 802.1P vlan 0 header.
 5409				 * But could not find vlan dev.
 5410				 * check again for vlan id to set OTHERHOST.
 5411				 */
 5412				goto check_vlan_id;
 5413		}
 5414		/* Note: we might in the future use prio bits
 5415		 * and set skb->priority like in vlan_do_receive()
 5416		 * For the time being, just ignore Priority Code Point
 5417		 */
 5418		__vlan_hwaccel_clear_tag(skb);
 5419	}
 5420
 5421	type = skb->protocol;
 5422
 5423	/* deliver only exact match when indicated */
 5424	if (likely(!deliver_exact)) {
 5425		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5426				       &ptype_base[ntohs(type) &
 5427						   PTYPE_HASH_MASK]);
 5428	}
 5429
 5430	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5431			       &orig_dev->ptype_specific);
 5432
	/* The skb moved to another device: check its specific handlers too */
 5433	if (unlikely(skb->dev != orig_dev)) {
 5434		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5435				       &skb->dev->ptype_specific);
 5436	}
 5437
 5438	if (pt_prev) {
 5439		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 5440			goto drop;
		/* Hand the last handler back to the caller, uninvoked */
 5441		*ppt_prev = pt_prev;
 5442	} else {
 5443drop:
 5444		if (!deliver_exact)
 5445			dev_core_stats_rx_dropped_inc(skb->dev);
 5446		else
 5447			dev_core_stats_rx_nohandler_inc(skb->dev);
 5448		kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
 5449		/* Jamal, now you will not be able to escape explaining
 5450		 * to me how you were going to use this. :-)
 5451		 */
 5452		ret = NET_RX_DROP;
 5453	}
 5454
 5455out:
 5456	/* The invariant here is that if *ppt_prev is not NULL
 5457	 * then skb should also be non-NULL.
 5458	 *
 5459	 * Apparently *ppt_prev assignment above holds this invariant due to
 5460	 * skb dereferencing near it.
 5461	 */
 5462	*pskb = skb;
 5463	return ret;
 5464}
 5465
 5466static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 5467{
 5468	struct net_device *orig_dev = skb->dev;
 5469	struct packet_type *pt_prev = NULL;
 5470	int ret;
 5471
 5472	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5473	if (pt_prev)
 5474		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 5475					 skb->dev, pt_prev, orig_dev);
 5476	return ret;
 5477}
 5478
 5479/**
 5480 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 5481 *	@skb: buffer to process
 5482 *
 5483 *	More direct receive version of netif_receive_skb().  It should
 5484 *	only be used by callers that have a need to skip RPS and Generic XDP.
 5485 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 5486 *
 5487 *	This function may only be called from softirq context and interrupts
 5488 *	should be enabled.
 5489 *
 5490 *	Return values (usually ignored):
 5491 *	NET_RX_SUCCESS: no congestion
 5492 *	NET_RX_DROP: packet was dropped
 5493 */
 5494int netif_receive_skb_core(struct sk_buff *skb)
 5495{
 5496	int ret;
 
 
 
 5497
 5498	rcu_read_lock();
 5499	ret = __netif_receive_skb_one_core(skb, false);
 5500	rcu_read_unlock();
 
 5501
 5502	return ret;
 5503}
 5504EXPORT_SYMBOL(netif_receive_skb_core);
 5505
 5506static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 5507						  struct packet_type *pt_prev,
 5508						  struct net_device *orig_dev)
 5509{
 5510	struct sk_buff *skb, *next;
 5511
 5512	if (!pt_prev)
 5513		return;
 5514	if (list_empty(head))
 5515		return;
 5516	if (pt_prev->list_func != NULL)
 5517		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 5518				   ip_list_rcv, head, pt_prev, orig_dev);
 5519	else
 5520		list_for_each_entry_safe(skb, next, head, list) {
 5521			skb_list_del_init(skb);
 5522			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 5523		}
 
 
 
 
 5524}
 
 5525
 5526static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 5527{
 5528	/* Fast-path assumptions:
 5529	 * - There is no RX handler.
 5530	 * - Only one packet_type matches.
 5531	 * If either of these fails, we will end up doing some per-packet
 5532	 * processing in-line, then handling the 'last ptype' for the whole
 5533	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 5534	 * because the 'last ptype' must be constant across the sublist, and all
 5535	 * other ptypes are handled per-packet.
 5536	 */
 5537	/* Current (common) ptype of sublist */
 5538	struct packet_type *pt_curr = NULL;
 5539	/* Current (common) orig_dev of sublist */
 5540	struct net_device *od_curr = NULL;
 5541	struct list_head sublist;
 5542	struct sk_buff *skb, *next;
 5543
 5544	INIT_LIST_HEAD(&sublist);
 5545	list_for_each_entry_safe(skb, next, head, list) {
 5546		struct net_device *orig_dev = skb->dev;
 5547		struct packet_type *pt_prev = NULL;
 5548
 5549		skb_list_del_init(skb);
 5550		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5551		if (!pt_prev)
 5552			continue;
 5553		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5554			/* dispatch old sublist */
 5555			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5556			/* start new sublist */
 5557			INIT_LIST_HEAD(&sublist);
 5558			pt_curr = pt_prev;
 5559			od_curr = orig_dev;
 5560		}
 5561		list_add_tail(&skb->list, &sublist);
 5562	}
 
 5563
 5564	/* dispatch final sublist */
 5565	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 
 
 
 
 
 5566}
 5567
 5568static int __netif_receive_skb(struct sk_buff *skb)
 5569{
 5570	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 5571
 5572	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5573		unsigned int noreclaim_flag;
 
 
 5574
 5575		/*
 5576		 * PFMEMALLOC skbs are special, they should
 5577		 * - be delivered to SOCK_MEMALLOC sockets only
 5578		 * - stay away from userspace
 5579		 * - have bounded memory usage
 5580		 *
 5581		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5582		 * context down to all allocation sites.
 5583		 */
 5584		noreclaim_flag = memalloc_noreclaim_save();
 5585		ret = __netif_receive_skb_one_core(skb, true);
 5586		memalloc_noreclaim_restore(noreclaim_flag);
 5587	} else
 5588		ret = __netif_receive_skb_one_core(skb, false);
 5589
 5590	return ret;
 
 5591}
 5592
 5593static void __netif_receive_skb_list(struct list_head *head)
 5594{
 5595	unsigned long noreclaim_flag = 0;
 5596	struct sk_buff *skb, *next;
 5597	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
 5598
 5599	list_for_each_entry_safe(skb, next, head, list) {
 5600		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
 5601			struct list_head sublist;
 5602
 5603			/* Handle the previous sublist */
 5604			list_cut_before(&sublist, head, &skb->list);
 5605			if (!list_empty(&sublist))
 5606				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 5607			pfmemalloc = !pfmemalloc;
 5608			/* See comments in __netif_receive_skb */
 5609			if (pfmemalloc)
 5610				noreclaim_flag = memalloc_noreclaim_save();
 5611			else
 5612				memalloc_noreclaim_restore(noreclaim_flag);
 5613		}
 5614	}
 5615	/* Handle the remaining sublist */
 5616	if (!list_empty(head))
 5617		__netif_receive_skb_list_core(head, pfmemalloc);
 5618	/* Restore pflags */
 5619	if (pfmemalloc)
 5620		memalloc_noreclaim_restore(noreclaim_flag);
 5621}
 
 5622
 5623static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5624{
 5625	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5626	struct bpf_prog *new = xdp->prog;
 5627	int ret = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5628
 5629	switch (xdp->command) {
 5630	case XDP_SETUP_PROG:
 5631		rcu_assign_pointer(dev->xdp_prog, new);
 5632		if (old)
 5633			bpf_prog_put(old);
 
 5634
 5635		if (old && !new) {
 5636			static_branch_dec(&generic_xdp_needed_key);
 5637		} else if (new && !old) {
 5638			static_branch_inc(&generic_xdp_needed_key);
 5639			dev_disable_lro(dev);
 5640			dev_disable_gro_hw(dev);
 5641		}
 5642		break;
 
 
 
 
 
 
 
 
 5643
 5644	default:
 5645		ret = -EINVAL;
 5646		break;
 
 
 
 
 5647	}
 5648
 5649	return ret;
 5650}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5651
 5652static int netif_receive_skb_internal(struct sk_buff *skb)
 5653{
 5654	int ret;
 5655
 5656	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
 5657
 5658	if (skb_defer_rx_timestamp(skb))
 5659		return NET_RX_SUCCESS;
 5660
 5661	rcu_read_lock();
 5662#ifdef CONFIG_RPS
 5663	if (static_branch_unlikely(&rps_needed)) {
 5664		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5665		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5666
 5667		if (cpu >= 0) {
 5668			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5669			rcu_read_unlock();
 5670			return ret;
 
 5671		}
 5672	}
 5673#endif
 5674	ret = __netif_receive_skb(skb);
 5675	rcu_read_unlock();
 5676	return ret;
 
 
 
 
 5677}
 
 5678
 5679void netif_receive_skb_list_internal(struct list_head *head)
 
 5680{
 5681	struct sk_buff *skb, *next;
 5682	struct list_head sublist;
 
 
 
 5683
 5684	INIT_LIST_HEAD(&sublist);
 5685	list_for_each_entry_safe(skb, next, head, list) {
 5686		net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
 5687		skb_list_del_init(skb);
 5688		if (!skb_defer_rx_timestamp(skb))
 5689			list_add_tail(&skb->list, &sublist);
 
 
 
 
 
 5690	}
 5691	list_splice_init(&sublist, head);
 5692
 5693	rcu_read_lock();
 5694#ifdef CONFIG_RPS
 5695	if (static_branch_unlikely(&rps_needed)) {
 5696		list_for_each_entry_safe(skb, next, head, list) {
 5697			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5698			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5699
 5700			if (cpu >= 0) {
 5701				/* Will be handled, remove from list */
 5702				skb_list_del_init(skb);
 5703				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5704			}
 5705		}
 5706	}
 5707#endif
 5708	__netif_receive_skb_list(head);
 5709	rcu_read_unlock();
 5710}
 5711
 5712/**
 5713 *	netif_receive_skb - process receive buffer from network
 5714 *	@skb: buffer to process
 5715 *
 5716 *	netif_receive_skb() is the main receive data processing function.
 5717 *	It always succeeds. The buffer may be dropped during processing
 5718 *	for congestion control or by the protocol layers.
 5719 *
 5720 *	This function may only be called from softirq context and interrupts
 5721 *	should be enabled.
 5722 *
 5723 *	Return values (usually ignored):
 5724 *	NET_RX_SUCCESS: no congestion
 5725 *	NET_RX_DROP: packet was dropped
 5726 */
 5727int netif_receive_skb(struct sk_buff *skb)
 5728{
 5729	int ret;
 
 
 
 
 
 
 
 
 5730
 5731	trace_netif_receive_skb_entry(skb);
 
 
 
 
 
 5732
 5733	ret = netif_receive_skb_internal(skb);
 5734	trace_netif_receive_skb_exit(ret);
 
 
 5735
 5736	return ret;
 5737}
 5738EXPORT_SYMBOL(netif_receive_skb);
 5739
 5740/**
 5741 *	netif_receive_skb_list - process many receive buffers from network
 5742 *	@head: list of skbs to process.
 5743 *
 5744 *	Since return value of netif_receive_skb() is normally ignored, and
 5745 *	wouldn't be meaningful for a list, this function returns void.
 5746 *
 5747 *	This function may only be called from softirq context and interrupts
 5748 *	should be enabled.
 5749 */
 5750void netif_receive_skb_list(struct list_head *head)
 5751{
 5752	struct sk_buff *skb;
 
 
 5753
 5754	if (list_empty(head))
 5755		return;
 5756	if (trace_netif_receive_skb_list_entry_enabled()) {
 5757		list_for_each_entry(skb, head, list)
 5758			trace_netif_receive_skb_list_entry(skb);
 5759	}
 5760	netif_receive_skb_list_internal(head);
 5761	trace_netif_receive_skb_list_exit(0);
 5762}
 5763EXPORT_SYMBOL(netif_receive_skb_list);
 5764
 5765static DEFINE_PER_CPU(struct work_struct, flush_works);
 
 
 5766
 5767/* Network device is going away, flush any packets still pending */
 5768static void flush_backlog(struct work_struct *work)
 
 
 
 5769{
 5770	struct sk_buff *skb, *tmp;
 5771	struct softnet_data *sd;
 
 
 
 
 5772
 5773	local_bh_disable();
 5774	sd = this_cpu_ptr(&softnet_data);
 5775
 5776	rps_lock_irq_disable(sd);
 5777	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 5778		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5779			__skb_unlink(skb, &sd->input_pkt_queue);
 5780			dev_kfree_skb_irq(skb);
 5781			input_queue_head_incr(sd);
 5782		}
 5783	}
 5784	rps_unlock_irq_enable(sd);
 5785
 5786	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 5787		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5788			__skb_unlink(skb, &sd->process_queue);
 5789			kfree_skb(skb);
 5790			input_queue_head_incr(sd);
 5791		}
 5792	}
 5793	local_bh_enable();
 5794}
 
 5795
 5796static bool flush_required(int cpu)
 
 5797{
 5798#if IS_ENABLED(CONFIG_RPS)
 5799	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
 5800	bool do_flush;
 
 5801
 5802	rps_lock_irq_disable(sd);
 
 
 
 
 
 
 
 
 
 5803
 5804	/* as insertion into process_queue happens with the rps lock held,
 5805	 * process_queue access may race only with dequeue
 5806	 */
 5807	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
 5808		   !skb_queue_empty_lockless(&sd->process_queue);
 5809	rps_unlock_irq_enable(sd);
 5810
 5811	return do_flush;
 5812#endif
 5813	/* without RPS we can't safely check input_pkt_queue: during a
 5814	 * concurrent remote skb_queue_splice() we can detect as empty both
 5815	 * input_pkt_queue and process_queue even if the latter could end-up
 5816	 * containing a lot of packets.
 5817	 */
 5818	return true;
 5819}
 
 5820
  /* Queue the per-CPU flush work on every online CPU for which
   * flush_required() is true, then wait for each queued item to finish.
   * The RTNL lock is asserted; the body runs between cpus_read_lock() and
   * cpus_read_unlock().
   */
  5821static void flush_all_backlogs(void)
  5822{
  5823	static cpumask_t flush_cpus;
  5824	unsigned int cpu;
  5825
  5826	/* since we are under rtnl lock protection we can use static data
  5827	 * for the cpumask and avoid allocating on stack the possibly
  5828	 * large mask
  5829	 */
  5830	ASSERT_RTNL();
  5831
  5832	cpus_read_lock();
  5833
  	/* flush_cpus records exactly the CPUs whose work item was queued, so
  	 * that only those are waited for below.
  	 */
  5834	cpumask_clear(&flush_cpus);
  5835	for_each_online_cpu(cpu) {
  5836		if (flush_required(cpu)) {
  5837			queue_work_on(cpu, system_highpri_wq,
  5838				      per_cpu_ptr(&flush_works, cpu));
  5839			cpumask_set_cpu(cpu, &flush_cpus);
  5840		}
  5841	}
  5842
  5843	/* we can have in flight packet[s] on the cpus we are not flushing,
  5844	 * synchronize_net() in unregister_netdevice_many() will take care of
  5845	 * them
  5846	 */
  5847	for_each_cpu(cpu, &flush_cpus)
  5848		flush_work(per_cpu_ptr(&flush_works, cpu));
  5849
  5850	cpus_read_unlock();
  5851}
 5852
 5853static void net_rps_send_ipi(struct softnet_data *remsd)
 5854{
 5855#ifdef CONFIG_RPS
 5856	while (remsd) {
 5857		struct softnet_data *next = remsd->rps_ipi_next;
 
 5858
 5859		if (cpu_online(remsd->cpu))
 5860			smp_call_function_single_async(remsd->cpu, &remsd->csd);
 5861		remsd = next;
 5862	}
 5863#endif
 5864}
 
 5865
 5866/*
 5867 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 5868 * Note: called with local irq disabled, but exits with local irq enabled.
 5869 */
 5870static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 5871{
 5872#ifdef CONFIG_RPS
 5873	struct softnet_data *remsd = sd->rps_ipi_list;
 5874
 5875	if (remsd) {
 5876		sd->rps_ipi_list = NULL;
 5877
 5878		local_irq_enable();
 5879
 5880		/* Send pending IPI's to kick RPS processing on remote cpus. */
 5881		net_rps_send_ipi(remsd);
 
 
 
 
 
 
 
 5882	} else
 5883#endif
 5884		local_irq_enable();
 5885}
 5886
 5887static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 5888{
 5889#ifdef CONFIG_RPS
 5890	return sd->rps_ipi_list != NULL;
 5891#else
 5892	return false;
 5893#endif
 5894}
 5895
  /* Poll callback of the backlog NAPI embedded in softnet_data (@napi is the
   * ->backlog member). Hands queued skbs to __netif_receive_skb() until
   * @quota packets were handled or input_pkt_queue is found empty.
   * Returns the number of packets processed.
   */
  5896static int process_backlog(struct napi_struct *napi, int quota)
  5897{
  5898	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
  5899	bool again = true;
  5900	int work = 0;
  5901
  5902	/* Check if we have pending ipi, it's better to send them now,
  5903	 * not waiting net_rx_action() end.
  5904	 */
  5905	if (sd_has_rps_ipi_waiting(sd)) {
  5906		local_irq_disable();
  5907		net_rps_action_and_irq_enable(sd);
  5908	}
  5909
  5910	napi->weight = READ_ONCE(dev_rx_weight);
  5911	while (again) {
  5912		struct sk_buff *skb;
  5913
  		/* process_queue is drained without taking the rps lock; when
  		 * the quota is reached we return at once and napi->state is
  		 * left untouched.
  		 */
  5914		while ((skb = __skb_dequeue(&sd->process_queue))) {
  5915			rcu_read_lock();
  5916			__netif_receive_skb(skb);
  5917			rcu_read_unlock();
  5918			input_queue_head_incr(sd);
  5919			if (++work >= quota)
  5920				return work;
  5921
  5922		}
  5923
  		/* Refill process_queue from input_pkt_queue under the rps lock,
  		 * or complete the NAPI if there is nothing left.
  		 */
  5924		rps_lock_irq_disable(sd);
  5925		if (skb_queue_empty(&sd->input_pkt_queue)) {
  5926			/*
  5927			 * Inline a custom version of __napi_complete().
  5928			 * only current cpu owns and manipulates this napi,
  5929			 * and NAPI_STATE_SCHED is the only possible flag set
  5930			 * on backlog.
  5931			 * We can use a plain write instead of clear_bit(),
  5932			 * and we dont need an smp_mb() memory barrier.
  5933			 */
  5934			napi->state = 0;
  5935			again = false;
  5936		} else {
  5937			skb_queue_splice_tail_init(&sd->input_pkt_queue,
  5938						   &sd->process_queue);
  5939		}
  5940		rps_unlock_irq_enable(sd);
  5941	}
  5942
  5943	return work;
  5944}
 5945
 5946/**
 5947 * __napi_schedule - schedule for receive
 5948 * @n: entry to schedule
 5949 *
 5950 * The entry's receive function will be scheduled to run.
 5951 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 5952 */
 5953void __napi_schedule(struct napi_struct *n)
 5954{
 5955	unsigned long flags;
 5956
 5957	local_irq_save(flags);
 5958	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 5959	local_irq_restore(flags);
 5960}
 5961EXPORT_SYMBOL(__napi_schedule);
 5962
  5963/**
  5964 *	napi_schedule_prep - check if napi can be scheduled
  5965 *	@n: napi context
  5966 *
  5967 * Test if NAPI routine is already running, and if not mark
  5968 * it as running.  This is used as a condition variable to
  5969 * ensure only one NAPI poll instance runs.  We also make
  5970 * sure there is no pending NAPI disable.
   *
   * Return: false if NAPIF_STATE_DISABLE is set (state left unchanged) or if
   * NAPIF_STATE_SCHED was already set (NAPIF_STATE_MISSED is then set as
   * well); true if this call is the one that set NAPIF_STATE_SCHED.
  5971 */
  5972bool napi_schedule_prep(struct napi_struct *n)
  5973{
  5974	unsigned long new, val = READ_ONCE(n->state);
  5975
  5976	do {
  5977		if (unlikely(val & NAPIF_STATE_DISABLE))
  5978			return false;
  5979		new = val | NAPIF_STATE_SCHED;
  5980
  5981		/* Sets STATE_MISSED bit if STATE_SCHED was already set
  5982		 * This was suggested by Alexander Duyck, as compiler
  5983		 * emits better code than :
  5984		 * if (val & NAPIF_STATE_SCHED)
  5985		 *     new |= NAPIF_STATE_MISSED;
  5986		 */
  5987		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
  5988						   NAPIF_STATE_MISSED;
  5989	} while (!try_cmpxchg(&n->state, &val, new));
  5990
  	/* val holds the state that was replaced by the successful cmpxchg */
  5991	return !(val & NAPIF_STATE_SCHED);
  5992}
  5993EXPORT_SYMBOL(napi_schedule_prep);
 5994
 5995/**
 5996 * __napi_schedule_irqoff - schedule for receive
 5997 * @n: entry to schedule
 5998 *
 5999 * Variant of __napi_schedule() assuming hard irqs are masked.
 6000 *
 6001 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
 6002 * because the interrupt disabled assumption might not be true
 6003 * due to force-threaded interrupts and spinlock substitution.
 6004 */
 6005void __napi_schedule_irqoff(struct napi_struct *n)
 6006{
 6007	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6008		____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6009	else
 6010		__napi_schedule(n);
 6011}
 6012EXPORT_SYMBOL(__napi_schedule_irqoff);
 6013
  /* Finish a poll round on @n after it handled @work_done packets.
   * Returns false when NPSVC or IN_BUSY_POLL is set (nothing is done), when a
   * hard-irq deferral with a non-zero gro_flush_timeout is in effect, or when
   * NAPIF_STATE_MISSED was set (in which case @n is rescheduled).
   * Returns true otherwise.
   */
  6014bool napi_complete_done(struct napi_struct *n, int work_done)
  6015{
  6016	unsigned long flags, val, new, timeout = 0;
  6017	bool ret = true;
  6018
  6019	/*
  6020	 * 1) Don't let napi dequeue from the cpu poll list
  6021	 *    just in case its running on a different cpu.
  6022	 * 2) If we are busy polling, do nothing here, we have
  6023	 *    the guarantee we will be called later.
  6024	 */
  6025	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
  6026				 NAPIF_STATE_IN_BUSY_POLL)))
  6027		return false;
  6028
  	/* Work was done: reload the deferral counter from the device and, if
  	 * GRO holds packets, pick up the flush timeout.
  	 */
  6029	if (work_done) {
  6030		if (n->gro_bitmask)
  6031			timeout = READ_ONCE(n->dev->gro_flush_timeout);
  6032		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
  6033	}
  	/* Consume one deferral; with a non-zero timeout the function will
  	 * report false and rely on the timer armed at the end.
  	 */
  6034	if (n->defer_hard_irqs_count > 0) {
  6035		n->defer_hard_irqs_count--;
  6036		timeout = READ_ONCE(n->dev->gro_flush_timeout);
  6037		if (timeout)
  6038			ret = false;
  6039	}
  6040	if (n->gro_bitmask) {
  6041		/* When the NAPI instance uses a timeout and keeps postponing
  6042		 * it, we need to bound somehow the time packets are kept in
  6043		 * the GRO layer
  6044		 */
  6045		napi_gro_flush(n, !!timeout);
  6046	}
  6047
  6048	gro_normal_list(n);
  6049
  6050	if (unlikely(!list_empty(&n->poll_list))) {
  6051		/* If n->poll_list is not empty, we need to mask irqs */
  6052		local_irq_save(flags);
  6053		list_del_init(&n->poll_list);
  6054		local_irq_restore(flags);
  6055	}
  6056
  6057	val = READ_ONCE(n->state);
  6058	do {
  6059		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
  6060
  6061		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
  6062			      NAPIF_STATE_SCHED_THREADED |
  6063			      NAPIF_STATE_PREFER_BUSY_POLL);
  6064
  6065		/* If STATE_MISSED was set, leave STATE_SCHED set,
  6066		 * because we will call napi->poll() one more time.
  6067		 * This C code was suggested by Alexander Duyck to help gcc.
  6068		 */
  6069		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
  6070						    NAPIF_STATE_SCHED;
  6071	} while (!try_cmpxchg(&n->state, &val, new));
  6072
  6073	if (unlikely(val & NAPIF_STATE_MISSED)) {
  6074		__napi_schedule(n);
  6075		return false;
  6076	}
  6077
  6078	if (timeout)
  6079		hrtimer_start(&n->timer, ns_to_ktime(timeout),
  6080			      HRTIMER_MODE_REL_PINNED);
  6081	return ret;
  6082}
  6083EXPORT_SYMBOL(napi_complete_done);
 6084
 6085/* must be called under rcu_read_lock(), as we dont take a reference */
 6086static struct napi_struct *napi_by_id(unsigned int napi_id)
 6087{
 6088	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 6089	struct napi_struct *napi;
 6090
 6091	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
 6092		if (napi->napi_id == napi_id)
 6093			return napi;
 6094
 6095	return NULL;
 6096}
 6097
 6098#if defined(CONFIG_NET_RX_BUSY_POLL)
 6099
 6100static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
 6101{
 6102	if (!skip_schedule) {
 6103		gro_normal_list(napi);
 6104		__napi_schedule(napi);
 6105		return;
 6106	}
 6107
 6108	if (napi->gro_bitmask) {
 6109		/* flush too old packets
 6110		 * If HZ < 1000, flush all packets.
 6111		 */
 6112		napi_gro_flush(napi, HZ >= 1000);
 6113	}
 6114
 6115	gro_normal_list(napi);
 6116	clear_bit(NAPI_STATE_SCHED, &napi->state);
 6117}
 6118
  /* Leave busy polling on @napi: clear MISSED and IN_BUSY_POLL, optionally arm
   * the NAPI timer (@prefer_busy_poll with both napi_defer_hard_irqs and
   * gro_flush_timeout non-zero), run ->poll() once more with @budget and
   * release @have_poll_lock. If that poll used the whole budget,
   * __busy_poll_stop() either reschedules the NAPI or clears SCHED.
   */
  6119static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
  6120			   u16 budget)
  6121{
  6122	bool skip_schedule = false;
  6123	unsigned long timeout;
  6124	int rc;
  6125
  6126	/* Busy polling means there is a high chance device driver hard irq
  6127	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
  6128	 * set in napi_schedule_prep().
  6129	 * Since we are about to call napi->poll() once more, we can safely
  6130	 * clear NAPI_STATE_MISSED.
  6131	 *
  6132	 * Note: x86 could use a single "lock and ..." instruction
  6133	 * to perform these two clear_bit()
  6134	 */
  6135	clear_bit(NAPI_STATE_MISSED, &napi->state);
  6136	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
  6137
  6138	local_bh_disable();
  6139
  	/* With the timer armed, a poll that exhausts the budget will not be
  	 * rescheduled directly (skip_schedule).
  	 */
  6140	if (prefer_busy_poll) {
  6141		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
  6142		timeout = READ_ONCE(napi->dev->gro_flush_timeout);
  6143		if (napi->defer_hard_irqs_count && timeout) {
  6144			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
  6145			skip_schedule = true;
  6146		}
  6147	}
  6148
  6149	/* All we really want here is to re-enable device interrupts.
  6150	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
  6151	 */
  6152	rc = napi->poll(napi, budget);
  6153	/* We can't gro_normal_list() here, because napi->poll() might have
  6154	 * rearmed the napi (napi_complete_done()) in which case it could
  6155	 * already be running on another CPU.
  6156	 */
  6157	trace_napi_poll(napi, rc, budget);
  6158	netpoll_poll_unlock(have_poll_lock);
  6159	if (rc == budget)
  6160		__busy_poll_stop(napi, skip_schedule);
  6161	local_bh_enable();
  6162}
 6163
  /* Busy-poll the NAPI instance identified by @napi_id.
   * @loop_end: optional predicate called with @loop_end_arg and the start
   *	time; polling stops once it returns true. When NULL, a single
   *	iteration is performed.
   * @loop_end_arg: opaque argument for @loop_end
   * @prefer_busy_poll: if set, NAPI_STATE_PREFER_BUSY_POLL is set whenever
   *	the instance cannot be claimed, and the flag is forwarded to
   *	busy_poll_stop()
   * @budget: budget handed to the poll callback
   *
   * Returns without doing anything if no NAPI with @napi_id exists.
   */
  6164void napi_busy_loop(unsigned int napi_id,
  6165		    bool (*loop_end)(void *, unsigned long),
  6166		    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
  6167{
  6168	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
  6169	int (*napi_poll)(struct napi_struct *napi, int budget);
  6170	void *have_poll_lock = NULL;
  6171	struct napi_struct *napi;
  6172
  6173restart:
  	/* napi_poll stays NULL until this thread has claimed the instance */
  6174	napi_poll = NULL;
  6175
  6176	rcu_read_lock();
  6177
  6178	napi = napi_by_id(napi_id);
  6179	if (!napi)
  6180		goto out;
  6181
  6182	preempt_disable();
  6183	for (;;) {
  6184		int work = 0;
  6185
  6186		local_bh_disable();
  6187		if (!napi_poll) {
  6188			unsigned long val = READ_ONCE(napi->state);
  6189
  6190			/* If multiple threads are competing for this napi,
  6191			 * we avoid dirtying napi->state as much as we can.
  6192			 */
  6193			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
  6194				   NAPIF_STATE_IN_BUSY_POLL)) {
  6195				if (prefer_busy_poll)
  6196					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
  6197				goto count;
  6198			}
  			/* Claim the instance: set IN_BUSY_POLL and SCHED in
  			 * one step, backing off if the state changed meanwhile.
  			 */
  6199			if (cmpxchg(&napi->state, val,
  6200				    val | NAPIF_STATE_IN_BUSY_POLL |
  6201					  NAPIF_STATE_SCHED) != val) {
  6202				if (prefer_busy_poll)
  6203					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
  6204				goto count;
  6205			}
  6206			have_poll_lock = netpoll_poll_lock(napi);
  6207			napi_poll = napi->poll;
  6208		}
  6209		work = napi_poll(napi, budget);
  6210		trace_napi_poll(napi, work, budget);
  6211		gro_normal_list(napi);
  6212count:
  6213		if (work > 0)
  6214			__NET_ADD_STATS(dev_net(napi->dev),
  6215					LINUX_MIB_BUSYPOLLRXPACKETS, work);
  6216		local_bh_enable();
  6217
  6218		if (!loop_end || loop_end(loop_end_arg, start_time))
  6219			break;
  6220
  		/* Yield the CPU: release the instance (if owned), drop
  		 * preemption/RCU protection, then look the NAPI up again.
  		 */
  6221		if (unlikely(need_resched())) {
  6222			if (napi_poll)
  6223				busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
  6224			preempt_enable();
  6225			rcu_read_unlock();
  6226			cond_resched();
  6227			if (loop_end(loop_end_arg, start_time))
  6228				return;
  6229			goto restart;
  6230		}
  6231		cpu_relax();
  6232	}
  6233	if (napi_poll)
  6234		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
  6235	preempt_enable();
  6236out:
  6237	rcu_read_unlock();
  6238}
  6239EXPORT_SYMBOL(napi_busy_loop);
 6240
 6241#endif /* CONFIG_NET_RX_BUSY_POLL */
 6242
 6243static void napi_hash_add(struct napi_struct *napi)
 6244{
 6245	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
 6246		return;
 6247
 6248	spin_lock(&napi_hash_lock);
 6249
 6250	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6251	do {
 6252		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6253			napi_gen_id = MIN_NAPI_ID;
 6254	} while (napi_by_id(napi_gen_id));
 6255	napi->napi_id = napi_gen_id;
 6256
 6257	hlist_add_head_rcu(&napi->napi_hash_node,
 6258			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6259
 6260	spin_unlock(&napi_hash_lock);
 6261}
 6262
 6263/* Warning : caller is responsible to make sure rcu grace period
 6264 * is respected before freeing memory containing @napi
 6265 */
 6266static void napi_hash_del(struct napi_struct *napi)
 6267{
 6268	spin_lock(&napi_hash_lock);
 6269
 6270	hlist_del_init_rcu(&napi->napi_hash_node);
 6271
 6272	spin_unlock(&napi_hash_lock);
 6273}
 6274
 6275static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6276{
 6277	struct napi_struct *napi;
 6278
 6279	napi = container_of(timer, struct napi_struct, timer);
 6280
 6281	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6282	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6283	 */
 6284	if (!napi_disable_pending(napi) &&
 6285	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
 6286		clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6287		__napi_schedule_irqoff(napi);
 6288	}
 6289
 6290	return HRTIMER_NORESTART;
 6291}
 6292
 6293static void init_gro_hash(struct napi_struct *napi)
 6294{
 6295	int i;
 6296
 6297	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6298		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6299		napi->gro_hash[i].count = 0;
 6300	}
 6301	napi->gro_bitmask = 0;
 6302}
 6303
  /* Switch all NAPI instances of @dev to threaded (@threaded true) or
   * non-threaded mode. When enabling, a kthread is created for every
   * instance that lacks one; if a creation fails, the device is put in
   * non-threaded mode and that error is returned.
   * Returns 0 on success or when @dev is already in the requested mode.
   */
  6304int dev_set_threaded(struct net_device *dev, bool threaded)
  6305{
  6306	struct napi_struct *napi;
  6307	int err = 0;
  6308
  6309	if (dev->threaded == threaded)
  6310		return 0;
  6311
  6312	if (threaded) {
  6313		list_for_each_entry(napi, &dev->napi_list, dev_list) {
  6314			if (!napi->thread) {
  6315				err = napi_kthread_create(napi);
  6316				if (err) {
  					/* fall back: the loop below then
  					 * clears THREADED on every instance
  					 */
  6317					threaded = false;
  6318					break;
  6319				}
  6320			}
  6321		}
  6322	}
  6323
  6324	dev->threaded = threaded;
  6325
  6326	/* Make sure kthread is created before THREADED bit
  6327	 * is set.
  6328	 */
  6329	smp_mb__before_atomic();
  6330
  6331	/* Setting/unsetting threaded mode on a napi might not immediately
  6332	 * take effect, if the current napi instance is actively being
  6333	 * polled. In this case, the switch between threaded mode and
  6334	 * softirq mode will happen in the next round of napi_schedule().
  6335	 * This should not cause hiccups/stalls to the live traffic.
  6336	 */
  6337	list_for_each_entry(napi, &dev->napi_list, dev_list) {
  6338		if (threaded)
  6339			set_bit(NAPI_STATE_THREADED, &napi->state);
  6340		else
  6341			clear_bit(NAPI_STATE_THREADED, &napi->state);
  6342	}
  6343
  6344	return err;
  6345}
  6346EXPORT_SYMBOL(dev_set_threaded);
 6347
  /* Initialise @napi for @dev with poll callback @poll and @weight, link it
   * into dev->napi_list and register it through napi_hash_add().
   * The instance is left with SCHED and NPSVC set, which is the state
   * napi_enable() checks for and clears.
   * Warns and returns early if @napi already has NAPI_STATE_LISTED set.
   */
  6348void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
  6349			   int (*poll)(struct napi_struct *, int), int weight)
  6350{
  6351	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
  6352		return;
  6353
  6354	INIT_LIST_HEAD(&napi->poll_list);
  6355	INIT_HLIST_NODE(&napi->napi_hash_node);
  6356	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
  6357	napi->timer.function = napi_watchdog;
  6358	init_gro_hash(napi);
  6359	napi->skb = NULL;
  6360	INIT_LIST_HEAD(&napi->rx_list);
  6361	napi->rx_count = 0;
  6362	napi->poll = poll;
  	/* An oversized weight is only reported (once); it is still used. */
  6363	if (weight > NAPI_POLL_WEIGHT)
  6364		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
  6365				weight);
  6366	napi->weight = weight;
  6367	napi->dev = dev;
  6368#ifdef CONFIG_NETPOLL
  6369	napi->poll_owner = -1;
  6370#endif
  6371	set_bit(NAPI_STATE_SCHED, &napi->state);
  6372	set_bit(NAPI_STATE_NPSVC, &napi->state);
  6373	list_add_rcu(&napi->dev_list, &dev->napi_list);
  6374	napi_hash_add(napi);
  6375	napi_get_frags_check(napi);
  6376	/* Create kthread for this napi if dev->threaded is set.
  6377	 * Clear dev->threaded if kthread creation failed so that
  6378	 * threaded mode will not be enabled in napi_enable().
  6379	 */
  6380	if (dev->threaded && napi_kthread_create(napi))
  6381		dev->threaded = 0;
  6382}
  6383EXPORT_SYMBOL(netif_napi_add_weight);
 6384
  /* Stop @n from being scheduled. Sets NAPI_STATE_DISABLE, sleeps in short
   * intervals until neither SCHED nor NPSVC is set, then atomically takes
   * both bits while dropping THREADED and PREFER_BUSY_POLL, cancels the NAPI
   * timer and finally clears NAPI_STATE_DISABLE again. May sleep.
   */
  6385void napi_disable(struct napi_struct *n)
  6386{
  6387	unsigned long val, new;
  6388
  6389	might_sleep();
  6390	set_bit(NAPI_STATE_DISABLE, &n->state);
  6391
  6392	val = READ_ONCE(n->state);
  6393	do {
  		/* wait for the current owner to release the instance */
  6394		while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
  6395			usleep_range(20, 200);
  6396			val = READ_ONCE(n->state);
  6397		}
  6398
  6399		new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
  6400		new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
  6401	} while (!try_cmpxchg(&n->state, &val, new));
  6402
  6403	hrtimer_cancel(&n->timer);
  6404
  6405	clear_bit(NAPI_STATE_DISABLE, &n->state);
  6406}
  6407EXPORT_SYMBOL(napi_disable);
 6408
 6409/**
 6410 *	napi_enable - enable NAPI scheduling
 6411 *	@n: NAPI context
 6412 *
 6413 * Resume NAPI from being scheduled on this context.
 6414 * Must be paired with napi_disable.
 6415 */
 6416void napi_enable(struct napi_struct *n)
 6417{
 6418	unsigned long new, val = READ_ONCE(n->state);
 6419
 6420	do {
 6421		BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
 6422
 6423		new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
 6424		if (n->dev->threaded && n->thread)
 6425			new |= NAPIF_STATE_THREADED;
 6426	} while (!try_cmpxchg(&n->state, &val, new));
 6427}
 6428EXPORT_SYMBOL(napi_enable);
 6429
 6430static void flush_gro_hash(struct napi_struct *napi)
 6431{
 6432	int i;
 6433
 6434	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6435		struct sk_buff *skb, *n;
 6436
 6437		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6438			kfree_skb(skb);
 6439		napi->gro_hash[i].count = 0;
 6440	}
 6441}
 6442
 6443/* Must be called in process context */
 6444void __netif_napi_del(struct napi_struct *napi)
 6445{
 6446	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
 6447		return;
 6448
 6449	napi_hash_del(napi);
 6450	list_del_rcu(&napi->dev_list);
 6451	napi_free_frags(napi);
 6452
 6453	flush_gro_hash(napi);
 6454	napi->gro_bitmask = 0;
 6455
 6456	if (napi->thread) {
 6457		kthread_stop(napi->thread);
 6458		napi->thread = NULL;
 6459	}
 6460}
 6461EXPORT_SYMBOL(__netif_napi_del);
 6462
  /* Run one ->poll() round on @n, using n->weight as the budget.
   * *@repoll is set to true only when the whole weight was consumed and none
   * of the early-exit cases below applies; it is never cleared here.
   * Returns the work reported by ->poll(), or 0 if SCHED was not set.
   */
  6463static int __napi_poll(struct napi_struct *n, bool *repoll)
  6464{
  6465	int work, weight;
  6466
  6467	weight = n->weight;
  6468
  6469	/* This NAPI_STATE_SCHED test is for avoiding a race
  6470	 * with netpoll's poll_napi().  Only the entity which
  6471	 * obtains the lock and sees NAPI_STATE_SCHED set will
  6472	 * actually make the ->poll() call.  Therefore we avoid
  6473	 * accidentally calling ->poll() when NAPI is not scheduled.
  6474	 */
  6475	work = 0;
  6476	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
  6477		work = n->poll(n, weight);
  6478		trace_napi_poll(n, work, weight);
  6479	}
  6480
  6481	if (unlikely(work > weight))
  6482		netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
  6483				n->poll, work, weight);
  6484
  	/* Budget not exhausted: nothing more to do for this instance here. */
  6485	if (likely(work < weight))
  6486		return work;
  6487
  6488	/* Drivers must not modify the NAPI state if they
  6489	 * consume the entire weight.  In such cases this code
  6490	 * still "owns" the NAPI instance and therefore can
  6491	 * move the instance around on the list at-will.
  6492	 */
  6493	if (unlikely(napi_disable_pending(n))) {
  6494		napi_complete(n);
  6495		return work;
  6496	}
  6497
  6498	/* The NAPI context has more processing work, but busy-polling
  6499	 * is preferred. Exit early.
  6500	 */
  6501	if (napi_prefer_busy_poll(n)) {
  6502		if (napi_complete_done(n, work)) {
  6503			/* If timeout is not set, we need to make sure
  6504			 * that the NAPI is re-scheduled.
  6505			 */
  6506			napi_schedule(n);
  6507		}
  6508		return work;
  6509	}
  6510
  6511	if (n->gro_bitmask) {
  6512		/* flush too old packets
  6513		 * If HZ < 1000, flush all packets.
  6514		 */
  6515		napi_gro_flush(n, HZ >= 1000);
  6516	}
  6517
  6518	gro_normal_list(n);
  6519
  6520	/* Some drivers may have called napi_schedule
  6521	 * prior to exhausting their budget.
  6522	 */
  6523	if (unlikely(!list_empty(&n->poll_list))) {
  6524		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
  6525			     n->dev ? n->dev->name : "backlog");
  6526		return work;
  6527	}
  6528
  6529	*repoll = true;
  6530
  6531	return work;
  6532}
 
 6533
 6534static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 6535{
 6536	bool do_repoll = false;
 
 
 6537	void *have;
 6538	int work;
 6539
 6540	list_del_init(&n->poll_list);
 6541
 6542	have = netpoll_poll_lock(n);
 
 
 6543
 6544	work = __napi_poll(n, &do_repoll);
 
 
 
 
 
 6545
 6546	if (do_repoll)
 6547		list_add_tail(&n->poll_list, repoll);
 6548
 6549	netpoll_poll_unlock(have);
 
 
 
 
 
 6550
 6551	return work;
 6552}
 6553
  /* Sleep until the calling kthread may poll @napi.
   * Returns 0 when NAPI_STATE_SCHED_THREADED is set or after the thread was
   * woken from schedule(), and -1 when kthread_should_stop() is true.
   * The task state is TASK_RUNNING on return in both cases.
   */
  6554static int napi_thread_wait(struct napi_struct *napi)
  6555{
  6556	bool woken = false;
  6557
  6558	set_current_state(TASK_INTERRUPTIBLE);
  6559
  6560	while (!kthread_should_stop()) {
  6561		/* Testing SCHED_THREADED bit here to make sure the current
  6562		 * kthread owns this napi and could poll on this napi.
  6563		 * Testing SCHED bit is not enough because SCHED bit might be
  6564		 * set by some other busy poll thread or by napi_disable().
  6565		 */
  6566		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
  6567			WARN_ON(!list_empty(&napi->poll_list));
  6568			__set_current_state(TASK_RUNNING);
  6569			return 0;
  6570		}
  6571
  6572		schedule();
  6573		/* woken being true indicates this thread owns this napi. */
  6574		woken = true;
  6575		set_current_state(TASK_INTERRUPTIBLE);
  6576	}
  6577	__set_current_state(TASK_RUNNING);
  6578
  6579	return -1;
  6580}
 6581
 6582static int napi_threaded_poll(void *data)
 6583{
 6584	struct napi_struct *napi = data;
 6585	void *have;
 6586
 6587	while (!napi_thread_wait(napi)) {
 6588		for (;;) {
 6589			bool repoll = false;
 6590
 6591			local_bh_disable();
 6592
 6593			have = netpoll_poll_lock(napi);
 6594			__napi_poll(napi, &repoll);
 6595			netpoll_poll_unlock(have);
 6596
 6597			local_bh_enable();
 6598
 6599			if (!repoll)
 6600				break;
 6601
 6602			cond_resched();
 6603		}
 6604	}
 6605	return 0;
 6606}
 6607
 6608static void skb_defer_free_flush(struct softnet_data *sd)
 6609{
 6610	struct sk_buff *skb, *next;
 6611	unsigned long flags;
 6612
 6613	/* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
 6614	if (!READ_ONCE(sd->defer_list))
 6615		return;
 6616
 6617	spin_lock_irqsave(&sd->defer_lock, flags);
 6618	skb = sd->defer_list;
 6619	sd->defer_list = NULL;
 6620	sd->defer_count = 0;
 6621	spin_unlock_irqrestore(&sd->defer_lock, flags);
 6622
 6623	while (skb != NULL) {
 6624		next = skb->next;
 6625		napi_consume_skb(skb, 1);
 6626		skb = next;
 6627	}
 6628}
 6629
  /* Polls the NAPI instances queued on this CPU's softnet_data poll_list
   * until the list is empty, netdev_budget is used up or netdev_budget_usecs
   * have elapsed. Instances that are left over or asked for a repoll are put
   * back on sd->poll_list and NET_RX_SOFTIRQ is raised again.
   * @h is not referenced.
   */
  6630static __latent_entropy void net_rx_action(struct softirq_action *h)
  6631{
  6632	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
  6633	unsigned long time_limit = jiffies +
  6634		usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
  6635	int budget = READ_ONCE(netdev_budget);
  6636	LIST_HEAD(list);
  6637	LIST_HEAD(repoll);
  6638
  	/* Take a private copy of the poll list with irqs off. */
  6639	local_irq_disable();
  6640	list_splice_init(&sd->poll_list, &list);
  6641	local_irq_enable();
  6642
  6643	for (;;) {
  6644		struct napi_struct *n;
  6645
  6646		skb_defer_free_flush(sd);
  6647
  6648		if (list_empty(&list)) {
  			/* Nothing to requeue and no IPI pending: skip the
  			 * irq-disabled tail entirely.
  			 */
  6649			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
  6650				goto end;
  6651			break;
  6652		}
  6653
  6654		n = list_first_entry(&list, struct napi_struct, poll_list);
  6655		budget -= napi_poll(n, &repoll);
  6656
  6657		/* If softirq window is exhausted then punt.
  6658		 * Allow this to run for 2 jiffies since which will allow
  6659		 * an average latency of 1.5/HZ.
  6660		 */
  6661		if (unlikely(budget <= 0 ||
  6662			     time_after_eq(jiffies, time_limit))) {
  6663			sd->time_squeeze++;
  6664			break;
  6665		}
  6666	}
  6667
  6668	local_irq_disable();
  6669
  	/* Resulting order on sd->poll_list: unpolled entries, then entries
  	 * queued meanwhile, then the ones that asked for a repoll.
  	 */
  6670	list_splice_tail_init(&sd->poll_list, &list);
  6671	list_splice_tail(&repoll, &list);
  6672	list_splice(&list, &sd->poll_list);
  6673	if (!list_empty(&sd->poll_list))
  6674		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
  6675
  6676	net_rps_action_and_irq_enable(sd);
  6677end:;
  6678}
 6679
  /* One entry of a device's adjacency list (dev->adj_list.upper or
   * dev->adj_list.lower), describing a neighbouring device.
   */
  6680struct netdev_adjacent {
  	/* the adjacent device this entry refers to */
  6681	struct net_device *dev;
  	/* NOTE(review): presumably tracks the reference held on @dev; its
  	 * users are not visible in this part of the file - confirm.
  	 */
  6682	netdevice_tracker dev_tracker;
  6683
  6684	/* upper master flag, there can only be one master device per list */
  6685	bool master;
  6686
  6687	/* lookup ignore flag */
  6688	bool ignore;
  6689
  6690	/* counter for the number of times this device was added to us */
  6691	u16 ref_nr;
  6692
  6693	/* private field for the users */
  6694	void *private;
  6695
  	/* linkage into the adjacency list this entry sits on */
  6696	struct list_head list;
  	/* NOTE(review): likely used for RCU-deferred freeing; the free path
  	 * is not visible here - confirm.
  	 */
  6697	struct rcu_head rcu;
  6698};
 6699
 6700static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 6701						 struct list_head *adj_list)
 6702{
 6703	struct netdev_adjacent *adj;
 6704
 6705	list_for_each_entry(adj, adj_list, list) {
 6706		if (adj->dev == adj_dev)
 6707			return adj;
 6708	}
 6709	return NULL;
 6710}
 6711
 6712static int ____netdev_has_upper_dev(struct net_device *upper_dev,
 6713				    struct netdev_nested_priv *priv)
 6714{
 6715	struct net_device *dev = (struct net_device *)priv->data;
 6716
 6717	return upper_dev == dev;
 6718}
 6719
 6720/**
 6721 * netdev_has_upper_dev - Check if device is linked to an upper device
 6722 * @dev: device
 6723 * @upper_dev: upper device to check
 6724 *
 6725 * Find out if a device is linked to specified upper device and return true
 6726 * in case it is. Note that this checks only immediate upper device,
 6727 * not through a complete stack of devices. The caller must hold the RTNL lock.
 6728 */
 6729bool netdev_has_upper_dev(struct net_device *dev,
 6730			  struct net_device *upper_dev)
 6731{
 6732	struct netdev_nested_priv priv = {
 6733		.data = (void *)upper_dev,
 6734	};
 6735
 6736	ASSERT_RTNL();
 6737
 6738	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6739					     &priv);
 6740}
 6741EXPORT_SYMBOL(netdev_has_upper_dev);
 6742
 6743/**
 6744 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 6745 * @dev: device
 6746 * @upper_dev: upper device to check
 6747 *
 6748 * Find out if a device is linked to specified upper device and return true
 6749 * in case it is. Note that this checks the entire upper device chain.
 6750 * The caller must hold rcu lock.
 6751 */
 6752
 6753bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 6754				  struct net_device *upper_dev)
 6755{
 6756	struct netdev_nested_priv priv = {
 6757		.data = (void *)upper_dev,
 6758	};
 6759
 6760	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6761					       &priv);
 6762}
 6763EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 6764
 6765/**
 6766 * netdev_has_any_upper_dev - Check if device is linked to some device
 6767 * @dev: device
 6768 *
 6769 * Find out if a device is linked to an upper device and return true in case
 6770 * it is. The caller must hold the RTNL lock.
 6771 */
 6772bool netdev_has_any_upper_dev(struct net_device *dev)
 6773{
 6774	ASSERT_RTNL();
 6775
 6776	return !list_empty(&dev->adj_list.upper);
 6777}
 6778EXPORT_SYMBOL(netdev_has_any_upper_dev);
 6779
 6780/**
 6781 * netdev_master_upper_dev_get - Get master upper device
 6782 * @dev: device
 6783 *
 6784 * Find a master upper device and return pointer to it or NULL in case
 6785 * it's not there. The caller must hold the RTNL lock.
 6786 */
 6787struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 6788{
 6789	struct netdev_adjacent *upper;
 6790
 6791	ASSERT_RTNL();
 6792
 6793	if (list_empty(&dev->adj_list.upper))
 6794		return NULL;
 6795
 6796	upper = list_first_entry(&dev->adj_list.upper,
 6797				 struct netdev_adjacent, list);
 6798	if (likely(upper->master))
 6799		return upper->dev;
 6800	return NULL;
 6801}
 6802EXPORT_SYMBOL(netdev_master_upper_dev_get);
 6803
 6804static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 6805{
 6806	struct netdev_adjacent *upper;
 
 6807
 6808	ASSERT_RTNL();
 
 
 6809
 6810	if (list_empty(&dev->adj_list.upper))
 6811		return NULL;
 6812
 6813	upper = list_first_entry(&dev->adj_list.upper,
 6814				 struct netdev_adjacent, list);
 6815	if (likely(upper->master) && !upper->ignore)
 6816		return upper->dev;
 6817	return NULL;
 6818}
 6819
 6820/**
 6821 * netdev_has_any_lower_dev - Check if device is linked to some device
 6822 * @dev: device
 6823 *
 6824 * Find out if a device is linked to a lower device and return true in case
 6825 * it is. The caller must hold the RTNL lock.
 6826 */
 6827static bool netdev_has_any_lower_dev(struct net_device *dev)
 6828{
 6829	ASSERT_RTNL();
 6830
 6831	return !list_empty(&dev->adj_list.lower);
 
 
 6832}
 6833
 6834void *netdev_adjacent_get_private(struct list_head *adj_list)
 6835{
 6836	struct netdev_adjacent *adj;
 6837
 6838	adj = list_entry(adj_list, struct netdev_adjacent, list);
 6839
 6840	return adj->private;
 6841}
 6842EXPORT_SYMBOL(netdev_adjacent_get_private);
 6843
 6844/**
 6845 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 6846 * @dev: device
 6847 * @iter: list_head ** of the current position
 6848 *
 6849 * Gets the next device from the dev's upper list, starting from iter
 6850 * position. The caller must hold RCU read lock.
 6851 */
 6852struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 6853						 struct list_head **iter)
 6854{
 6855	struct netdev_adjacent *upper;
 6856
 6857	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6858
 6859	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6860
 6861	if (&upper->list == &dev->adj_list.upper)
 6862		return NULL;
 6863
 6864	*iter = &upper->list;
 6865
 6866	return upper->dev;
 6867}
 6868EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 6869
 6870static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 6871						  struct list_head **iter,
 6872						  bool *ignore)
 6873{
 6874	struct netdev_adjacent *upper;
 
 
 
 
 
 6875
 6876	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 
 
 6877
 6878	if (&upper->list == &dev->adj_list.upper)
 6879		return NULL;
 6880
 6881	*iter = &upper->list;
 6882	*ignore = upper->ignore;
 6883
 6884	return upper->dev;
 6885}
 
 6886
 6887static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 6888						    struct list_head **iter)
 6889{
 6890	struct netdev_adjacent *upper;
 6891
 6892	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6893
 6894	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6895
 6896	if (&upper->list == &dev->adj_list.upper)
 6897		return NULL;
 6898
 6899	*iter = &upper->list;
 6900
 6901	return upper->dev;
 6902}
 6903
 6904static int __netdev_walk_all_upper_dev(struct net_device *dev,
 6905				       int (*fn)(struct net_device *dev,
 6906					 struct netdev_nested_priv *priv),
 6907				       struct netdev_nested_priv *priv)
 6908{
 6909	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 6910	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 6911	int ret, cur = 0;
 6912	bool ignore;
 6913
 6914	now = dev;
 6915	iter = &dev->adj_list.upper;
 6916
 6917	while (1) {
 6918		if (now != dev) {
 6919			ret = fn(now, priv);
 6920			if (ret)
 6921				return ret;
 6922		}
 6923
 6924		next = NULL;
 6925		while (1) {
 6926			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 6927			if (!udev)
 6928				break;
 6929			if (ignore)
 6930				continue;
 6931
 6932			next = udev;
 6933			niter = &udev->adj_list.upper;
 6934			dev_stack[cur] = now;
 6935			iter_stack[cur++] = iter;
 6936			break;
 6937		}
 6938
 6939		if (!next) {
 6940			if (!cur)
 6941				return 0;
 6942			next = dev_stack[--cur];
 6943			niter = iter_stack[cur];
 6944		}
 6945
 6946		now = next;
 6947		iter = niter;
 6948	}
 6949
 6950	return 0;
 6951}
 
 
 6952
 6953int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 6954				  int (*fn)(struct net_device *dev,
 6955					    struct netdev_nested_priv *priv),
 6956				  struct netdev_nested_priv *priv)
 6957{
 6958	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 6959	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 6960	int ret, cur = 0;
 6961
 6962	now = dev;
 6963	iter = &dev->adj_list.upper;
 6964
 6965	while (1) {
 6966		if (now != dev) {
 6967			ret = fn(now, priv);
 6968			if (ret)
 6969				return ret;
 6970		}
 6971
 6972		next = NULL;
 6973		while (1) {
 6974			udev = netdev_next_upper_dev_rcu(now, &iter);
 6975			if (!udev)
 6976				break;
 6977
 6978			next = udev;
 6979			niter = &udev->adj_list.upper;
 6980			dev_stack[cur] = now;
 6981			iter_stack[cur++] = iter;
 6982			break;
 6983		}
 6984
 6985		if (!next) {
 6986			if (!cur)
 6987				return 0;
 6988			next = dev_stack[--cur];
 6989			niter = iter_stack[cur];
 6990		}
 6991
 6992		now = next;
 6993		iter = niter;
 6994	}
 6995
 6996	return 0;
 6997}
 6998EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 6999
 7000static bool __netdev_has_upper_dev(struct net_device *dev,
 7001				   struct net_device *upper_dev)
 7002{
 7003	struct netdev_nested_priv priv = {
 7004		.flags = 0,
 7005		.data = (void *)upper_dev,
 7006	};
 7007
 7008	ASSERT_RTNL();
 7009
 7010	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 7011					   &priv);
 7012}
 7013
 7014/**
 7015 * netdev_lower_get_next_private - Get the next ->private from the
 7016 *				   lower neighbour list
 7017 * @dev: device
 7018 * @iter: list_head ** of the current position
 7019 *
 7020 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7021 * list, starting from iter position. The caller must hold either hold the
 7022 * RTNL lock or its own locking that guarantees that the neighbour lower
 7023 * list will remain unchanged.
 7024 */
 7025void *netdev_lower_get_next_private(struct net_device *dev,
 7026				    struct list_head **iter)
 7027{
 7028	struct netdev_adjacent *lower;
 
 
 
 
 
 
 
 
 
 
 7029
 7030	lower = list_entry(*iter, struct netdev_adjacent, list);
 7031
 7032	if (&lower->list == &dev->adj_list.lower)
 7033		return NULL;
 7034
 7035	*iter = lower->list.next;
 7036
 7037	return lower->private;
 7038}
 7039EXPORT_SYMBOL(netdev_lower_get_next_private);
 7040
 7041/**
 7042 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 7043 *				       lower neighbour list, RCU
 7044 *				       variant
 7045 * @dev: device
 7046 * @iter: list_head ** of the current position
 7047 *
 7048 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7049 * list, starting from iter position. The caller must hold RCU read lock.
 7050 */
 7051void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 7052					struct list_head **iter)
 7053{
 7054	struct netdev_adjacent *lower;
 
 7055
 7056	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
 
 
 
 7057
 7058	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 
 
 7059
 7060	if (&lower->list == &dev->adj_list.lower)
 7061		return NULL;
 7062
 7063	*iter = &lower->list;
 7064
 7065	return lower->private;
 7066}
 7067EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 7068
 7069/**
 7070 * netdev_lower_get_next - Get the next device from the lower neighbour
 7071 *                         list
 7072 * @dev: device
 7073 * @iter: list_head ** of the current position
 7074 *
 7075 * Gets the next netdev_adjacent from the dev's lower neighbour
 7076 * list, starting from iter position. The caller must hold RTNL lock or
 7077 * its own locking that guarantees that the neighbour lower
 7078 * list will remain unchanged.
 7079 */
 7080void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 
 7081{
 7082	struct netdev_adjacent *lower;
 
 
 7083
 7084	lower = list_entry(*iter, struct netdev_adjacent, list);
 7085
 7086	if (&lower->list == &dev->adj_list.lower)
 7087		return NULL;
 7088
 7089	*iter = lower->list.next;
 7090
 7091	return lower->dev;
 7092}
 7093EXPORT_SYMBOL(netdev_lower_get_next);
 7094
 7095static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 7096						struct list_head **iter)
 7097{
 7098	struct netdev_adjacent *lower;
 7099
 7100	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7101
 7102	if (&lower->list == &dev->adj_list.lower)
 7103		return NULL;
 7104
 7105	*iter = &lower->list;
 7106
 7107	return lower->dev;
 7108}
 7109
 7110static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 7111						  struct list_head **iter,
 7112						  bool *ignore)
 7113{
 7114	struct netdev_adjacent *lower;
 7115
 7116	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7117
 7118	if (&lower->list == &dev->adj_list.lower)
 7119		return NULL;
 7120
 7121	*iter = &lower->list;
 7122	*ignore = lower->ignore;
 7123
 7124	return lower->dev;
 7125}
 7126
 7127int netdev_walk_all_lower_dev(struct net_device *dev,
 7128			      int (*fn)(struct net_device *dev,
 7129					struct netdev_nested_priv *priv),
 7130			      struct netdev_nested_priv *priv)
 7131{
 7132	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7133	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7134	int ret, cur = 0;
 7135
 7136	now = dev;
 7137	iter = &dev->adj_list.lower;
 7138
 7139	while (1) {
 7140		if (now != dev) {
 7141			ret = fn(now, priv);
 7142			if (ret)
 7143				return ret;
 7144		}
 7145
 7146		next = NULL;
 7147		while (1) {
 7148			ldev = netdev_next_lower_dev(now, &iter);
 7149			if (!ldev)
 7150				break;
 7151
 7152			next = ldev;
 7153			niter = &ldev->adj_list.lower;
 7154			dev_stack[cur] = now;
 7155			iter_stack[cur++] = iter;
 7156			break;
 7157		}
 7158
 7159		if (!next) {
 7160			if (!cur)
 7161				return 0;
 7162			next = dev_stack[--cur];
 7163			niter = iter_stack[cur];
 7164		}
 7165
 7166		now = next;
 7167		iter = niter;
 7168	}
 7169
 7170	return 0;
 
 
 
 
 
 7171}
 7172EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 7173
 7174static int __netdev_walk_all_lower_dev(struct net_device *dev,
 7175				       int (*fn)(struct net_device *dev,
 7176					 struct netdev_nested_priv *priv),
 7177				       struct netdev_nested_priv *priv)
 7178{
 7179	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7180	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7181	int ret, cur = 0;
 7182	bool ignore;
 7183
 7184	now = dev;
 7185	iter = &dev->adj_list.lower;
 7186
 7187	while (1) {
 7188		if (now != dev) {
 7189			ret = fn(now, priv);
 7190			if (ret)
 7191				return ret;
 7192		}
 7193
 7194		next = NULL;
 7195		while (1) {
 7196			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 7197			if (!ldev)
 7198				break;
 7199			if (ignore)
 7200				continue;
 7201
 7202			next = ldev;
 7203			niter = &ldev->adj_list.lower;
 7204			dev_stack[cur] = now;
 7205			iter_stack[cur++] = iter;
 7206			break;
 7207		}
 7208
 7209		if (!next) {
 7210			if (!cur)
 7211				return 0;
 7212			next = dev_stack[--cur];
 7213			niter = iter_stack[cur];
 7214		}
 7215
 7216		now = next;
 7217		iter = niter;
 7218	}
 7219
 7220	return 0;
 7221}
 7222
 7223struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 7224					     struct list_head **iter)
 7225{
 7226	struct netdev_adjacent *lower;
 7227
 7228	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7229	if (&lower->list == &dev->adj_list.lower)
 7230		return NULL;
 7231
 7232	*iter = &lower->list;
 7233
 7234	return lower->dev;
 7235}
 7236EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
 7237
 7238static u8 __netdev_upper_depth(struct net_device *dev)
 7239{
 7240	struct net_device *udev;
 7241	struct list_head *iter;
 7242	u8 max_depth = 0;
 7243	bool ignore;
 7244
 7245	for (iter = &dev->adj_list.upper,
 7246	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 7247	     udev;
 7248	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 7249		if (ignore)
 7250			continue;
 7251		if (max_depth < udev->upper_level)
 7252			max_depth = udev->upper_level;
 7253	}
 7254
 7255	return max_depth;
 7256}
 7257
 7258static u8 __netdev_lower_depth(struct net_device *dev)
 7259{
 7260	struct net_device *ldev;
 7261	struct list_head *iter;
 7262	u8 max_depth = 0;
 7263	bool ignore;
 7264
 7265	for (iter = &dev->adj_list.lower,
 7266	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 7267	     ldev;
 7268	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 7269		if (ignore)
 7270			continue;
 7271		if (max_depth < ldev->lower_level)
 7272			max_depth = ldev->lower_level;
 7273	}
 7274
 7275	return max_depth;
 7276}
 7277
 7278static int __netdev_update_upper_level(struct net_device *dev,
 7279				       struct netdev_nested_priv *__unused)
 7280{
 7281	dev->upper_level = __netdev_upper_depth(dev) + 1;
 7282	return 0;
 7283}
 7284
 7285#ifdef CONFIG_LOCKDEP
 7286static LIST_HEAD(net_unlink_list);
 7287
 7288static void net_unlink_todo(struct net_device *dev)
 7289{
 7290	if (list_empty(&dev->unlink_list))
 7291		list_add_tail(&dev->unlink_list, &net_unlink_list);
 7292}
 7293#endif
 7294
 7295static int __netdev_update_lower_level(struct net_device *dev,
 7296				       struct netdev_nested_priv *priv)
 7297{
 7298	dev->lower_level = __netdev_lower_depth(dev) + 1;
 7299
 7300#ifdef CONFIG_LOCKDEP
 7301	if (!priv)
 7302		return 0;
 7303
 7304	if (priv->flags & NESTED_SYNC_IMM)
 7305		dev->nested_level = dev->lower_level - 1;
 7306	if (priv->flags & NESTED_SYNC_TODO)
 7307		net_unlink_todo(dev);
 7308#endif
 7309	return 0;
 7310}
 7311
 7312int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7313				  int (*fn)(struct net_device *dev,
 7314					    struct netdev_nested_priv *priv),
 7315				  struct netdev_nested_priv *priv)
 7316{
 7317	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7318	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7319	int ret, cur = 0;
 7320
 7321	now = dev;
 7322	iter = &dev->adj_list.lower;
 7323
 7324	while (1) {
 7325		if (now != dev) {
 7326			ret = fn(now, priv);
 7327			if (ret)
 7328				return ret;
 7329		}
 7330
 7331		next = NULL;
 7332		while (1) {
 7333			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7334			if (!ldev)
 7335				break;
 7336
 7337			next = ldev;
 7338			niter = &ldev->adj_list.lower;
 7339			dev_stack[cur] = now;
 7340			iter_stack[cur++] = iter;
 7341			break;
 7342		}
 7343
 7344		if (!next) {
 7345			if (!cur)
 7346				return 0;
 7347			next = dev_stack[--cur];
 7348			niter = iter_stack[cur];
 7349		}
 7350
 7351		now = next;
 7352		iter = niter;
 7353	}
 7354
 7355	return 0;
 7356}
 7357EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7358
 7359/**
 7360 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7361 *				       lower neighbour list, RCU
 7362 *				       variant
 7363 * @dev: device
 7364 *
 7365 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7366 * list. The caller must hold RCU read lock.
 7367 */
 7368void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7369{
 7370	struct netdev_adjacent *lower;
 7371
 7372	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7373			struct netdev_adjacent, list);
 7374	if (lower)
 7375		return lower->private;
 7376	return NULL;
 7377}
 7378EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7379
 7380/**
 7381 * netdev_master_upper_dev_get_rcu - Get master upper device
 7382 * @dev: device
 7383 *
 7384 * Find a master upper device and return pointer to it or NULL in case
 7385 * it's not there. The caller must hold the RCU read lock.
 7386 */
 7387struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7388{
 7389	struct netdev_adjacent *upper;
 7390
 7391	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7392				       struct netdev_adjacent, list);
 7393	if (upper && likely(upper->master))
 7394		return upper->dev;
 7395	return NULL;
 7396}
 7397EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7398
 7399static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7400			      struct net_device *adj_dev,
 7401			      struct list_head *dev_list)
 7402{
 7403	char linkname[IFNAMSIZ+7];
 7404
 7405	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7406		"upper_%s" : "lower_%s", adj_dev->name);
 7407	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7408				 linkname);
 7409}
 7410static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7411			       char *name,
 7412			       struct list_head *dev_list)
 7413{
 7414	char linkname[IFNAMSIZ+7];
 7415
 7416	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7417		"upper_%s" : "lower_%s", name);
 7418	sysfs_remove_link(&(dev->dev.kobj), linkname);
 7419}
 7420
 7421static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7422						 struct net_device *adj_dev,
 7423						 struct list_head *dev_list)
 7424{
 7425	return (dev_list == &dev->adj_list.upper ||
 7426		dev_list == &dev->adj_list.lower) &&
 7427		net_eq(dev_net(dev), dev_net(adj_dev));
 7428}
 7429
 7430static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7431					struct net_device *adj_dev,
 7432					struct list_head *dev_list,
 7433					void *private, bool master)
 7434{
 7435	struct netdev_adjacent *adj;
 7436	int ret;
 7437
 7438	adj = __netdev_find_adj(adj_dev, dev_list);
 7439
 7440	if (adj) {
 7441		adj->ref_nr += 1;
 7442		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7443			 dev->name, adj_dev->name, adj->ref_nr);
 7444
 7445		return 0;
 7446	}
 7447
 7448	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7449	if (!adj)
 7450		return -ENOMEM;
 7451
 7452	adj->dev = adj_dev;
 7453	adj->master = master;
 7454	adj->ref_nr = 1;
 7455	adj->private = private;
 7456	adj->ignore = false;
 7457	netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
 7458
 7459	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7460		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7461
 7462	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7463		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7464		if (ret)
 7465			goto free_adj;
 7466	}
 7467
 7468	/* Ensure that master link is always the first item in list. */
 7469	if (master) {
 7470		ret = sysfs_create_link(&(dev->dev.kobj),
 7471					&(adj_dev->dev.kobj), "master");
 7472		if (ret)
 7473			goto remove_symlinks;
 7474
 7475		list_add_rcu(&adj->list, dev_list);
 7476	} else {
 7477		list_add_tail_rcu(&adj->list, dev_list);
 7478	}
 7479
 
 
 
 
 7480	return 0;
 7481
 7482remove_symlinks:
 7483	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7484		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7485free_adj:
 7486	netdev_put(adj_dev, &adj->dev_tracker);
 7487	kfree(adj);
 7488
 7489	return ret;
 7490}
 7491
 7492static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7493					 struct net_device *adj_dev,
 7494					 u16 ref_nr,
 7495					 struct list_head *dev_list)
 7496{
 7497	struct netdev_adjacent *adj;
 7498
 7499	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7500		 dev->name, adj_dev->name, ref_nr);
 7501
 7502	adj = __netdev_find_adj(adj_dev, dev_list);
 7503
 7504	if (!adj) {
 7505		pr_err("Adjacency does not exist for device %s from %s\n",
 7506		       dev->name, adj_dev->name);
 7507		WARN_ON(1);
 7508		return;
 7509	}
 7510
 7511	if (adj->ref_nr > ref_nr) {
 7512		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7513			 dev->name, adj_dev->name, ref_nr,
 7514			 adj->ref_nr - ref_nr);
 7515		adj->ref_nr -= ref_nr;
 7516		return;
 7517	}
 7518
 7519	if (adj->master)
 7520		sysfs_remove_link(&(dev->dev.kobj), "master");
 7521
 7522	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7523		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7524
 7525	list_del_rcu(&adj->list);
 7526	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 7527		 adj_dev->name, dev->name, adj_dev->name);
 7528	netdev_put(adj_dev, &adj->dev_tracker);
 7529	kfree_rcu(adj, rcu);
 7530}
 7531
 7532static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 7533					    struct net_device *upper_dev,
 7534					    struct list_head *up_list,
 7535					    struct list_head *down_list,
 7536					    void *private, bool master)
 7537{
 7538	int ret;
 7539
 7540	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 7541					   private, master);
 7542	if (ret)
 7543		return ret;
 7544
 7545	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 7546					   private, false);
 7547	if (ret) {
 7548		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 7549		return ret;
 7550	}
 7551
 7552	return 0;
 7553}
 7554
 7555static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 7556					       struct net_device *upper_dev,
 7557					       u16 ref_nr,
 7558					       struct list_head *up_list,
 7559					       struct list_head *down_list)
 7560{
 7561	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
 7562	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 7563}
 7564
 7565static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 7566						struct net_device *upper_dev,
 7567						void *private, bool master)
 7568{
 7569	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 7570						&dev->adj_list.upper,
 7571						&upper_dev->adj_list.lower,
 7572						private, master);
 7573}
 7574
 7575static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 7576						   struct net_device *upper_dev)
 7577{
 7578	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 7579					   &dev->adj_list.upper,
 7580					   &upper_dev->adj_list.lower);
 7581}
 7582
 7583static int __netdev_upper_dev_link(struct net_device *dev,
 7584				   struct net_device *upper_dev, bool master,
 7585				   void *upper_priv, void *upper_info,
 7586				   struct netdev_nested_priv *priv,
 7587				   struct netlink_ext_ack *extack)
 7588{
 7589	struct netdev_notifier_changeupper_info changeupper_info = {
 7590		.info = {
 7591			.dev = dev,
 7592			.extack = extack,
 7593		},
 7594		.upper_dev = upper_dev,
 7595		.master = master,
 7596		.linking = true,
 7597		.upper_info = upper_info,
 7598	};
 7599	struct net_device *master_dev;
 7600	int ret = 0;
 7601
 7602	ASSERT_RTNL();
 7603
 7604	if (dev == upper_dev)
 7605		return -EBUSY;
 7606
 7607	/* To prevent loops, check if dev is not upper device to upper_dev. */
 7608	if (__netdev_has_upper_dev(upper_dev, dev))
 7609		return -EBUSY;
 7610
 7611	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 7612		return -EMLINK;
 7613
 7614	if (!master) {
 7615		if (__netdev_has_upper_dev(dev, upper_dev))
 7616			return -EEXIST;
 7617	} else {
 7618		master_dev = __netdev_master_upper_dev_get(dev);
 7619		if (master_dev)
 7620			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 7621	}
 7622
 7623	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7624					    &changeupper_info.info);
 7625	ret = notifier_to_errno(ret);
 7626	if (ret)
 7627		return ret;
 7628
 7629	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 7630						   master);
 7631	if (ret)
 7632		return ret;
 7633
 7634	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7635					    &changeupper_info.info);
 7636	ret = notifier_to_errno(ret);
 7637	if (ret)
 7638		goto rollback;
 7639
 7640	__netdev_update_upper_level(dev, NULL);
 7641	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7642
 7643	__netdev_update_lower_level(upper_dev, priv);
 7644	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7645				    priv);
 7646
 7647	return 0;
 7648
 7649rollback:
 7650	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7651
 7652	return ret;
 7653}
 7654
 7655/**
 7656 * netdev_upper_dev_link - Add a link to the upper device
 7657 * @dev: device
 7658 * @upper_dev: new upper device
 7659 * @extack: netlink extended ack
 7660 *
 7661 * Adds a link to device which is upper to this one. The caller must hold
 7662 * the RTNL lock. On a failure a negative errno code is returned.
 7663 * On success the reference counts are adjusted and the function
 7664 * returns zero.
 7665 */
 7666int netdev_upper_dev_link(struct net_device *dev,
 7667			  struct net_device *upper_dev,
 7668			  struct netlink_ext_ack *extack)
 7669{
 7670	struct netdev_nested_priv priv = {
 7671		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7672		.data = NULL,
 7673	};
 7674
 7675	return __netdev_upper_dev_link(dev, upper_dev, false,
 7676				       NULL, NULL, &priv, extack);
 7677}
 7678EXPORT_SYMBOL(netdev_upper_dev_link);
 7679
 7680/**
 7681 * netdev_master_upper_dev_link - Add a master link to the upper device
 7682 * @dev: device
 7683 * @upper_dev: new upper device
 7684 * @upper_priv: upper device private
 7685 * @upper_info: upper info to be passed down via notifier
 7686 * @extack: netlink extended ack
 7687 *
 7688 * Adds a link to device which is upper to this one. In this case, only
 7689 * one master upper device can be linked, although other non-master devices
 7690 * might be linked as well. The caller must hold the RTNL lock.
 7691 * On a failure a negative errno code is returned. On success the reference
 7692 * counts are adjusted and the function returns zero.
 7693 */
 7694int netdev_master_upper_dev_link(struct net_device *dev,
 7695				 struct net_device *upper_dev,
 7696				 void *upper_priv, void *upper_info,
 7697				 struct netlink_ext_ack *extack)
 7698{
 7699	struct netdev_nested_priv priv = {
 7700		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7701		.data = NULL,
 7702	};
 7703
 7704	return __netdev_upper_dev_link(dev, upper_dev, true,
 7705				       upper_priv, upper_info, &priv, extack);
 7706}
 7707EXPORT_SYMBOL(netdev_master_upper_dev_link);
 7708
 7709static void __netdev_upper_dev_unlink(struct net_device *dev,
 7710				      struct net_device *upper_dev,
 7711				      struct netdev_nested_priv *priv)
 7712{
 7713	struct netdev_notifier_changeupper_info changeupper_info = {
 7714		.info = {
 7715			.dev = dev,
 7716		},
 7717		.upper_dev = upper_dev,
 7718		.linking = false,
 7719	};
 7720
 7721	ASSERT_RTNL();
 7722
 7723	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 7724
 7725	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7726				      &changeupper_info.info);
 7727
 7728	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7729
 7730	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7731				      &changeupper_info.info);
 7732
 7733	__netdev_update_upper_level(dev, NULL);
 7734	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7735
 7736	__netdev_update_lower_level(upper_dev, priv);
 7737	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7738				    priv);
 7739}
 7740
 7741/**
 7742 * netdev_upper_dev_unlink - Removes a link to upper device
 7743 * @dev: device
 7744 * @upper_dev: new upper device
 7745 *
 7746 * Removes a link to device which is upper to this one. The caller must hold
 7747 * the RTNL lock.
 7748 */
 7749void netdev_upper_dev_unlink(struct net_device *dev,
 7750			     struct net_device *upper_dev)
 7751{
 7752	struct netdev_nested_priv priv = {
 7753		.flags = NESTED_SYNC_TODO,
 7754		.data = NULL,
 7755	};
 7756
 7757	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
 7758}
 7759EXPORT_SYMBOL(netdev_upper_dev_unlink);
 7760
 7761static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 7762				      struct net_device *lower_dev,
 7763				      bool val)
 7764{
 7765	struct netdev_adjacent *adj;
 
 
 7766
 7767	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 7768	if (adj)
 7769		adj->ignore = val;
 
 
 
 7770
 7771	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 7772	if (adj)
 7773		adj->ignore = val;
 7774}
 7775
 7776static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
 7777					struct net_device *lower_dev)
 7778{
 7779	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
 7780}
 7781
 7782static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
 7783				       struct net_device *lower_dev)
 7784{
 7785	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
 7786}
 
 
 7787
 7788int netdev_adjacent_change_prepare(struct net_device *old_dev,
 7789				   struct net_device *new_dev,
 7790				   struct net_device *dev,
 7791				   struct netlink_ext_ack *extack)
 7792{
 7793	struct netdev_nested_priv priv = {
 7794		.flags = 0,
 7795		.data = NULL,
 7796	};
 7797	int err;
 7798
 7799	if (!new_dev)
 7800		return 0;
 7801
 7802	if (old_dev && new_dev != old_dev)
 7803		netdev_adjacent_dev_disable(dev, old_dev);
 7804	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
 7805				      extack);
 7806	if (err) {
 7807		if (old_dev && new_dev != old_dev)
 7808			netdev_adjacent_dev_enable(dev, old_dev);
 7809		return err;
 7810	}
 7811
 7812	return 0;
 7813}
 7814EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 7815
 7816void netdev_adjacent_change_commit(struct net_device *old_dev,
 7817				   struct net_device *new_dev,
 7818				   struct net_device *dev)
 7819{
 7820	struct netdev_nested_priv priv = {
 7821		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7822		.data = NULL,
 7823	};
 7824
 7825	if (!new_dev || !old_dev)
 7826		return;
 7827
 7828	if (new_dev == old_dev)
 7829		return;
 7830
 7831	netdev_adjacent_dev_enable(dev, old_dev);
 7832	__netdev_upper_dev_unlink(old_dev, dev, &priv);
 7833}
 7834EXPORT_SYMBOL(netdev_adjacent_change_commit);
 7835
 7836void netdev_adjacent_change_abort(struct net_device *old_dev,
 7837				  struct net_device *new_dev,
 7838				  struct net_device *dev)
 7839{
 7840	struct netdev_nested_priv priv = {
 7841		.flags = 0,
 7842		.data = NULL,
 7843	};
 7844
 7845	if (!new_dev)
 7846		return;
 7847
 7848	if (old_dev && new_dev != old_dev)
 7849		netdev_adjacent_dev_enable(dev, old_dev);
 7850
 7851	__netdev_upper_dev_unlink(new_dev, dev, &priv);
 7852}
 7853EXPORT_SYMBOL(netdev_adjacent_change_abort);
 7854
 7855/**
 7856 * netdev_bonding_info_change - Dispatch event about slave change
 7857 * @dev: device
 7858 * @bonding_info: info to dispatch
 7859 *
 7860 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 7861 * The caller must hold the RTNL lock.
 7862 */
 7863void netdev_bonding_info_change(struct net_device *dev,
 7864				struct netdev_bonding_info *bonding_info)
 7865{
 7866	struct netdev_notifier_bonding_info info = {
 7867		.info.dev = dev,
 7868	};
 7869
 7870	memcpy(&info.bonding_info, bonding_info,
 7871	       sizeof(struct netdev_bonding_info));
 7872	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 7873				      &info.info);
 7874}
 7875EXPORT_SYMBOL(netdev_bonding_info_change);
 7876
 7877static int netdev_offload_xstats_enable_l3(struct net_device *dev,
 7878					   struct netlink_ext_ack *extack)
 7879{
 7880	struct netdev_notifier_offload_xstats_info info = {
 7881		.info.dev = dev,
 7882		.info.extack = extack,
 7883		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 7884	};
 7885	int err;
 7886	int rc;
 7887
 7888	dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
 7889					 GFP_KERNEL);
 7890	if (!dev->offload_xstats_l3)
 7891		return -ENOMEM;
 7892
 7893	rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
 7894						  NETDEV_OFFLOAD_XSTATS_DISABLE,
 7895						  &info.info);
 7896	err = notifier_to_errno(rc);
 7897	if (err)
 7898		goto free_stats;
 7899
 7900	return 0;
 7901
 7902free_stats:
 7903	kfree(dev->offload_xstats_l3);
 7904	dev->offload_xstats_l3 = NULL;
 7905	return err;
 7906}
 7907
 7908int netdev_offload_xstats_enable(struct net_device *dev,
 7909				 enum netdev_offload_xstats_type type,
 7910				 struct netlink_ext_ack *extack)
 7911{
 7912	ASSERT_RTNL();
 7913
 7914	if (netdev_offload_xstats_enabled(dev, type))
 7915		return -EALREADY;
 7916
 7917	switch (type) {
 7918	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 7919		return netdev_offload_xstats_enable_l3(dev, extack);
 7920	}
 7921
 7922	WARN_ON(1);
 7923	return -EINVAL;
 7924}
 7925EXPORT_SYMBOL(netdev_offload_xstats_enable);
 7926
 7927static void netdev_offload_xstats_disable_l3(struct net_device *dev)
 
 7928{
 7929	struct netdev_notifier_offload_xstats_info info = {
 7930		.info.dev = dev,
 7931		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 7932	};
 7933
 7934	call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
 7935				      &info.info);
 7936	kfree(dev->offload_xstats_l3);
 7937	dev->offload_xstats_l3 = NULL;
 7938}
 7939
 7940int netdev_offload_xstats_disable(struct net_device *dev,
 7941				  enum netdev_offload_xstats_type type)
 7942{
 7943	ASSERT_RTNL();
 7944
 7945	if (!netdev_offload_xstats_enabled(dev, type))
 7946		return -EALREADY;
 
 
 
 
 
 7947
 7948	switch (type) {
 7949	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 7950		netdev_offload_xstats_disable_l3(dev);
 7951		return 0;
 7952	}
 7953
 7954	WARN_ON(1);
 7955	return -EINVAL;
 7956}
 7957EXPORT_SYMBOL(netdev_offload_xstats_disable);
 7958
 7959static void netdev_offload_xstats_disable_all(struct net_device *dev)
 7960{
 7961	netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
 7962}
 7963
 7964static struct rtnl_hw_stats64 *
 7965netdev_offload_xstats_get_ptr(const struct net_device *dev,
 7966			      enum netdev_offload_xstats_type type)
 7967{
 7968	switch (type) {
 7969	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 7970		return dev->offload_xstats_l3;
 7971	}
 7972
 7973	WARN_ON(1);
 7974	return NULL;
 7975}
 7976
 7977bool netdev_offload_xstats_enabled(const struct net_device *dev,
 7978				   enum netdev_offload_xstats_type type)
 7979{
 7980	ASSERT_RTNL();
 7981
 7982	return netdev_offload_xstats_get_ptr(dev, type);
 7983}
 7984EXPORT_SYMBOL(netdev_offload_xstats_enabled);
 7985
/* "Report used" result block. netdev_offload_xstats_get_used() hands a
 * zero-initialised instance to the NETDEV_OFFLOAD_XSTATS_REPORT_USED
 * notifier chain and reads @used back afterwards.
 */
struct netdev_notifier_offload_xstats_ru {
	/* presumably set by a notifier handler when the stats are in use;
	 * the writer is outside this part of the file
	 */
	bool used;
};
 7989
/* "Report delta" result block. netdev_offload_xstats_get_stats() hands a
 * zero-initialised instance to the NETDEV_OFFLOAD_XSTATS_REPORT_DELTA
 * notifier chain, then adds @stats to the device's cached counters and
 * reads @used back.
 */
struct netdev_notifier_offload_xstats_rd {
	/* counter increments to be added to the cached stats */
	struct rtnl_hw_stats64 stats;
	/* presumably set by a notifier handler when the stats are in use;
	 * the writer is outside this part of the file
	 */
	bool used;
};
 7994
 7995static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
 7996				  const struct rtnl_hw_stats64 *src)
 7997{
 7998	dest->rx_packets	  += src->rx_packets;
 7999	dest->tx_packets	  += src->tx_packets;
 8000	dest->rx_bytes		  += src->rx_bytes;
 8001	dest->tx_bytes		  += src->tx_bytes;
 8002	dest->rx_errors		  += src->rx_errors;
 8003	dest->tx_errors		  += src->tx_errors;
 8004	dest->rx_dropped	  += src->rx_dropped;
 8005	dest->tx_dropped	  += src->tx_dropped;
 8006	dest->multicast		  += src->multicast;
 8007}
 8008
 8009static int netdev_offload_xstats_get_used(struct net_device *dev,
 8010					  enum netdev_offload_xstats_type type,
 8011					  bool *p_used,
 8012					  struct netlink_ext_ack *extack)
 8013{
 8014	struct netdev_notifier_offload_xstats_ru report_used = {};
 8015	struct netdev_notifier_offload_xstats_info info = {
 8016		.info.dev = dev,
 8017		.info.extack = extack,
 8018		.type = type,
 8019		.report_used = &report_used,
 8020	};
 8021	int rc;
 8022
 8023	WARN_ON(!netdev_offload_xstats_enabled(dev, type));
 8024	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
 8025					   &info.info);
 8026	*p_used = report_used.used;
 8027	return notifier_to_errno(rc);
 8028}
 8029
 8030static int netdev_offload_xstats_get_stats(struct net_device *dev,
 8031					   enum netdev_offload_xstats_type type,
 8032					   struct rtnl_hw_stats64 *p_stats,
 8033					   bool *p_used,
 8034					   struct netlink_ext_ack *extack)
 8035{
 8036	struct netdev_notifier_offload_xstats_rd report_delta = {};
 8037	struct netdev_notifier_offload_xstats_info info = {
 8038		.info.dev = dev,
 8039		.info.extack = extack,
 8040		.type = type,
 8041		.report_delta = &report_delta,
 8042	};
 8043	struct rtnl_hw_stats64 *stats;
 8044	int rc;
 8045
 8046	stats = netdev_offload_xstats_get_ptr(dev, type);
 8047	if (WARN_ON(!stats))
 8048		return -EINVAL;
 8049
 8050	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
 8051					   &info.info);
 8052
 8053	/* Cache whatever we got, even if there was an error, otherwise the
 8054	 * successful stats retrievals would get lost.
 8055	 */
 8056	netdev_hw_stats64_add(stats, &report_delta.stats);
 8057
 8058	if (p_stats)
 8059		*p_stats = *stats;
 8060	*p_used = report_delta.used;
 8061
 8062	return notifier_to_errno(rc);
 8063}
 8064
 8065int netdev_offload_xstats_get(struct net_device *dev,
 8066			      enum netdev_offload_xstats_type type,
 8067			      struct rtnl_hw_stats64 *p_stats, bool *p_used,
 8068			      struct netlink_ext_ack *extack)
 8069{
 8070	ASSERT_RTNL();
 8071
 8072	if (p_stats)
 8073		return netdev_offload_xstats_get_stats(dev, type, p_stats,
 8074						       p_used, extack);
 8075	else
 8076		return netdev_offload_xstats_get_used(dev, type, p_used,
 8077						      extack);
 8078}
 8079EXPORT_SYMBOL(netdev_offload_xstats_get);
 8080
 8081void
 8082netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
 8083				   const struct rtnl_hw_stats64 *stats)
 8084{
 8085	report_delta->used = true;
 8086	netdev_hw_stats64_add(&report_delta->stats, stats);
 8087}
 8088EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
 8089
 8090void
 8091netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
 8092{
 8093	report_used->used = true;
 8094}
 8095EXPORT_SYMBOL(netdev_offload_xstats_report_used);
 8096
 8097void netdev_offload_xstats_push_delta(struct net_device *dev,
 8098				      enum netdev_offload_xstats_type type,
 8099				      const struct rtnl_hw_stats64 *p_stats)
 8100{
 8101	struct rtnl_hw_stats64 *stats;
 8102
 8103	ASSERT_RTNL();
 8104
 8105	stats = netdev_offload_xstats_get_ptr(dev, type);
 8106	if (WARN_ON(!stats))
 8107		return;
 8108
 8109	netdev_hw_stats64_add(stats, p_stats);
 8110}
 8111EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
 8112
 8113/**
 8114 * netdev_get_xmit_slave - Get the xmit slave of master device
 8115 * @dev: device
 8116 * @skb: The packet
 8117 * @all_slaves: assume all the slaves are active
 8118 *
 8119 * The reference counters are not incremented so the caller must be
 8120 * careful with locks. The caller must hold RCU lock.
 8121 * %NULL is returned if no slave is found.
 
 8122 */
 8123
 8124struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 8125					 struct sk_buff *skb,
 8126					 bool all_slaves)
 8127{
 8128	const struct net_device_ops *ops = dev->netdev_ops;
 8129
 8130	if (!ops->ndo_get_xmit_slave)
 8131		return NULL;
 8132	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
 8133}
 8134EXPORT_SYMBOL(netdev_get_xmit_slave);
 8135
 8136static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
 8137						  struct sock *sk)
 8138{
 8139	const struct net_device_ops *ops = dev->netdev_ops;
 8140
 8141	if (!ops->ndo_sk_get_lower_dev)
 8142		return NULL;
 8143	return ops->ndo_sk_get_lower_dev(dev, sk);
 8144}
 8145
 8146/**
 8147 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
 8148 * @dev: device
 8149 * @sk: the socket
 8150 *
 8151 * %NULL is returned if no lower device is found.
 8152 */
 8153
 8154struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
 8155					    struct sock *sk)
 8156{
 8157	struct net_device *lower;
 8158
 8159	lower = netdev_sk_get_lower_dev(dev, sk);
 8160	while (lower) {
 8161		dev = lower;
 8162		lower = netdev_sk_get_lower_dev(dev, sk);
 8163	}
 8164
 8165	return dev;
 8166}
 8167EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
 8168
 8169static void netdev_adjacent_add_links(struct net_device *dev)
 8170{
 8171	struct netdev_adjacent *iter;
 8172
 8173	struct net *net = dev_net(dev);
 8174
 8175	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8176		if (!net_eq(net, dev_net(iter->dev)))
 8177			continue;
 8178		netdev_adjacent_sysfs_add(iter->dev, dev,
 8179					  &iter->dev->adj_list.lower);
 8180		netdev_adjacent_sysfs_add(dev, iter->dev,
 8181					  &dev->adj_list.upper);
 8182	}
 8183
 8184	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8185		if (!net_eq(net, dev_net(iter->dev)))
 8186			continue;
 8187		netdev_adjacent_sysfs_add(iter->dev, dev,
 8188					  &iter->dev->adj_list.upper);
 8189		netdev_adjacent_sysfs_add(dev, iter->dev,
 8190					  &dev->adj_list.lower);
 8191	}
 8192}
 8193
 8194static void netdev_adjacent_del_links(struct net_device *dev)
 8195{
 8196	struct netdev_adjacent *iter;
 8197
 8198	struct net *net = dev_net(dev);
 8199
 8200	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8201		if (!net_eq(net, dev_net(iter->dev)))
 8202			continue;
 8203		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8204					  &iter->dev->adj_list.lower);
 8205		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8206					  &dev->adj_list.upper);
 8207	}
 8208
 8209	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8210		if (!net_eq(net, dev_net(iter->dev)))
 8211			continue;
 8212		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8213					  &iter->dev->adj_list.upper);
 8214		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8215					  &dev->adj_list.lower);
 8216	}
 8217}
 8218
 8219void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 8220{
 8221	struct netdev_adjacent *iter;
 8222
 8223	struct net *net = dev_net(dev);
 8224
 8225	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8226		if (!net_eq(net, dev_net(iter->dev)))
 8227			continue;
 8228		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8229					  &iter->dev->adj_list.lower);
 8230		netdev_adjacent_sysfs_add(iter->dev, dev,
 8231					  &iter->dev->adj_list.lower);
 8232	}
 8233
 8234	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8235		if (!net_eq(net, dev_net(iter->dev)))
 8236			continue;
 8237		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8238					  &iter->dev->adj_list.upper);
 8239		netdev_adjacent_sysfs_add(iter->dev, dev,
 8240					  &iter->dev->adj_list.upper);
 8241	}
 8242}
 8243
 8244void *netdev_lower_dev_get_private(struct net_device *dev,
 8245				   struct net_device *lower_dev)
 8246{
 8247	struct netdev_adjacent *lower;
 8248
 8249	if (!lower_dev)
 8250		return NULL;
 8251	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 8252	if (!lower)
 8253		return NULL;
 8254
 8255	return lower->private;
 8256}
 8257EXPORT_SYMBOL(netdev_lower_dev_get_private);
 8258
 8259
 8260/**
 8261 * netdev_lower_state_changed - Dispatch event about lower device state change
 8262 * @lower_dev: device
 8263 * @lower_state_info: state to dispatch
 8264 *
 8265 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 8266 * The caller must hold the RTNL lock.
 
 
 8267 */
 8268void netdev_lower_state_changed(struct net_device *lower_dev,
 8269				void *lower_state_info)
 8270{
 8271	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 8272		.info.dev = lower_dev,
 8273	};
 8274
 8275	ASSERT_RTNL();
 8276	changelowerstate_info.lower_state_info = lower_state_info;
 8277	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 8278				      &changelowerstate_info.info);
 
 
 
 
 
 
 
 
 8279}
 8280EXPORT_SYMBOL(netdev_lower_state_changed);
 8281
 8282static void dev_change_rx_flags(struct net_device *dev, int flags)
 8283{
 8284	const struct net_device_ops *ops = dev->netdev_ops;
 8285
 8286	if (ops->ndo_change_rx_flags)
 8287		ops->ndo_change_rx_flags(dev, flags);
 8288}
 8289
 8290static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 8291{
 8292	unsigned int old_flags = dev->flags;
 8293	kuid_t uid;
 8294	kgid_t gid;
 8295
 8296	ASSERT_RTNL();
 8297
 8298	dev->flags |= IFF_PROMISC;
 8299	dev->promiscuity += inc;
 8300	if (dev->promiscuity == 0) {
 8301		/*
 8302		 * Avoid overflow.
 8303		 * If inc causes overflow, untouch promisc and return error.
 8304		 */
 8305		if (inc < 0)
 8306			dev->flags &= ~IFF_PROMISC;
 8307		else {
 8308			dev->promiscuity -= inc;
 8309			netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
 
 8310			return -EOVERFLOW;
 8311		}
 8312	}
 8313	if (dev->flags != old_flags) {
 8314		pr_info("device %s %s promiscuous mode\n",
 8315			dev->name,
 8316			dev->flags & IFF_PROMISC ? "entered" : "left");
 8317		if (audit_enabled) {
 8318			current_uid_gid(&uid, &gid);
 8319			audit_log(audit_context(), GFP_ATOMIC,
 8320				  AUDIT_ANOM_PROMISCUOUS,
 8321				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
 8322				  dev->name, (dev->flags & IFF_PROMISC),
 8323				  (old_flags & IFF_PROMISC),
 8324				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
 8325				  from_kuid(&init_user_ns, uid),
 8326				  from_kgid(&init_user_ns, gid),
 8327				  audit_get_sessionid(current));
 8328		}
 8329
 8330		dev_change_rx_flags(dev, IFF_PROMISC);
 8331	}
 8332	if (notify)
 8333		__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
 8334	return 0;
 8335}
 8336
 8337/**
 8338 *	dev_set_promiscuity	- update promiscuity count on a device
 8339 *	@dev: device
 8340 *	@inc: modifier
 8341 *
 8342 *	Add or remove promiscuity from a device. While the count in the device
 8343 *	remains above zero the interface remains promiscuous. Once it hits zero
 8344 *	the device reverts back to normal filtering operation. A negative inc
 8345 *	value is used to drop promiscuity on the device.
 8346 *	Return 0 if successful or a negative errno code on error.
 8347 */
 8348int dev_set_promiscuity(struct net_device *dev, int inc)
 8349{
 8350	unsigned int old_flags = dev->flags;
 8351	int err;
 8352
 8353	err = __dev_set_promiscuity(dev, inc, true);
 8354	if (err < 0)
 8355		return err;
 8356	if (dev->flags != old_flags)
 8357		dev_set_rx_mode(dev);
 8358	return err;
 8359}
 8360EXPORT_SYMBOL(dev_set_promiscuity);
 8361
 8362static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 
 
 
 
 
 
 
 
 
 
 
 
 
 8363{
 8364	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 8365
 8366	ASSERT_RTNL();
 8367
 8368	dev->flags |= IFF_ALLMULTI;
 8369	dev->allmulti += inc;
 8370	if (dev->allmulti == 0) {
 8371		/*
 8372		 * Avoid overflow.
 8373		 * If inc causes overflow, untouch allmulti and return error.
 8374		 */
 8375		if (inc < 0)
 8376			dev->flags &= ~IFF_ALLMULTI;
 8377		else {
 8378			dev->allmulti -= inc;
 8379			netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
 
 8380			return -EOVERFLOW;
 8381		}
 8382	}
 8383	if (dev->flags ^ old_flags) {
 8384		dev_change_rx_flags(dev, IFF_ALLMULTI);
 8385		dev_set_rx_mode(dev);
 8386		if (notify)
 8387			__dev_notify_flags(dev, old_flags,
 8388					   dev->gflags ^ old_gflags, 0, NULL);
 8389	}
 8390	return 0;
 8391}
 8392
 8393/**
 8394 *	dev_set_allmulti	- update allmulti count on a device
 8395 *	@dev: device
 8396 *	@inc: modifier
 8397 *
 8398 *	Add or remove reception of all multicast frames to a device. While the
 8399 *	count in the device remains above zero the interface remains listening
 8400 *	to all interfaces. Once it hits zero the device reverts back to normal
 8401 *	filtering operation. A negative @inc value is used to drop the counter
 8402 *	when releasing a resource needing all multicasts.
 8403 *	Return 0 if successful or a negative errno code on error.
 8404 */
 8405
 8406int dev_set_allmulti(struct net_device *dev, int inc)
 8407{
 8408	return __dev_set_allmulti(dev, inc, true);
 8409}
 8410EXPORT_SYMBOL(dev_set_allmulti);
 8411
 8412/*
 8413 *	Upload unicast and multicast address lists to device and
 8414 *	configure RX filtering. When the device doesn't support unicast
 8415 *	filtering it is put in promiscuous mode while unicast addresses
 8416 *	are present.
 8417 */
 8418void __dev_set_rx_mode(struct net_device *dev)
 8419{
 8420	const struct net_device_ops *ops = dev->netdev_ops;
 8421
 8422	/* dev_open will call this function so the list will stay sane. */
 8423	if (!(dev->flags&IFF_UP))
 8424		return;
 8425
 8426	if (!netif_device_present(dev))
 8427		return;
 8428
 8429	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 8430		/* Unicast addresses changes may only happen under the rtnl,
 8431		 * therefore calling __dev_set_promiscuity here is safe.
 8432		 */
 8433		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 8434			__dev_set_promiscuity(dev, 1, false);
 8435			dev->uc_promisc = true;
 8436		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 8437			__dev_set_promiscuity(dev, -1, false);
 8438			dev->uc_promisc = false;
 8439		}
 8440	}
 8441
 8442	if (ops->ndo_set_rx_mode)
 8443		ops->ndo_set_rx_mode(dev);
 8444}
 8445
 8446void dev_set_rx_mode(struct net_device *dev)
 8447{
 8448	netif_addr_lock_bh(dev);
 8449	__dev_set_rx_mode(dev);
 8450	netif_addr_unlock_bh(dev);
 8451}
 8452
 8453/**
 8454 *	dev_get_flags - get flags reported to userspace
 8455 *	@dev: device
 8456 *
 8457 *	Get the combination of flag bits exported through APIs to userspace.
 8458 */
 8459unsigned int dev_get_flags(const struct net_device *dev)
 8460{
 8461	unsigned int flags;
 8462
 8463	flags = (dev->flags & ~(IFF_PROMISC |
 8464				IFF_ALLMULTI |
 8465				IFF_RUNNING |
 8466				IFF_LOWER_UP |
 8467				IFF_DORMANT)) |
 8468		(dev->gflags & (IFF_PROMISC |
 8469				IFF_ALLMULTI));
 8470
 8471	if (netif_running(dev)) {
 8472		if (netif_oper_up(dev))
 8473			flags |= IFF_RUNNING;
 8474		if (netif_carrier_ok(dev))
 8475			flags |= IFF_LOWER_UP;
 8476		if (netif_dormant(dev))
 8477			flags |= IFF_DORMANT;
 8478	}
 8479
 8480	return flags;
 8481}
 8482EXPORT_SYMBOL(dev_get_flags);
 8483
 8484int __dev_change_flags(struct net_device *dev, unsigned int flags,
 8485		       struct netlink_ext_ack *extack)
 8486{
 8487	unsigned int old_flags = dev->flags;
 8488	int ret;
 8489
 8490	ASSERT_RTNL();
 8491
 8492	/*
 8493	 *	Set the flags on our device.
 8494	 */
 8495
 8496	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
 8497			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 8498			       IFF_AUTOMEDIA)) |
 8499		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
 8500				    IFF_ALLMULTI));
 8501
 8502	/*
 8503	 *	Load in the correct multicast list now the flags have changed.
 8504	 */
 8505
 8506	if ((old_flags ^ flags) & IFF_MULTICAST)
 8507		dev_change_rx_flags(dev, IFF_MULTICAST);
 8508
 8509	dev_set_rx_mode(dev);
 8510
 8511	/*
 8512	 *	Have we downed the interface. We handle IFF_UP ourselves
 8513	 *	according to user attempts to set it, rather than blindly
 8514	 *	setting it.
 8515	 */
 8516
 8517	ret = 0;
 8518	if ((old_flags ^ flags) & IFF_UP) {
 8519		if (old_flags & IFF_UP)
 8520			__dev_close(dev);
 8521		else
 8522			ret = __dev_open(dev, extack);
 8523	}
 8524
 8525	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 8526		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 8527		unsigned int old_flags = dev->flags;
 8528
 8529		dev->gflags ^= IFF_PROMISC;
 8530
 8531		if (__dev_set_promiscuity(dev, inc, false) >= 0)
 8532			if (dev->flags != old_flags)
 8533				dev_set_rx_mode(dev);
 8534	}
 8535
 8536	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 8537	 * is important. Some (broken) drivers set IFF_PROMISC, when
 8538	 * IFF_ALLMULTI is requested not asking us and not reporting.
 8539	 */
 8540	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 8541		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
 8542
 8543		dev->gflags ^= IFF_ALLMULTI;
 8544		__dev_set_allmulti(dev, inc, false);
 8545	}
 8546
 8547	return ret;
 8548}
 8549
 8550void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 8551			unsigned int gchanges, u32 portid,
 8552			const struct nlmsghdr *nlh)
 8553{
 8554	unsigned int changes = dev->flags ^ old_flags;
 8555
 8556	if (gchanges)
 8557		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
 8558
 8559	if (changes & IFF_UP) {
 8560		if (dev->flags & IFF_UP)
 8561			call_netdevice_notifiers(NETDEV_UP, dev);
 8562		else
 8563			call_netdevice_notifiers(NETDEV_DOWN, dev);
 8564	}
 8565
 8566	if (dev->flags & IFF_UP &&
 8567	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 8568		struct netdev_notifier_change_info change_info = {
 8569			.info = {
 8570				.dev = dev,
 8571			},
 8572			.flags_changed = changes,
 8573		};
 8574
 8575		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 8576	}
 8577}
 8578
 8579/**
 8580 *	dev_change_flags - change device settings
 8581 *	@dev: device
 8582 *	@flags: device state flags
 8583 *	@extack: netlink extended ack
 8584 *
 8585 *	Change settings on device based state flags. The flags are
 8586 *	in the userspace exported format.
 8587 */
 8588int dev_change_flags(struct net_device *dev, unsigned int flags,
 8589		     struct netlink_ext_ack *extack)
 8590{
 8591	int ret;
 8592	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 8593
 8594	ret = __dev_change_flags(dev, flags, extack);
 8595	if (ret < 0)
 8596		return ret;
 8597
 8598	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 8599	__dev_notify_flags(dev, old_flags, changes, 0, NULL);
 
 
 
 8600	return ret;
 8601}
 8602EXPORT_SYMBOL(dev_change_flags);
 8603
 8604int __dev_set_mtu(struct net_device *dev, int new_mtu)
 8605{
 8606	const struct net_device_ops *ops = dev->netdev_ops;
 8607
 8608	if (ops->ndo_change_mtu)
 8609		return ops->ndo_change_mtu(dev, new_mtu);
 8610
 8611	/* Pairs with all the lockless reads of dev->mtu in the stack */
 8612	WRITE_ONCE(dev->mtu, new_mtu);
 8613	return 0;
 8614}
 8615EXPORT_SYMBOL(__dev_set_mtu);
 8616
 8617int dev_validate_mtu(struct net_device *dev, int new_mtu,
 8618		     struct netlink_ext_ack *extack)
 8619{
 8620	/* MTU must be positive, and in range */
 8621	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 8622		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 8623		return -EINVAL;
 8624	}
 8625
 8626	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 8627		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 8628		return -EINVAL;
 8629	}
 8630	return 0;
 8631}
 8632
 8633/**
 8634 *	dev_set_mtu_ext - Change maximum transfer unit
 8635 *	@dev: device
 8636 *	@new_mtu: new transfer unit
 8637 *	@extack: netlink extended ack
 8638 *
 8639 *	Change the maximum transfer size of the network device.
 8640 */
 8641int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 8642		    struct netlink_ext_ack *extack)
 8643{
 8644	int err, orig_mtu;
 
 8645
 8646	if (new_mtu == dev->mtu)
 8647		return 0;
 8648
 8649	err = dev_validate_mtu(dev, new_mtu, extack);
 8650	if (err)
 8651		return err;
 8652
 8653	if (!netif_device_present(dev))
 8654		return -ENODEV;
 8655
 8656	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 8657	err = notifier_to_errno(err);
 8658	if (err)
 8659		return err;
 8660
 8661	orig_mtu = dev->mtu;
 8662	err = __dev_set_mtu(dev, new_mtu);
 8663
 8664	if (!err) {
 8665		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8666						   orig_mtu);
 8667		err = notifier_to_errno(err);
 8668		if (err) {
 8669			/* setting mtu back and notifying everyone again,
 8670			 * so that they have a chance to revert changes.
 8671			 */
 8672			__dev_set_mtu(dev, orig_mtu);
 8673			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8674						     new_mtu);
 8675		}
 8676	}
 8677	return err;
 8678}
 8679
 8680int dev_set_mtu(struct net_device *dev, int new_mtu)
 8681{
 8682	struct netlink_ext_ack extack;
 8683	int err;
 8684
 8685	memset(&extack, 0, sizeof(extack));
 8686	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 8687	if (err && extack._msg)
 8688		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 8689	return err;
 8690}
 8691EXPORT_SYMBOL(dev_set_mtu);
 8692
 8693/**
 8694 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 8695 *	@dev: device
 8696 *	@new_len: new tx queue length
 8697 */
 8698int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 8699{
 8700	unsigned int orig_len = dev->tx_queue_len;
 8701	int res;
 8702
 8703	if (new_len != (unsigned int)new_len)
 8704		return -ERANGE;
 8705
 8706	if (new_len != orig_len) {
 8707		dev->tx_queue_len = new_len;
 8708		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 8709		res = notifier_to_errno(res);
 8710		if (res)
 8711			goto err_rollback;
 8712		res = dev_qdisc_change_tx_queue_len(dev);
 8713		if (res)
 8714			goto err_rollback;
 8715	}
 8716
 8717	return 0;
 8718
 8719err_rollback:
 8720	netdev_err(dev, "refused to change device tx_queue_len\n");
 8721	dev->tx_queue_len = orig_len;
 8722	return res;
 8723}
 8724
 8725/**
 8726 *	dev_set_group - Change group this device belongs to
 8727 *	@dev: device
 8728 *	@new_group: group this device should belong to
 8729 */
 8730void dev_set_group(struct net_device *dev, int new_group)
 8731{
 8732	dev->group = new_group;
 8733}
 8734
 8735/**
 8736 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 8737 *	@dev: device
 8738 *	@addr: new address
 8739 *	@extack: netlink extended ack
 8740 */
 8741int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 8742			      struct netlink_ext_ack *extack)
 8743{
 8744	struct netdev_notifier_pre_changeaddr_info info = {
 8745		.info.dev = dev,
 8746		.info.extack = extack,
 8747		.dev_addr = addr,
 8748	};
 8749	int rc;
 8750
 8751	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 8752	return notifier_to_errno(rc);
 8753}
 8754EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 8755
 8756/**
 8757 *	dev_set_mac_address - Change Media Access Control Address
 8758 *	@dev: device
 8759 *	@sa: new address
 8760 *	@extack: netlink extended ack
 8761 *
 8762 *	Change the hardware (MAC) address of the device
 8763 */
 8764int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 8765			struct netlink_ext_ack *extack)
 8766{
 8767	const struct net_device_ops *ops = dev->netdev_ops;
 8768	int err;
 8769
 8770	if (!ops->ndo_set_mac_address)
 8771		return -EOPNOTSUPP;
 8772	if (sa->sa_family != dev->type)
 8773		return -EINVAL;
 8774	if (!netif_device_present(dev))
 8775		return -ENODEV;
 8776	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 8777	if (err)
 8778		return err;
 8779	err = ops->ndo_set_mac_address(dev, sa);
 8780	if (err)
 8781		return err;
 8782	dev->addr_assign_type = NET_ADDR_SET;
 8783	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 8784	add_device_randomness(dev->dev_addr, dev->addr_len);
 8785	return 0;
 8786}
 8787EXPORT_SYMBOL(dev_set_mac_address);
 8788
 8789static DECLARE_RWSEM(dev_addr_sem);
 8790
 8791int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
 8792			     struct netlink_ext_ack *extack)
 8793{
 8794	int ret;
 
 8795
 8796	down_write(&dev_addr_sem);
 8797	ret = dev_set_mac_address(dev, sa, extack);
 8798	up_write(&dev_addr_sem);
 8799	return ret;
 8800}
 8801EXPORT_SYMBOL(dev_set_mac_address_user);
 8802
 8803int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
 8804{
 8805	size_t size = sizeof(sa->sa_data_min);
 8806	struct net_device *dev;
 8807	int ret = 0;
 8808
 8809	down_read(&dev_addr_sem);
 8810	rcu_read_lock();
 
 
 8811
 8812	dev = dev_get_by_name_rcu(net, dev_name);
 8813	if (!dev) {
 8814		ret = -ENODEV;
 8815		goto unlock;
 8816	}
 8817	if (!dev->addr_len)
 8818		memset(sa->sa_data, 0, size);
 8819	else
 8820		memcpy(sa->sa_data, dev->dev_addr,
 8821		       min_t(size_t, size, dev->addr_len));
 8822	sa->sa_family = dev->type;
 8823
 8824unlock:
 8825	rcu_read_unlock();
 8826	up_read(&dev_addr_sem);
 8827	return ret;
 8828}
 8829EXPORT_SYMBOL(dev_get_mac_address);
 
 
 8830
 8831/**
 8832 *	dev_change_carrier - Change device carrier
 8833 *	@dev: device
 8834 *	@new_carrier: new value
 8835 *
 8836 *	Change device carrier
 8837 */
 8838int dev_change_carrier(struct net_device *dev, bool new_carrier)
 8839{
 8840	const struct net_device_ops *ops = dev->netdev_ops;
 8841
 8842	if (!ops->ndo_change_carrier)
 8843		return -EOPNOTSUPP;
 8844	if (!netif_device_present(dev))
 8845		return -ENODEV;
 8846	return ops->ndo_change_carrier(dev, new_carrier);
 8847}
 
 
 8848
 8849/**
 8850 *	dev_get_phys_port_id - Get device physical port ID
 8851 *	@dev: device
 8852 *	@ppid: port ID
 8853 *
 8854 *	Get device physical port ID
 8855 */
 8856int dev_get_phys_port_id(struct net_device *dev,
 8857			 struct netdev_phys_item_id *ppid)
 8858{
 8859	const struct net_device_ops *ops = dev->netdev_ops;
 8860
 8861	if (!ops->ndo_get_phys_port_id)
 8862		return -EOPNOTSUPP;
 8863	return ops->ndo_get_phys_port_id(dev, ppid);
 8864}
 8865
 8866/**
 8867 *	dev_get_phys_port_name - Get device physical port name
 8868 *	@dev: device
 8869 *	@name: port name
 8870 *	@len: limit of bytes to copy to name
 8871 *
 8872 *	Get device physical port name
 8873 */
 8874int dev_get_phys_port_name(struct net_device *dev,
 8875			   char *name, size_t len)
 8876{
 8877	const struct net_device_ops *ops = dev->netdev_ops;
 8878	int err;
 8879
 8880	if (ops->ndo_get_phys_port_name) {
 8881		err = ops->ndo_get_phys_port_name(dev, name, len);
 8882		if (err != -EOPNOTSUPP)
 8883			return err;
 8884	}
 8885	return devlink_compat_phys_port_name_get(dev, name, len);
 8886}
 8887
 8888/**
 8889 *	dev_get_port_parent_id - Get the device's port parent identifier
 8890 *	@dev: network device
 8891 *	@ppid: pointer to a storage for the port's parent identifier
 8892 *	@recurse: allow/disallow recursion to lower devices
 8893 *
 8894 *	Get the devices's port parent identifier
 8895 */
 8896int dev_get_port_parent_id(struct net_device *dev,
 8897			   struct netdev_phys_item_id *ppid,
 8898			   bool recurse)
 8899{
 8900	const struct net_device_ops *ops = dev->netdev_ops;
 8901	struct netdev_phys_item_id first = { };
 8902	struct net_device *lower_dev;
 8903	struct list_head *iter;
 8904	int err;
 
 
 8905
 8906	if (ops->ndo_get_port_parent_id) {
 8907		err = ops->ndo_get_port_parent_id(dev, ppid);
 8908		if (err != -EOPNOTSUPP)
 8909			return err;
 8910	}
 8911
 8912	err = devlink_compat_switch_id_get(dev, ppid);
 8913	if (!recurse || err != -EOPNOTSUPP)
 8914		return err;
 8915
 8916	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 8917		err = dev_get_port_parent_id(lower_dev, ppid, true);
 8918		if (err)
 8919			break;
 8920		if (!first.id_len)
 8921			first = *ppid;
 8922		else if (memcmp(&first, ppid, sizeof(*ppid)))
 8923			return -EOPNOTSUPP;
 8924	}
 8925
 8926	return err;
 8927}
 8928EXPORT_SYMBOL(dev_get_port_parent_id);
 8929
 8930/**
 8931 *	netdev_port_same_parent_id - Indicate if two network devices have
 8932 *	the same port parent identifier
 8933 *	@a: first network device
 8934 *	@b: second network device
 8935 */
 8936bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 8937{
 8938	struct netdev_phys_item_id a_id = { };
 8939	struct netdev_phys_item_id b_id = { };
 8940
 8941	if (dev_get_port_parent_id(a, &a_id, true) ||
 8942	    dev_get_port_parent_id(b, &b_id, true))
 8943		return false;
 8944
 8945	return netdev_phys_item_id_same(&a_id, &b_id);
 8946}
 8947EXPORT_SYMBOL(netdev_port_same_parent_id);
 
 
 
 
 8948
 8949/**
 8950 *	dev_change_proto_down - set carrier according to proto_down.
 8951 *
 8952 *	@dev: device
 8953 *	@proto_down: new value
 8954 */
 8955int dev_change_proto_down(struct net_device *dev, bool proto_down)
 8956{
 8957	if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
 8958		return -EOPNOTSUPP;
 8959	if (!netif_device_present(dev))
 8960		return -ENODEV;
 8961	if (proto_down)
 8962		netif_carrier_off(dev);
 8963	else
 8964		netif_carrier_on(dev);
 8965	dev->proto_down = proto_down;
 8966	return 0;
 8967}
 8968
 8969/**
 8970 *	dev_change_proto_down_reason - proto down reason
 8971 *
 8972 *	@dev: device
 8973 *	@mask: proto down mask
 8974 *	@value: proto down value
 8975 */
 8976void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
 8977				  u32 value)
 8978{
 8979	int b;
 8980
 8981	if (!mask) {
 8982		dev->proto_down_reason = value;
 8983	} else {
 8984		for_each_set_bit(b, &mask, 32) {
 8985			if (value & (1 << b))
 8986				dev->proto_down_reason |= BIT(b);
 8987			else
 8988				dev->proto_down_reason &= ~BIT(b);
 8989		}
 8990	}
 8991}
 8992
 8993struct bpf_xdp_link {
 8994	struct bpf_link link;
 8995	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
 8996	int flags;
 8997};
 8998
 8999static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
 9000{
 9001	if (flags & XDP_FLAGS_HW_MODE)
 9002		return XDP_MODE_HW;
 9003	if (flags & XDP_FLAGS_DRV_MODE)
 9004		return XDP_MODE_DRV;
 9005	if (flags & XDP_FLAGS_SKB_MODE)
 9006		return XDP_MODE_SKB;
 9007	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
 9008}
 9009
 9010static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
 9011{
 9012	switch (mode) {
 9013	case XDP_MODE_SKB:
 9014		return generic_xdp_install;
 9015	case XDP_MODE_DRV:
 9016	case XDP_MODE_HW:
 9017		return dev->netdev_ops->ndo_bpf;
 9018	default:
 9019		return NULL;
 9020	}
 9021}
 9022
 9023static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
 9024					 enum bpf_xdp_mode mode)
 9025{
 9026	return dev->xdp_state[mode].link;
 9027}
 9028
 9029static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 9030				     enum bpf_xdp_mode mode)
 9031{
 9032	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 9033
 9034	if (link)
 9035		return link->link.prog;
 9036	return dev->xdp_state[mode].prog;
 9037}
 9038
 9039u8 dev_xdp_prog_count(struct net_device *dev)
 9040{
 9041	u8 count = 0;
 9042	int i;
 9043
 9044	for (i = 0; i < __MAX_XDP_MODE; i++)
 9045		if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
 9046			count++;
 9047	return count;
 9048}
 9049EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
 
 
 
 
 
 9050
 9051u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 9052{
 9053	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
 
 
 9054
 9055	return prog ? prog->aux->id : 0;
 9056}
 
 
 9057
 9058static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
 9059			     struct bpf_xdp_link *link)
 9060{
 9061	dev->xdp_state[mode].link = link;
 9062	dev->xdp_state[mode].prog = NULL;
 9063}
 9064
 9065static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
 9066			     struct bpf_prog *prog)
 9067{
 9068	dev->xdp_state[mode].link = NULL;
 9069	dev->xdp_state[mode].prog = prog;
 9070}
 9071
 9072static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
 9073			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
 9074			   u32 flags, struct bpf_prog *prog)
 9075{
 9076	struct netdev_bpf xdp;
 9077	int err;
 9078
 9079	memset(&xdp, 0, sizeof(xdp));
 9080	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
 9081	xdp.extack = extack;
 9082	xdp.flags = flags;
 9083	xdp.prog = prog;
 9084
 9085	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
 9086	 * "moved" into driver), so they don't increment it on their own, but
 9087	 * they do decrement refcnt when program is detached or replaced.
 9088	 * Given net_device also owns link/prog, we need to bump refcnt here
 9089	 * to prevent drivers from underflowing it.
 9090	 */
 9091	if (prog)
 9092		bpf_prog_inc(prog);
 9093	err = bpf_op(dev, &xdp);
 9094	if (err) {
 9095		if (prog)
 9096			bpf_prog_put(prog);
 9097		return err;
 9098	}
 
 
 9099
 9100	if (mode != XDP_MODE_HW)
 9101		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
 9102
 9103	return 0;
 9104}
 9105
 9106static void dev_xdp_uninstall(struct net_device *dev)
 9107{
 9108	struct bpf_xdp_link *link;
 9109	struct bpf_prog *prog;
 9110	enum bpf_xdp_mode mode;
 9111	bpf_op_t bpf_op;
 9112
 9113	ASSERT_RTNL();
 
 
 9114
 9115	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
 9116		prog = dev_xdp_prog(dev, mode);
 9117		if (!prog)
 9118			continue;
 9119
 9120		bpf_op = dev_xdp_bpf_op(dev, mode);
 9121		if (!bpf_op)
 9122			continue;
 9123
 9124		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9125
 9126		/* auto-detach link from net device */
 9127		link = dev_xdp_link(dev, mode);
 9128		if (link)
 9129			link->dev = NULL;
 9130		else
 9131			bpf_prog_put(prog);
 9132
 9133		dev_xdp_set_link(dev, mode, NULL);
 9134	}
 9135}
 9136
 /* Attach an XDP program or a BPF link to @dev, replacing the current
  * attachment for the selected mode.
  *
  * Either @link or @new_prog/@old_prog may be supplied, never both. When
  * @link is NULL and @new_prog is NULL, no program is installed, which
  * clears the attachment for that mode. @old_prog is the program the caller
  * expects to be attached and is only accepted with XDP_FLAGS_REPLACE.
  * Must be called with RTNL held.
  *
  * On success the per-mode device state is updated and the reference on
  * the previously attached program, if any, is dropped.
  *
  * Returns 0 on success, or -EINVAL, -EBUSY, -EEXIST, -EOPNOTSUPP or the
  * error reported by dev_xdp_install().
  */
 9137static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
 9138			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
 9139			  struct bpf_prog *old_prog, u32 flags)
 9140{
 9141	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
 9142	struct bpf_prog *cur_prog;
 9143	struct net_device *upper;
 9144	struct list_head *iter;
 9145	enum bpf_xdp_mode mode;
 9146	bpf_op_t bpf_op;
 9147	int err;
 9148
 9149	ASSERT_RTNL();
 9150
 9151	/* either link or prog attachment, never both */
 9152	if (link && (new_prog || old_prog))
 9153		return -EINVAL;
 9154	/* link supports only XDP mode flags */
 9155	if (link && (flags & ~XDP_FLAGS_MODES)) {
 9156		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
 9157		return -EINVAL;
 9158	}
 9159	/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
 9160	if (num_modes > 1) {
 9161		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
 9162		return -EINVAL;
 9163	}
 9164	/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
 9165	if (!num_modes && dev_xdp_prog_count(dev) > 1) {
 9166		NL_SET_ERR_MSG(extack,
 9167			       "More than one program loaded, unset mode is ambiguous");
 9168		return -EINVAL;
 9169	}
 9170	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
 9171	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
 9172		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
 9173		return -EINVAL;
 9174	}
 9175
 9176	mode = dev_xdp_mode(dev, flags);
 9177	/* can't replace attached link */
 9178	if (dev_xdp_link(dev, mode)) {
 9179		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
 9180		return -EBUSY;
 9181	}
 9182
 9183	/* don't allow if an upper device already has a program */
 9184	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
 9185		if (dev_xdp_prog_count(upper) > 0) {
 9186			NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
 9187			return -EEXIST;
 9188		}
 9189	}
 9190
     	/* No link is attached in this mode (checked above), so cur_prog is
     	 * the directly attached program, if any.
     	 */
 9191	cur_prog = dev_xdp_prog(dev, mode);
 9192	/* can't replace attached prog with link */
 9193	if (link && cur_prog) {
 9194		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
 9195		return -EBUSY;
 9196	}
     	/* with XDP_FLAGS_REPLACE the attached program must be the expected one */
 9197	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
 9198		NL_SET_ERR_MSG(extack, "Active program does not match expected");
 9199		return -EEXIST;
 9200	}
 9201
 9202	/* put effective new program into new_prog */
 9203	if (link)
 9204		new_prog = link->link.prog;
 9205
     	/* checks that only apply when a program is actually being attached */
 9206	if (new_prog) {
 9207		bool offload = mode == XDP_MODE_HW;
 9208		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
 9209					       ? XDP_MODE_DRV : XDP_MODE_SKB;
 9210
 9211		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
 9212			NL_SET_ERR_MSG(extack, "XDP program already attached");
 9213			return -EBUSY;
 9214		}
 9215		if (!offload && dev_xdp_prog(dev, other_mode)) {
 9216			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
 9217			return -EEXIST;
 9218		}
 9219		if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
 9220			NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
 9221			return -EINVAL;
 9222		}
 9223		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
 9224			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
 9225			return -EINVAL;
 9226		}
 9227		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
 9228			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
 9229			return -EINVAL;
 9230		}
 9231	}
 9232
 9233	/* don't call drivers if the effective program didn't change */
 9234	if (new_prog != cur_prog) {
 9235		bpf_op = dev_xdp_bpf_op(dev, mode);
 9236		if (!bpf_op) {
 9237			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
 9238			return -EOPNOTSUPP;
 9239		}

 9240
 9241		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
 9242		if (err)
 9243			return err;
 9244	}

 9245
     	/* Record the new attachment in the device state, then drop the
     	 * reference on the program that was attached before.
     	 */
 9246	if (link)
 9247		dev_xdp_set_link(dev, mode, link);
 9248	else
 9249		dev_xdp_set_prog(dev, mode, new_prog);
 9250	if (cur_prog)
 9251		bpf_prog_put(cur_prog);

 9252
 9253	return 0;
 9254}
 9255
 9256static int dev_xdp_attach_link(struct net_device *dev,
 9257			       struct netlink_ext_ack *extack,
 9258			       struct bpf_xdp_link *link)
 9259{
 9260	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
 9261}
 9262
 9263static int dev_xdp_detach_link(struct net_device *dev,
 9264			       struct netlink_ext_ack *extack,
 9265			       struct bpf_xdp_link *link)
 9266{
 9267	enum bpf_xdp_mode mode;
 9268	bpf_op_t bpf_op;
 9269
 9270	ASSERT_RTNL();
 9271
 9272	mode = dev_xdp_mode(dev, link->flags);
 9273	if (dev_xdp_link(dev, mode) != link)
 9274		return -EINVAL;
 9275
 9276	bpf_op = dev_xdp_bpf_op(dev, mode);
 9277	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9278	dev_xdp_set_link(dev, mode, NULL);
 9279	return 0;
 9280}
 9281
 9282static void bpf_xdp_link_release(struct bpf_link *link)
 9283{
 9284	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9285
 9286	rtnl_lock();
 9287
 9288	/* if racing with net_device's tear down, xdp_link->dev might be
 9289	 * already NULL, in which case link was already auto-detached
 9290	 */
 9291	if (xdp_link->dev) {
 9292		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
 9293		xdp_link->dev = NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 9294	}
 9295
 9296	rtnl_unlock();
 9297}
 9298
 /* bpf_link detach callback: same work as release; always returns 0. */
static int bpf_xdp_link_detach(struct bpf_link *link)
{
	bpf_xdp_link_release(link);

	return 0;
}
 9304
 9305static void bpf_xdp_link_dealloc(struct bpf_link *link)
 
 
 
 
 
 
 
 
 9306{
 9307	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9308
 9309	kfree(xdp_link);
 
 
 
 
 9310}
 9311
 9312static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
 9313				     struct seq_file *seq)
 9314{
 9315	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9316	u32 ifindex = 0;
 9317
 9318	rtnl_lock();
 9319	if (xdp_link->dev)
 9320		ifindex = xdp_link->dev->ifindex;
 9321	rtnl_unlock();
 9322
 9323	seq_printf(seq, "ifindex:\t%u\n", ifindex);
 9324}
 9325
 9326static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
 9327				       struct bpf_link_info *info)
 9328{
 9329	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9330	u32 ifindex = 0;
 9331
 9332	rtnl_lock();
 9333	if (xdp_link->dev)
 9334		ifindex = xdp_link->dev->ifindex;
 9335	rtnl_unlock();
 9336
 9337	info->xdp.ifindex = ifindex;
 9338	return 0;
 9339}
 9340
 /* bpf_link update_prog callback: replace the program behind an XDP link.
  *
  * @old_prog, when non-NULL, must be the program currently held by the
  * link, otherwise -EPERM is returned. @new_prog must have the same type
  * and expected attach type as the current program (-EINVAL otherwise).
  * Runs under RTNL.
  *
  * Returns 0 on success (including the no-op case where @new_prog is
  * already attached), -ENOLINK when the link has no device any more, or
  * the error from dev_xdp_install().
  */
 9341static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 9342			       struct bpf_prog *old_prog)
 9343{
 9344	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9345	enum bpf_xdp_mode mode;
 9346	bpf_op_t bpf_op;
 9347	int err = 0;
 9348
 9349	rtnl_lock();

 9350
 9351	/* link might have been auto-released already, so fail */
 9352	if (!xdp_link->dev) {
 9353		err = -ENOLINK;
 9354		goto out_unlock;
 9355	}

 9356
     	/* caller named an expected program that is not the attached one */
 9357	if (old_prog && link->prog != old_prog) {
 9358		err = -EPERM;
 9359		goto out_unlock;
 9360	}
     	/* from here on old_prog is the program currently held by the link */
 9361	old_prog = link->prog;
 9362	if (old_prog->type != new_prog->type ||
 9363	    old_prog->expected_attach_type != new_prog->expected_attach_type) {
 9364		err = -EINVAL;
 9365		goto out_unlock;
 9366	}
 9367
 9368	if (old_prog == new_prog) {
 9369		/* no-op, don't disturb drivers */
 9370		bpf_prog_put(new_prog);
 9371		goto out_unlock;
 9372	}
 9373
 9374	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
 9375	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
 9376	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
 9377			      xdp_link->flags, new_prog);
 9378	if (err)
 9379		goto out_unlock;
 9380
     	/* install succeeded: swap the link's program and drop the old one */
 9381	old_prog = xchg(&link->prog, new_prog);
 9382	bpf_prog_put(old_prog);
 9383
 9384out_unlock:
 9385	rtnl_unlock();
 9386	return err;
 9387}
 9388
 9389static const struct bpf_link_ops bpf_xdp_link_lops = {
 9390	.release = bpf_xdp_link_release,
 9391	.dealloc = bpf_xdp_link_dealloc,
 9392	.detach = bpf_xdp_link_detach,
 9393	.show_fdinfo = bpf_xdp_link_show_fdinfo,
 9394	.fill_link_info = bpf_xdp_link_fill_link_info,
 9395	.update_prog = bpf_xdp_link_update,
 9396};
 9397
 /* Create a BPF link that attaches @prog as XDP program to a device.
  *
  * The device is looked up by attr->link_create.target_ifindex in the
  * calling task's network namespace, and attr->link_create.flags become
  * the link's flags. Lookup and attachment are done under RTNL.
  *
  * Returns the value of bpf_link_settle() on success, -EINVAL when the
  * device does not exist, -ENOMEM on allocation failure, or the error from
  * bpf_link_prime() or dev_xdp_attach_link().
  */
 9398int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 9399{
 9400	struct net *net = current->nsproxy->net_ns;
 9401	struct bpf_link_primer link_primer;
 9402	struct bpf_xdp_link *link;
 9403	struct net_device *dev;
 9404	int err, fd;
 9405
 9406	rtnl_lock();
 9407	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
 9408	if (!dev) {
 9409		rtnl_unlock();
 9410		return -EINVAL;
 9411	}
 9412
 9413	link = kzalloc(sizeof(*link), GFP_USER);
 9414	if (!link) {
 9415		err = -ENOMEM;
 9416		goto unlock;
 9417	}
 9418
 9419	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
 9420	link->dev = dev;
 9421	link->flags = attr->link_create.flags;
 9422
 9423	err = bpf_link_prime(&link->link, &link_primer);
 9424	if (err) {
     		/* priming failed: the link is freed directly here */
 9425		kfree(link);
 9426		goto unlock;
 9427	}
 9428
 9429	err = dev_xdp_attach_link(dev, NULL, link);
 9430	rtnl_unlock();

 9431
 9432	if (err) {
     		/* Clear link->dev before cleanup so the link no longer
     		 * refers to a device it was never attached to.
     		 */
 9433		link->dev = NULL;
 9434		bpf_link_cleanup(&link_primer);
 9435		goto out_put_dev;
 9436	}
 9437
 9438	fd = bpf_link_settle(&link_primer);
 9439	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
 9440	dev_put(dev);
 9441	return fd;

 9442
 9443unlock:
 9444	rtnl_unlock();
 9445
 9446out_put_dev:
 9447	dev_put(dev);
 9448	return err;
 9449}
 9450
 9451/**
 9452 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 9453 *	@dev: device
 9454 *	@extack: netlink extended ack
 9455 *	@fd: new program fd or negative value to clear
 9456 *	@expected_fd: old program fd that userspace expects to replace or clear
 9457 *	@flags: xdp-related flags
 9458 *
 9459 *	Set or clear a bpf program for a device
 9460 */
 9461int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 9462		      int fd, int expected_fd, u32 flags)
 9463{
 9464	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
 9465	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
 9466	int err;
 9467
 9468	ASSERT_RTNL();
 9469
 9470	if (fd >= 0) {
 9471		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
 9472						 mode != XDP_MODE_SKB);
 9473		if (IS_ERR(new_prog))
 9474			return PTR_ERR(new_prog);
 9475	}
 9476
 9477	if (expected_fd >= 0) {
 9478		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
 9479						 mode != XDP_MODE_SKB);
 9480		if (IS_ERR(old_prog)) {
 9481			err = PTR_ERR(old_prog);
 9482			old_prog = NULL;
 9483			goto err_out;
 9484		}
 9485	}
 9486
 9487	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
 9488
 9489err_out:
 9490	if (err && new_prog)
 9491		bpf_prog_put(new_prog);
 9492	if (old_prog)
 9493		bpf_prog_put(old_prog);
 9494	return err;
 9495}
 9496
 9497/**
 9498 *	dev_new_index	-	allocate an ifindex
 9499 *	@net: the applicable net namespace
 9500 *
 9501 *	Returns a suitable unique value for a new device interface
 9502 *	number.  The caller must hold the rtnl semaphore or the
 9503 *	dev_base_lock to be sure it remains unique.
 9504 */
 9505static int dev_new_index(struct net *net)
 9506{
 9507	int ifindex = net->ifindex;
 9508
 9509	for (;;) {
 9510		if (++ifindex <= 0)
 9511			ifindex = 1;
 9512		if (!__dev_get_by_index(net, ifindex))
 9513			return net->ifindex = ifindex;
 9514	}
 9515}
 9516
 9517/* Delayed registration/unregisteration */
 9518LIST_HEAD(net_todo_list);
 9519DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 9520
 9521static void net_set_todo(struct net_device *dev)
 9522{
 9523	list_add_tail(&dev->todo_list, &net_todo_list);
 9524	atomic_inc(&dev_net(dev)->dev_unreg_count);
 9525}
 9526
 9527static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
 9528	struct net_device *upper, netdev_features_t features)
 9529{
 9530	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9531	netdev_features_t feature;
 9532	int feature_bit;
 9533
 9534	for_each_netdev_feature(upper_disables, feature_bit) {
 9535		feature = __NETIF_F_BIT(feature_bit);
 9536		if (!(upper->wanted_features & feature)
 9537		    && (features & feature)) {
 9538			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
 9539				   &feature, upper->name);
 9540			features &= ~feature;
 9541		}
 9542	}
 9543
 9544	return features;
 9545}
 9546
 9547static void netdev_sync_lower_features(struct net_device *upper,
 9548	struct net_device *lower, netdev_features_t features)
 9549{
 9550	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9551	netdev_features_t feature;
 9552	int feature_bit;
 9553
 9554	for_each_netdev_feature(upper_disables, feature_bit) {
 9555		feature = __NETIF_F_BIT(feature_bit);
 9556		if (!(features & feature) && (lower->features & feature)) {
 9557			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 9558				   &feature, lower->name);
 9559			lower->wanted_features &= ~feature;
 9560			__netdev_update_features(lower);
 9561
 9562			if (unlikely(lower->features & feature))
 9563				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 9564					    &feature, lower->name);
 9565			else
 9566				netdev_features_change(lower);
 9567		}
 9568	}
 9569}
 9570
 /* Return @features with every bit cleared whose prerequisite feature is
  * missing or that conflicts with another requested feature.
  *
  * @dev is used for log messages and for its gso_partial_features mask;
  * the device itself is not modified. The checks run in sequence, so a
  * bit dropped by an earlier check is seen as absent by the later ones.
  */
 9571static netdev_features_t netdev_fix_features(struct net_device *dev,
 9572	netdev_features_t features)
 9573{
 9574	/* Fix illegal checksum combinations */
 9575	if ((features & NETIF_F_HW_CSUM) &&
 9576	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
 9577		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
 9578		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
 9579	}
 9580

 9581	/* TSO requires that SG is present as well. */
 9582	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
 9583		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
 9584		features &= ~NETIF_F_ALL_TSO;
 9585	}
 9586
     	/* IPv4 TSO (and TSO ECN) need HW_CSUM or IP_CSUM */
 9587	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
 9588					!(features & NETIF_F_IP_CSUM)) {
 9589		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
 9590		features &= ~NETIF_F_TSO;
 9591		features &= ~NETIF_F_TSO_ECN;
 9592	}
 9593
     	/* IPv6 TSO needs HW_CSUM or IPV6_CSUM */
 9594	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
 9595					 !(features & NETIF_F_IPV6_CSUM)) {
 9596		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
 9597		features &= ~NETIF_F_TSO6;
 9598	}
 9599
 9600	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
 9601	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
 9602		features &= ~NETIF_F_TSO_MANGLEID;
 9603
 9604	/* TSO ECN requires that TSO is present as well. */
 9605	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
 9606		features &= ~NETIF_F_TSO_ECN;
 9607
 9608	/* Software GSO depends on SG. */
 9609	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
 9610		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
 9611		features &= ~NETIF_F_GSO;
 9612	}
 9613
 9614	/* GSO partial features require GSO partial be set */
 9615	if ((features & dev->gso_partial_features) &&
 9616	    !(features & NETIF_F_GSO_PARTIAL)) {
 9617		netdev_dbg(dev,
 9618			   "Dropping partially supported GSO features since no GSO partial.\n");
 9619		features &= ~dev->gso_partial_features;
 9620	}
 9621
 9622	if (!(features & NETIF_F_RXCSUM)) {
 9623		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
 9624		 * successfully merged by hardware must also have the
 9625		 * checksum verified by hardware.  If the user does not
 9626		 * want to enable RXCSUM, logically, we should disable GRO_HW.
 9627		 */
 9628		if (features & NETIF_F_GRO_HW) {
 9629			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
 9630			features &= ~NETIF_F_GRO_HW;
 9631		}
 9632	}
 9633
 9634	/* LRO/HW-GRO features cannot be combined with RX-FCS */
 9635	if (features & NETIF_F_RXFCS) {
 9636		if (features & NETIF_F_LRO) {
 9637			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
 9638			features &= ~NETIF_F_LRO;
 9639		}
 9640
 9641		if (features & NETIF_F_GRO_HW) {
 9642			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
 9643			features &= ~NETIF_F_GRO_HW;
 9644		}
 9645	}
 9646
     	/* when both HW-GRO and LRO are requested, HW-GRO is kept */
 9647	if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
 9648		netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
 9649		features &= ~NETIF_F_LRO;
 9650	}
 9651
     	/* TLS TX offload needs HW_CSUM, or both IP_CSUM and IPV6_CSUM */
 9652	if (features & NETIF_F_HW_TLS_TX) {
 9653		bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
 9654			(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
 9655		bool hw_csum = features & NETIF_F_HW_CSUM;
 9656
 9657		if (!ip_csum && !hw_csum) {
 9658			netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
 9659			features &= ~NETIF_F_HW_TLS_TX;
 9660		}
 9661	}
 9662
     	/* TLS RX offload needs RXCSUM */
 9663	if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
 9664		netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
 9665		features &= ~NETIF_F_HW_TLS_RX;
 9666	}
 9667
 9668	return features;
 9669}
 9670
 /* Recompute @dev's feature set and apply it. Requires RTNL.
  *
  * The wanted features are passed through the driver's ndo_fix_features
  * (if any), netdev_fix_features() and the restrictions of all upper
  * devices. When the result differs from dev->features it is handed to
  * ndo_set_features (if any), and lower devices are synced afterwards.
  *
  * Returns -1 when ndo_set_features fails, 0 when err is negative at the
  * end (features were already up to date, or a VLAN filter info call
  * returned a negative value), and 1 otherwise. Callers in this file
  * treat a non-zero result as "send a features-change notification".
  */
 9671int __netdev_update_features(struct net_device *dev)
 9672{
 9673	struct net_device *upper, *lower;
 9674	netdev_features_t features;
 9675	struct list_head *iter;
     	/* stays -1 when the features turn out to be unchanged */
 9676	int err = -1;
 9677
 9678	ASSERT_RTNL();
 9679
 9680	features = netdev_get_wanted_features(dev);
 9681
 9682	if (dev->netdev_ops->ndo_fix_features)
 9683		features = dev->netdev_ops->ndo_fix_features(dev, features);
 9684
 9685	/* driver might be less strict about feature dependencies */
 9686	features = netdev_fix_features(dev, features);
 9687
 9688	/* some features can't be enabled if they're off on an upper device */
 9689	netdev_for_each_upper_dev_rcu(dev, upper, iter)
 9690		features = netdev_sync_upper_features(dev, upper, features);
 9691
 9692	if (dev->features == features)
 9693		goto sync_lower;
 9694
 9695	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
 9696		&dev->features, &features);
 9697
 9698	if (dev->netdev_ops->ndo_set_features)
 9699		err = dev->netdev_ops->ndo_set_features(dev, features);
 9700	else
 9701		err = 0;
 9702
 9703	if (unlikely(err < 0)) {
 9704		netdev_err(dev,
 9705			"set_features() failed (%d); wanted %pNF, left %pNF\n",
 9706			err, &features, &dev->features);
 9707		/* return non-0 since some features might have changed and
 9708		 * it's better to fire a spurious notification than miss it
 9709		 */
 9710		return -1;
 9711	}
 9712
 9713sync_lower:
 9714	/* some features must be disabled on lower devices when disabled
 9715	 * on an upper device (think: bonding master or bridge)
 9716	 */
 9717	netdev_for_each_lower_dev(dev, lower, iter)
 9718		netdev_sync_lower_features(dev, lower, features);
 9719
     	/* Only err == 0 commits @features to dev->features here; a positive
     	 * value from ndo_set_features skips this block.
     	 */
 9720	if (!err) {
 9721		netdev_features_t diff = features ^ dev->features;
 9722
 9723		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9724			/* udp_tunnel_{get,drop}_rx_info both need
 9725			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
 9726			 * device, or they won't do anything.
 9727			 * Thus we need to update dev->features
 9728			 * *before* calling udp_tunnel_get_rx_info,
 9729			 * but *after* calling udp_tunnel_drop_rx_info.
 9730			 */
 9731			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9732				dev->features = features;
 9733				udp_tunnel_get_rx_info(dev);
 9734			} else {
 9735				udp_tunnel_drop_rx_info(dev);
 9736			}
 9737		}
 9738
     		/* same ordering as above: features are set before the "get"
     		 * call, while the "drop" call runs before the final update
     		 */
 9739		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9740			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9741				dev->features = features;
 9742				err |= vlan_get_rx_ctag_filter_info(dev);
 9743			} else {
 9744				vlan_drop_rx_ctag_filter_info(dev);
 9745			}
 9746		}
 9747
 9748		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
 9749			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
 9750				dev->features = features;
 9751				err |= vlan_get_rx_stag_filter_info(dev);
 9752			} else {
 9753				vlan_drop_rx_stag_filter_info(dev);
 9754			}
 9755		}
 9756
 9757		dev->features = features;
 9758	}
 9759
 9760	return err < 0 ? 0 : 1;
 9761}
 9762
 /**
  *	netdev_update_features - recalculate device features
  *	@dev: the device to check
  *
  *	Recalculate the dev->features set and send a features-change
  *	notification when __netdev_update_features() returns non-zero.
  *	Should be called after driver or hardware dependent conditions
  *	might have changed that influence the features.
  */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
 9777
 /**
  *	netdev_change_features - recalculate device features
  *	@dev: the device to check
  *
  *	Recalculate the dev->features set and always send a notification,
  *	whether or not the set changed. To be used instead of
  *	netdev_update_features() when dev->vlan_features might also have
  *	changed, so the changes can be propagated to stacked VLAN devices.
  */
void netdev_change_features(struct net_device *dev)
{
	/* result deliberately ignored: the notification is unconditional */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
 9794
 9795/**
 9796 *	netif_stacked_transfer_operstate -	transfer operstate
 9797 *	@rootdev: the root or lower level device to transfer state from
 9798 *	@dev: the device to transfer operstate to
 9799 *
 9800 *	Transfer operational state from root to device. This is normally
 9801 *	called when a stacking relationship exists between the root
 9802 *	device and the device(a leaf device).
 9803 */
 9804void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 9805					struct net_device *dev)
 9806{
 9807	if (rootdev->operstate == IF_OPER_DORMANT)
 9808		netif_dormant_on(dev);
 9809	else
 9810		netif_dormant_off(dev);
 9811
 9812	if (rootdev->operstate == IF_OPER_TESTING)
 9813		netif_testing_on(dev);
 9814	else
 9815		netif_testing_off(dev);
 9816
 9817	if (netif_carrier_ok(rootdev))
 9818		netif_carrier_on(dev);
 9819	else
 9820		netif_carrier_off(dev);
 9821}
 9822EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 9823
 
 9824static int netif_alloc_rx_queues(struct net_device *dev)
 9825{
 9826	unsigned int i, count = dev->num_rx_queues;
 9827	struct netdev_rx_queue *rx;
 9828	size_t sz = count * sizeof(*rx);
 9829	int err = 0;
 9830
 9831	BUG_ON(count < 1);
 9832
 9833	rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
 9834	if (!rx)
 
 9835		return -ENOMEM;
 9836
 9837	dev->_rx = rx;
 9838
 9839	for (i = 0; i < count; i++) {
 9840		rx[i].dev = dev;
 9841
 9842		/* XDP RX-queue setup */
 9843		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
 9844		if (err < 0)
 9845			goto err_rxq_info;
 9846	}
 9847	return 0;
 9848
 9849err_rxq_info:
 9850	/* Rollback successful reg's and free other resources */
 9851	while (i--)
 9852		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 9853	kvfree(dev->_rx);
 9854	dev->_rx = NULL;
 9855	return err;
 9856}
 9857
 9858static void netif_free_rx_queues(struct net_device *dev)
 9859{
 9860	unsigned int i, count = dev->num_rx_queues;
 9861
 9862	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
 9863	if (!dev->_rx)
 9864		return;
 9865
 9866	for (i = 0; i < count; i++)
 9867		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
 9868
 9869	kvfree(dev->_rx);
 9870}
 
 9871
 9872static void netdev_init_one_queue(struct net_device *dev,
 9873				  struct netdev_queue *queue, void *_unused)
 9874{
 9875	/* Initialize queue lock */
 9876	spin_lock_init(&queue->_xmit_lock);
 9877	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
 9878	queue->xmit_lock_owner = -1;
 9879	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
 9880	queue->dev = dev;
 9881#ifdef CONFIG_BQL
 9882	dql_init(&queue->dql, HZ);
 9883#endif
 9884}
 9885
 9886static void netif_free_tx_queues(struct net_device *dev)
 9887{
 9888	kvfree(dev->_tx);
 9889}
 9890
 9891static int netif_alloc_netdev_queues(struct net_device *dev)
 9892{
 9893	unsigned int count = dev->num_tx_queues;
 9894	struct netdev_queue *tx;
 9895	size_t sz = count * sizeof(*tx);
 9896
 9897	if (count < 1 || count > 0xffff)
 9898		return -EINVAL;
 9899
 9900	tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
 9901	if (!tx)
 
 9902		return -ENOMEM;
 9903
 9904	dev->_tx = tx;
 9905
 9906	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 9907	spin_lock_init(&dev->tx_global_lock);
 9908
 9909	return 0;
 9910}
 9911
 9912void netif_tx_stop_all_queues(struct net_device *dev)
 9913{
 9914	unsigned int i;
 9915
 9916	for (i = 0; i < dev->num_tx_queues; i++) {
 9917		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 9918
 9919		netif_tx_stop_queue(txq);
 9920	}
 9921}
 9922EXPORT_SYMBOL(netif_tx_stop_all_queues);
 9923
 9924/**
 9925 * register_netdevice() - register a network device
 9926 * @dev: device to register
 9927 *
 9928 * Take a prepared network device structure and make it externally accessible.
 9929 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
 9930 * Callers must hold the rtnl lock - you may want register_netdev()
 9931 * instead of this.
 
 
 
 
 
 
 
 9932 */
 
 9933int register_netdevice(struct net_device *dev)
 9934{
 9935	int ret;
 9936	struct net *net = dev_net(dev);
 9937
 9938	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
 9939		     NETDEV_FEATURE_COUNT);
 9940	BUG_ON(dev_boot_phase);
 9941	ASSERT_RTNL();
 9942
 9943	might_sleep();
 9944
 9945	/* When net_device's are persistent, this will be fatal. */
 9946	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 9947	BUG_ON(!net);
 9948
 9949	ret = ethtool_check_ops(dev->ethtool_ops);
 9950	if (ret)
 9951		return ret;
 9952
 9953	spin_lock_init(&dev->addr_list_lock);
 9954	netdev_set_addr_lockdep_class(dev);
 9955
 9956	ret = dev_get_valid_name(net, dev, dev->name);
 9957	if (ret < 0)
 9958		goto out;
 9959
     	/* preset the error reported if the name node allocation fails */
 9960	ret = -ENOMEM;
 9961	dev->name_node = netdev_name_node_head_alloc(dev);
 9962	if (!dev->name_node)
 9963		goto out;
 9964
 9965	/* Init, if this function is available */
 9966	if (dev->netdev_ops->ndo_init) {
 9967		ret = dev->netdev_ops->ndo_init(dev);
 9968		if (ret) {
     			/* a positive return from ndo_init is reported as -EIO */
 9969			if (ret > 0)
 9970				ret = -EIO;
 9971			goto err_free_name;
 9972		}
 9973	}
 9974
     	/* A device advertising VLAN CTAG filtering must provide both the
     	 * add_vid and kill_vid callbacks.
     	 */
 9975	if (((dev->hw_features | dev->features) &
 9976	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
 9977	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
 9978	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
 9979		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
 9980		ret = -EINVAL;
 9981		goto err_uninit;
 9982	}
 9983
     	/* Allocate an ifindex when none was preset; a preset ifindex that is
     	 * already used in this namespace fails with -EBUSY.
     	 */
 9984	ret = -EBUSY;
 9985	if (!dev->ifindex)
 9986		dev->ifindex = dev_new_index(net);
 9987	else if (__dev_get_by_index(net, dev->ifindex))
 9988		goto err_uninit;
 9989
 9990	/* Transfer changeable features to wanted_features and enable
 9991	 * software offloads (GSO and GRO).
 9992	 */
 9993	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
 9994	dev->features |= NETIF_F_SOFT_FEATURES;
 9995
 9996	if (dev->udp_tunnel_nic_info) {
 9997		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9998		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9999	}
10000
10001	dev->wanted_features = dev->features & dev->hw_features;
10002
     	/* Added after wanted_features was computed above, so NOCACHE_COPY
     	 * becomes changeable without being part of the wanted set here.
     	 */
10003	if (!(dev->flags & IFF_LOOPBACK))
10004		dev->hw_features |= NETIF_F_NOCACHE_COPY;
10005
10006	/* If IPv4 TCP segmentation offload is supported we should also
10007	 * allow the device to enable segmenting the frame with the option
10008	 * of ignoring a static IP ID value.  This doesn't enable the
10009	 * feature itself but allows the user to enable it later.
10010	 */
10011	if (dev->hw_features & NETIF_F_TSO)
10012		dev->hw_features |= NETIF_F_TSO_MANGLEID;
10013	if (dev->vlan_features & NETIF_F_TSO)
10014		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10015	if (dev->mpls_features & NETIF_F_TSO)
10016		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10017	if (dev->hw_enc_features & NETIF_F_TSO)
10018		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10019
10020	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10021	 */
10022	dev->vlan_features |= NETIF_F_HIGHDMA;
10023
10024	/* Make NETIF_F_SG inheritable to tunnel devices.
10025	 */
10026	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10027
10028	/* Make NETIF_F_SG inheritable to MPLS.
10029	 */
10030	dev->mpls_features |= NETIF_F_SG;
10031
10032	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10033	ret = notifier_to_errno(ret);
10034	if (ret)
10035		goto err_uninit;
10036
     	/* reg_state records whether the kobject registration succeeded; it
     	 * is written under dev_base_lock.
     	 */
10037	ret = netdev_register_kobject(dev);
10038	write_lock(&dev_base_lock);
10039	dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
10040	write_unlock(&dev_base_lock);
10041	if (ret)
10042		goto err_uninit_notify;

10043
10044	__netdev_update_features(dev);
10045
10046	/*
10047	 *	Default initial state at registry is that the
10048	 *	device is present.
10049	 */
10050
10051	set_bit(__LINK_STATE_PRESENT, &dev->state);
10052
10053	linkwatch_init_dev(dev);
10054
10055	dev_init_scheduler(dev);
10056
10057	netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
10058	list_netdevice(dev);
10059
10060	add_device_randomness(dev->dev_addr, dev->addr_len);
10061
10062	/* If the device has permanent device address, driver should
10063	 * set dev_addr and also addr_assign_type should be set to
10064	 * NET_ADDR_PERM (default value).
10065	 */
10066	if (dev->addr_assign_type == NET_ADDR_PERM)
10067		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10068
10069	/* Notify protocols, that a new device appeared. */
10070	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10071	ret = notifier_to_errno(ret);
10072	if (ret) {
10073		/* Expect explicit free_netdev() on failure */
10074		dev->needs_free_netdev = false;
10075		unregister_netdevice_queue(dev, NULL);
10076		goto out;
10077	}
10078	/*
10079	 *	Prevent userspace races by waiting until the network
10080	 *	device is fully setup before sending notifications.
10081	 */
10082	if (!dev->rtnl_link_ops ||
10083	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10084		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
10085
10086out:
10087	return ret;
10088
     	/* Error unwinding; each label falls through to the ones below it and
     	 * finally jumps back to "out" to return ret.
     	 */
10089err_uninit_notify:
10090	call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
10091err_uninit:
10092	if (dev->netdev_ops->ndo_uninit)
10093		dev->netdev_ops->ndo_uninit(dev);
10094	if (dev->priv_destructor)
10095		dev->priv_destructor(dev);
10096err_free_name:
10097	netdev_name_node_free(dev->name_node);
10098	goto out;
10099}
10100EXPORT_SYMBOL(register_netdevice);
10101
10102/**
10103 *	init_dummy_netdev	- init a dummy network device for NAPI
10104 *	@dev: device to init
10105 *
10106 *	This takes a network device structure and initialize the minimum
10107 *	amount of fields so it can be used to schedule NAPI polls without
10108 *	registering a full blown interface. This is to be used by drivers
10109 *	that need to tie several hardware interfaces to a single NAPI
10110 *	poll scheduler due to HW limitations.
10111 */
10112int init_dummy_netdev(struct net_device *dev)
10113{
10114	/* Clear everything. Note we don't initialize spinlocks
10115	 * are they aren't supposed to be taken by any of the
10116	 * NAPI code and this dummy netdev is supposed to be
10117	 * only ever used for NAPI polls
10118	 */
10119	memset(dev, 0, sizeof(struct net_device));
10120
10121	/* make sure we BUG if trying to hit standard
10122	 * register/unregister code path
10123	 */
10124	dev->reg_state = NETREG_DUMMY;
10125
10126	/* NAPI wants this */
10127	INIT_LIST_HEAD(&dev->napi_list);
10128
10129	/* a dummy interface is started by default */
10130	set_bit(__LINK_STATE_PRESENT, &dev->state);
10131	set_bit(__LINK_STATE_START, &dev->state);
10132
10133	/* napi_busy_loop stats accounting wants this */
10134	dev_net_set(dev, &init_net);
10135
10136	/* Note : We dont allocate pcpu_refcnt for dummy devices,
10137	 * because users of this 'device' dont need to change
10138	 * its refcount.
10139	 */
10140
10141	return 0;
10142}
10143EXPORT_SYMBOL_GPL(init_dummy_netdev);
10144
10145
10146/**
10147 *	register_netdev	- register a network device
10148 *	@dev: device to register
10149 *
10150 *	Take a completed network device structure and add it to the kernel
10151 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10152 *	chain. 0 is returned on success. A negative errno code is returned
10153 *	on a failure to set up the device, or if the name is a duplicate.
10154 *
10155 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10156 *	and expands the device name if you passed a format string to
10157 *	alloc_netdev.
10158 */
10159int register_netdev(struct net_device *dev)
10160{
10161	int err;
10162
10163	if (rtnl_lock_killable())
10164		return -EINTR;
10165	err = register_netdevice(dev);
10166	rtnl_unlock();
10167	return err;
10168}
10169EXPORT_SYMBOL(register_netdev);
10170
10171int netdev_refcnt_read(const struct net_device *dev)
10172{
10173#ifdef CONFIG_PCPU_DEV_REFCNT
10174	int i, refcnt = 0;
10175
10176	for_each_possible_cpu(i)
10177		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10178	return refcnt;
10179#else
10180	return refcount_read(&dev->dev_refcnt);
10181#endif
10182}
10183EXPORT_SYMBOL(netdev_refcnt_read);
10184
/* Interval, in seconds, between two "waiting for %s to become free"
 * warnings printed by netdev_wait_allrefs_any().
 */
int netdev_unregister_timeout_secs __read_mostly = 10;

/* Lower and upper bound, in milliseconds, of the sleep between two
 * reference count polls in netdev_wait_allrefs_any().
 */
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
 * netdev_wait_allrefs_any - wait until all references are gone.
 * @list: list of net_devices to wait on
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * Return: the first device found on @list whose reference count is
 * down to 1. The function keeps looping until such a device exists.
 */
static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
{
	unsigned long rebroadcast_time, warning_time;
	struct net_device *dev;
	int wait = 0;

	rebroadcast_time = warning_time = jiffies;

	/* Fast path: a device may already be down to its last reference */
	list_for_each_entry(dev, list, todo_list)
		if (netdev_refcnt_read(dev) == 1)
			return dev;

	while (true) {
		/* At most once per second, ask reference holders again */
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			list_for_each_entry(dev, list, todo_list)
				call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			/* The RTNL is dropped around rcu_barrier() and
			 * re-taken for the linkwatch check below.
			 */
			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			list_for_each_entry(dev, list, todo_list)
				if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
					     &dev->state)) {
					/* We must not have linkwatch events
					 * pending on unregister. If this
					 * happens, we simply run the queue
					 * unscheduled, resulting in a noop
					 * for this device.
					 */
					linkwatch_run_queue();
					/* linkwatch_run_queue() takes no
					 * device argument; it is called once
					 * and the scan stops.
					 */
					break;
				}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		/* The first iteration only waits for RCU callbacks; later
		 * iterations sleep, doubling the delay each time from
		 * WAIT_REFS_MIN_MSECS up to WAIT_REFS_MAX_MSECS.
		 */
		if (!wait) {
			rcu_barrier();
			wait = WAIT_REFS_MIN_MSECS;
		} else {
			msleep(wait);
			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
		}

		list_for_each_entry(dev, list, todo_list)
			if (netdev_refcnt_read(dev) == 1)
				return dev;

		/* Still stuck: complain about every device on the list each
		 * time netdev_unregister_timeout_secs has elapsed.
		 */
		if (time_after(jiffies, warning_time +
			       READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
			list_for_each_entry(dev, list, todo_list) {
				pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
					 dev->name, netdev_refcnt_read(dev));
				ref_tracker_dir_print(&dev->refcnt_tracker, 10);
			}

			warning_time = jiffies;
		}
	}
}
10267
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *      ...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * The function drops the RTNL itself (via __rtnl_unlock()) after taking
 * a private snapshot of net_todo_list.
 */
void netdev_run_todo(void)
{
	struct net_device *dev, *tmp;
	struct list_head list;
#ifdef CONFIG_LOCKDEP
	struct list_head unlink_list;

	/* Take over the pending unlink list and recompute nested_level of
	 * each device on it from its lower_level.
	 */
	list_replace_init(&net_unlink_list, &unlink_list);

	while (!list_empty(&unlink_list)) {
		/* Note: this declaration shadows the function-scope 'dev' */
		struct net_device *dev = list_first_entry(&unlink_list,
							  struct net_device,
							  unlink_list);
		list_del_init(&dev->unlink_list);
		dev->nested_level = dev->lower_level - 1;
	}
#endif

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();

	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	/* Phase 1: mark every queued device NETREG_UNREGISTERED; devices
	 * in an unexpected state are warned about and dropped from the list.
	 */
	list_for_each_entry_safe(dev, tmp, &list, todo_list) {
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			netdev_WARN(dev, "run_todo but not unregistering\n");
			list_del(&dev->todo_list);
			continue;
		}

		write_lock(&dev_base_lock);
		dev->reg_state = NETREG_UNREGISTERED;
		write_unlock(&dev_base_lock);
		linkwatch_forget_dev(dev);
	}

	/* Phase 2: release devices one by one, in whatever order they reach
	 * their last reference.
	 */
	while (!list_empty(&list)) {
		dev = netdev_wait_allrefs_any(&list);
		list_del(&dev->todo_list);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev) != 1);
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));

		if (dev->priv_destructor)
			dev->priv_destructor(dev);
		if (dev->needs_free_netdev)
			free_netdev(dev);

		/* NOTE(review): dev is still dereferenced after free_netdev();
		 * presumably the kobject reference dropped just below keeps
		 * the memory alive until then - confirm.
		 */
		if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
			wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
10355
10356/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10357 * all the same fields in the same order as net_device_stats, with only
10358 * the type differing, but rtnl_link_stats64 may have additional fields
10359 * at the end for newer counters.
10360 */
10361void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10362			     const struct net_device_stats *netdev_stats)
10363{
10364	size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10365	const atomic_long_t *src = (atomic_long_t *)netdev_stats;
 
 
 
 
10366	u64 *dst = (u64 *)stats64;
10367
10368	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
 
10369	for (i = 0; i < n; i++)
10370		dst[i] = (unsigned long)atomic_long_read(&src[i]);
10371	/* zero out counters that only exist in rtnl_link_stats64 */
10372	memset((char *)stats64 + n * sizeof(u64), 0,
10373	       sizeof(*stats64) - n * sizeof(u64));
10374}
10375EXPORT_SYMBOL(netdev_stats_to_stats64);
10376
10377struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
10378{
10379	struct net_device_core_stats __percpu *p;
10380
10381	p = alloc_percpu_gfp(struct net_device_core_stats,
10382			     GFP_ATOMIC | __GFP_NOWARN);
10383
10384	if (p && cmpxchg(&dev->core_stats, NULL, p))
10385		free_percpu(p);
10386
10387	/* This READ_ONCE() pairs with the cmpxchg() above */
10388	return READ_ONCE(dev->core_stats);
10389}
10390EXPORT_SYMBOL(netdev_core_stats_alloc);
10391
10392/**
10393 *	dev_get_stats	- get network device statistics
10394 *	@dev: device to get statistics from
10395 *	@storage: place to store stats
10396 *
10397 *	Get network statistics from device. Return @storage.
10398 *	The device driver may provide its own method by setting
10399 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10400 *	otherwise the internal statistics structure is used.
10401 */
10402struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10403					struct rtnl_link_stats64 *storage)
10404{
10405	const struct net_device_ops *ops = dev->netdev_ops;
10406	const struct net_device_core_stats __percpu *p;
10407
10408	if (ops->ndo_get_stats64) {
10409		memset(storage, 0, sizeof(*storage));
10410		ops->ndo_get_stats64(dev, storage);
10411	} else if (ops->ndo_get_stats) {
10412		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10413	} else {
10414		netdev_stats_to_stats64(storage, &dev->stats);
10415	}
10416
10417	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10418	p = READ_ONCE(dev->core_stats);
10419	if (p) {
10420		const struct net_device_core_stats *core_stats;
10421		int i;
10422
10423		for_each_possible_cpu(i) {
10424			core_stats = per_cpu_ptr(p, i);
10425			storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
10426			storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
10427			storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
10428			storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
10429		}
10430	}
10431	return storage;
10432}
10433EXPORT_SYMBOL(dev_get_stats);
10434
10435/**
10436 *	dev_fetch_sw_netstats - get per-cpu network device statistics
10437 *	@s: place to store stats
10438 *	@netstats: per-cpu network stats to read from
10439 *
10440 *	Read per-cpu network statistics and populate the related fields in @s.
10441 */
10442void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10443			   const struct pcpu_sw_netstats __percpu *netstats)
10444{
10445	int cpu;
10446
10447	for_each_possible_cpu(cpu) {
10448		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
10449		const struct pcpu_sw_netstats *stats;
10450		unsigned int start;
10451
10452		stats = per_cpu_ptr(netstats, cpu);
10453		do {
10454			start = u64_stats_fetch_begin(&stats->syncp);
10455			rx_packets = u64_stats_read(&stats->rx_packets);
10456			rx_bytes   = u64_stats_read(&stats->rx_bytes);
10457			tx_packets = u64_stats_read(&stats->tx_packets);
10458			tx_bytes   = u64_stats_read(&stats->tx_bytes);
10459		} while (u64_stats_fetch_retry(&stats->syncp, start));
10460
10461		s->rx_packets += rx_packets;
10462		s->rx_bytes   += rx_bytes;
10463		s->tx_packets += tx_packets;
10464		s->tx_bytes   += tx_bytes;
10465	}
10466}
10467EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10468
/**
 *	dev_get_tstats64 - ndo_get_stats64 implementation
 *	@dev: device to get statistics from
 *	@s: place to store stats
 *
 *	Populate @s from dev->stats and dev->tstats. Can be used as
 *	ndo_get_stats64() callback.
 */
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
{
	/* Order matters: the conversion overwrites all of *s, the per-cpu
	 * fetch then adds its rx/tx packet and byte totals on top.
	 */
	netdev_stats_to_stats64(s, &dev->stats);
	dev_fetch_sw_netstats(s, dev->tstats);
}
EXPORT_SYMBOL_GPL(dev_get_tstats64);
10483
/**
 *	dev_ingress_queue_create - get or create the ingress queue of a device
 *	@dev: device owning the queue
 *
 *	Returns the existing ingress queue if there is one. Otherwise, with
 *	CONFIG_NET_CLS_ACT, a new queue is allocated, bound to noop_qdisc
 *	and published in @dev->ingress_queue; NULL is returned if that
 *	allocation fails. Without CONFIG_NET_CLS_ACT the result of
 *	dev_ingress_queue() is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* publish only once the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
10501
10502static const struct ethtool_ops default_ethtool_ops;
10503
10504void netdev_set_default_ethtool_ops(struct net_device *dev,
10505				    const struct ethtool_ops *ops)
10506{
10507	if (dev->ethtool_ops == &default_ethtool_ops)
10508		dev->ethtool_ops = ops;
10509}
10510EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10511
10512/**
10513 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
10514 * @dev: netdev to enable the IRQ coalescing on
10515 *
10516 * Sets a conservative default for SW IRQ coalescing. Users can use
10517 * sysfs attributes to override the default values.
10518 */
10519void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
10520{
10521	WARN_ON(dev->reg_state == NETREG_REGISTERED);
10522
10523	dev->gro_flush_timeout = 20000;
10524	dev->napi_defer_hard_irqs = 1;
10525}
10526EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
10527
10528void netdev_freemem(struct net_device *dev)
10529{
10530	char *addr = (char *)dev - dev->padded;
10531
10532	kvfree(addr);
10533}
10534
/**
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization.  Also allocates subqueue structs
 * for each queue on the device.
 *
 * Return: the new device, or NULL if @txqs or @rxqs is zero or if any
 * allocation fails.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	unsigned int alloc_size;
	struct net_device *p;

	/* The name is strcpy()'d into dev->name below, so it must fit */
	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
	if (!p)
		return NULL;

	/* Remember how far dev was shifted for alignment, so that
	 * netdev_freemem() can recover the pointer returned by kvzalloc().
	 */
	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	/* The device starts out holding one reference */
	ref_tracker_dir_init(&dev->refcnt_tracker, 128);
#ifdef CONFIG_PCPU_DEV_REFCNT
	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;
	__dev_hold(dev);
#else
	refcount_set(&dev->dev_refcnt, 1);
#endif

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	/* Defaults; set before setup() runs so that it may override them */
	dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
	dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
	dev->tso_max_segs = TSO_MAX_SEGS;
	dev->upper_level = 1;
	dev->lower_level = 1;
#ifdef CONFIG_LOCKDEP
	dev->nested_level = 0;
	INIT_LIST_HEAD(&dev->unlink_list);
#endif

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
	INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
	hash_init(dev->qdisc_hash);
#endif
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	setup(dev);

	/* setup() left tx_queue_len at zero: flag the device IFF_NO_QUEUE
	 * and fall back to the default length.
	 */
	if (!dev->tx_queue_len) {
		dev->priv_flags |= IFF_NO_QUEUE;
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
	}

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	/* Keep ethtool_ops chosen by setup(), else use the placeholder */
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_netdev_init(dev);

	return dev;

	/* Failure after setup(): free_netdev() undoes everything done so far */
free_all:
	free_netdev(dev);
	return NULL;

	/* Early failures: only the refcount and the memory exist yet */
free_pcpu:
#ifdef CONFIG_PCPU_DEV_REFCNT
	free_percpu(dev->pcpu_refcnt);
free_dev:
#endif
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
10668
/**
 * free_netdev - free network device
 * @dev: device
 *
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released. If this
 * is the last reference then it will be freed. Must be called in process
 * context.
 *
 * If @dev is still in the NETREG_UNREGISTERING state nothing is freed
 * here; the device is only flagged with needs_free_netdev.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	might_sleep();

	/* When called immediately after register_netdevice() failed the unwind
	 * handling may still be dismantling the device. Handle that case by
	 * deferring the free.
	 */
	if (dev->reg_state == NETREG_UNREGISTERING) {
		ASSERT_RTNL();
		dev->needs_free_netdev = true;
		return;
	}

	netif_free_tx_queues(dev);
	netif_free_rx_queues(dev);

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	/* Delete every NAPI instance still attached to the device */
	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	/* Release per-cpu areas, clearing each pointer after its free */
	ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;
#endif
	free_percpu(dev->core_stats);
	dev->core_stats = NULL;
	free_percpu(dev->xdp_bulkq);
	dev->xdp_bulkq = NULL;

	/* Compatibility with error handling in drivers: in the
	 * NETREG_UNINITIALIZED state the memory is released directly.
	 */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		netdev_freemem(dev);
		return;
	}

	/* Any state other than NETREG_UNREGISTERED is a bug at this point */
	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
10728
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used when the RTNL is held,
 *	the regular one otherwise. May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
10744
10745/**
10746 *	unregister_netdevice_queue - remove device from the kernel
10747 *	@dev: device
10748 *	@head: list
10749 *
10750 *	This function shuts down a device interface and removes it
10751 *	from the kernel tables.
10752 *	If head not NULL, device is queued to be unregistered later.
10753 *
10754 *	Callers must hold the rtnl semaphore.  You may want
10755 *	unregister_netdev() instead of this.
10756 */
10757
10758void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10759{
10760	ASSERT_RTNL();
10761
10762	if (head) {
10763		list_move_tail(&dev->unreg_list, head);
10764	} else {
10765		LIST_HEAD(single);
10766
10767		list_add(&dev->unreg_list, &single);
10768		unregister_netdevice_many(&single);
10769	}
10770}
10771EXPORT_SYMBOL(unregister_netdevice_queue);
10772
/**
 *	unregister_netdevice_many_notify - unregister many devices
 *	@head: list of devices, linked through their unreg_list member
 *	@portid: passed on to rtmsg_ifinfo_build_skb() and rtmsg_ifinfo_send()
 *	@nlh: netlink header passed on to the same helpers; its sequence
 *	      number is read with nlmsg_seq()
 *
 *	Closes every device on @head, unlinks it from the device chain,
 *	notifies protocols, tears down its queueing, addresses, names and
 *	kobject, and finally queues it on the todo list. @head is removed
 *	from the list with list_del() before returning.
 *
 *	The RTNL must be held (ASSERT_RTNL()).
 */
void unregister_netdevice_many_notify(struct list_head *head,
				      u32 portid, const struct nlmsghdr *nlh)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	if (list_empty(head))
		return;

	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		/* Anything but a fully registered device is a bug here */
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head, true);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		write_lock(&dev_base_lock);
		unlist_netdevice(dev, false);
		dev->reg_state = NETREG_UNREGISTERING;
		write_unlock(&dev_base_lock);
	}
	flush_all_backlogs();

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);

		dev_xdp_uninstall(dev);

		netdev_offload_xstats_disable_all(dev);

		/* Notify protocols, that we are about to destroy
		 * this device. They should clean all the things.
		 */
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/* The RTM_DELLINK message is built here but only sent after
		 * ndo_uninit() has run, further down.
		 */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
						     GFP_KERNEL, NULL, 0,
						     portid, nlmsg_seq(nlh));

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		netdev_name_node_alt_flush(dev);
		netdev_name_node_free(dev->name_node);

		call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));
		WARN_ON(netdev_has_any_lower_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	/* Drop the reference tracked by dev_registered_tracker and hand the
	 * device over to the todo list.
	 */
	list_for_each_entry(dev, head, unreg_list) {
		netdev_put(dev, &dev->dev_registered_tracker);
		net_set_todo(dev);
	}

	/* Unlink the caller's list head itself from the list */
	list_del(head);
}
10877
/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
 *
 *	Same as unregister_netdevice_many_notify() with a zero portid and
 *	no netlink header.
 *
 *  Note: As most callers use a stack allocated list_head,
 *  we force a list_del() to make sure stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
	unregister_netdevice_many_notify(head, 0, NULL);
}
EXPORT_SYMBOL(unregister_netdevice_many);
10890
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	/* Unlike register_netdev(), the lock is taken non-killably here */
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
10909
/**
 *	__dev_change_net_namespace - move device to different nethost namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *	@new_ifindex: If not zero, specifies device index in the target
 *	              namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Failure codes set by this function: -EINVAL when the device is
 *	namespace local or not registered, -EEXIST when the name is taken
 *	in @net and @pat is NULL, -EBUSY when @new_ifindex is already used
 *	in @net. A negative result of dev_get_valid_name() is passed on.
 *
 *	Callers must hold the rtnl semaphore.
 */

int __dev_change_net_namespace(struct net_device *dev, struct net *net,
			       const char *pat, int new_ifindex)
{
	struct net *net_old = dev_net(dev);
	int err, new_nsid;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing todo */
	err = 0;
	if (net_eq(net_old, net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (netdev_name_in_use(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		err = dev_get_valid_name(net, dev, pat);
		if (err < 0)
			goto out;
	}

	/* Check that new_ifindex isn't used yet. */
	err = -EBUSY;
	if (new_ifindex && __dev_get_by_index(net, new_ifindex))
		goto out;

	/* All checks passed; nothing below this point bails out early. */

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	unlist_netdevice(dev, true);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	 * this device. They should clean all the things.
	 *
	 * Note that dev->reg_state stays at NETREG_REGISTERED.
	 * This is wanted because this way 8021q and macvlan know
	 * the device is just moving and can keep their slaves up.
	 */
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();

	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
	/* If there is an ifindex conflict assign a new one */
	if (!new_ifindex) {
		if (__dev_get_by_index(net, dev->ifindex))
			new_ifindex = dev_new_index(net);
		else
			new_ifindex = dev->ifindex;
	}

	/* Sent while dev still belongs to the old namespace; the message
	 * carries the new namespace id and the new ifindex.
	 */
	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
			    new_ifindex);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	netdev_adjacent_del_links(dev);

	/* Move per-net netdevice notifiers that are following the netdevice */
	move_netdevice_notifiers_dev_net(dev, net);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);
	dev->ifindex = new_ifindex;

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
	netdev_adjacent_add_links(dev);

	/* Fixup kobjects */
	/* Failures from here on only warn; the move is not rolled back */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Adapt owner in case owning user namespace of target network
	 * namespace is different from the original one.
	 */
	err = netdev_change_owner(dev, net_old, net);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);

	synchronize_net();
	/* Success is reported even if one of the WARN_ON()s above fired */
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
11052
/**
 *	dev_cpu_dead - take over the networking work of an offline CPU
 *	@oldcpu: the CPU that went offline
 *
 *	Moves the completion queue, the output queue and the NAPI poll
 *	list of @oldcpu onto the CPU running this function, sends its
 *	pending RPS IPIs and re-injects its queued input packets through
 *	netif_rx(). Always returns 0.
 *
 *	NOTE(review): presumably registered as a CPU hotplug callback
 *	elsewhere in the file - confirm.
 */
static int dev_cpu_dead(unsigned int oldcpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu;
	struct softnet_data *sd, *oldsd, *remsd = NULL;

	/* The queue splicing below runs with local interrupts disabled */
	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception :
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		/* The backlog NAPI is not migrated; its state is cleared */
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Without CONFIG_RPS remsd stays NULL for the call below */
#ifdef CONFIG_RPS
	remsd = oldsd->rps_ipi_list;
	oldsd->rps_ipi_list = NULL;
#endif
	/* send out pending IPI's on offline CPU */
	net_rps_send_ipi(remsd);

	/* Process offline CPU's input_pkt_queue */
	/* process_queue is drained first, then input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}

	return 0;
}
11118
 
11119/**
11120 *	netdev_increment_features - increment feature set by one
11121 *	@all: current feature set
11122 *	@one: new feature set
11123 *	@mask: mask feature set
11124 *
11125 *	Computes a new feature set after adding a device with feature set
11126 *	@one to the master device with current feature set @all.  Will not
11127 *	enable anything that is off in @mask. Returns the new feature set.
11128 */
11129netdev_features_t netdev_increment_features(netdev_features_t all,
11130	netdev_features_t one, netdev_features_t mask)
11131{
11132	if (mask & NETIF_F_HW_CSUM)
11133		mask |= NETIF_F_CSUM_MASK;
11134	mask |= NETIF_F_VLAN_CHALLENGED;
11135
11136	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11137	all &= one | ~NETIF_F_ALL_FOR_ALL;
11138
11139	/* If one device supports hw checksumming, set for all. */
11140	if (all & NETIF_F_HW_CSUM)
11141		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11142
11143	return all;
11144}
11145EXPORT_SYMBOL(netdev_increment_features);
11146
11147static struct hlist_head * __net_init netdev_create_hash(void)
11148{
11149	int i;
11150	struct hlist_head *hash;
11151
11152	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11153	if (hash != NULL)
11154		for (i = 0; i < NETDEV_HASHENTRIES; i++)
11155			INIT_HLIST_HEAD(&hash[i]);
11156
11157	return hash;
11158}
11159
11160/* Initialize per network namespace state */
11161static int __net_init netdev_init(struct net *net)
11162{
11163	BUILD_BUG_ON(GRO_HASH_BUCKETS >
11164		     8 * sizeof_field(struct napi_struct, gro_bitmask));
11165
11166	INIT_LIST_HEAD(&net->dev_base_head);
11167
11168	net->dev_name_head = netdev_create_hash();
11169	if (net->dev_name_head == NULL)
11170		goto err_name;
11171
11172	net->dev_index_head = netdev_create_hash();
11173	if (net->dev_index_head == NULL)
11174		goto err_idx;
11175
11176	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11177
11178	return 0;
11179
11180err_idx:
11181	kfree(net->dev_name_head);
11182err_name:
11183	return -ENOMEM;
11184}
11185
11186/**
11187 *	netdev_drivername - network driver for the device
11188 *	@dev: network device
11189 *
11190 *	Determine network driver for device.
11191 */
11192const char *netdev_drivername(const struct net_device *dev)
11193{
11194	const struct device_driver *driver;
11195	const struct device *parent;
11196	const char *empty = "";
11197
11198	parent = dev->dev.parent;
11199	if (!parent)
11200		return empty;
11201
11202	driver = parent->driver;
11203	if (driver && driver->name)
11204		return driver->name;
11205	return empty;
11206}
11207
11208static void __netdev_printk(const char *level, const struct net_device *dev,
11209			    struct va_format *vaf)
11210{
11211	if (dev && dev->dev.parent) {
11212		dev_printk_emit(level[1] - '0',
11213				dev->dev.parent,
11214				"%s %s %s%s: %pV",
11215				dev_driver_string(dev->dev.parent),
11216				dev_name(dev->dev.parent),
11217				netdev_name(dev), netdev_reg_state(dev),
11218				vaf);
11219	} else if (dev) {
11220		printk("%s%s%s: %pV",
11221		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
11222	} else {
11223		printk("%s(NULL net_device): %pV", level, vaf);
11224	}
11225}
 
11226
11227void netdev_printk(const char *level, const struct net_device *dev,
11228		   const char *format, ...)
11229{
11230	struct va_format vaf;
11231	va_list args;
 
11232
11233	va_start(args, format);
11234
11235	vaf.fmt = format;
11236	vaf.va = &args;
11237
11238	__netdev_printk(level, dev, &vaf);
 
11239
11240	va_end(args);
11241}
11242EXPORT_SYMBOL(netdev_printk);
11243
11244#define define_netdev_printk_level(func, level)			\
11245void func(const struct net_device *dev, const char *fmt, ...)	\
11246{								\
 
11247	struct va_format vaf;					\
11248	va_list args;						\
11249								\
11250	va_start(args, fmt);					\
11251								\
11252	vaf.fmt = fmt;						\
11253	vaf.va = &args;						\
11254								\
11255	__netdev_printk(level, dev, &vaf);			\
 
11256								\
11257	va_end(args);						\
11258}								\
11259EXPORT_SYMBOL(func);
11260
11261define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11262define_netdev_printk_level(netdev_alert, KERN_ALERT);
11263define_netdev_printk_level(netdev_crit, KERN_CRIT);
11264define_netdev_printk_level(netdev_err, KERN_ERR);
11265define_netdev_printk_level(netdev_warn, KERN_WARNING);
11266define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11267define_netdev_printk_level(netdev_info, KERN_INFO);
11268
11269static void __net_exit netdev_exit(struct net *net)
11270{
11271	kfree(net->dev_name_head);
11272	kfree(net->dev_index_head);
11273	if (net != &init_net)
11274		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11275}
11276
11277static struct pernet_operations __net_initdata netdev_net_ops = {
11278	.init = netdev_init,
11279	.exit = netdev_exit,
11280};
11281
11282static void __net_exit default_device_exit_net(struct net *net)
11283{
11284	struct net_device *dev, *aux;
11285	/*
11286	 * Push all migratable network devices back to the
11287	 * initial network namespace
11288	 */
11289	ASSERT_RTNL();
11290	for_each_netdev_safe(net, dev, aux) {
11291		int err;
11292		char fb_name[IFNAMSIZ];
11293
11294		/* Ignore unmoveable devices (i.e. loopback) */
11295		if (dev->features & NETIF_F_NETNS_LOCAL)
11296			continue;
11297
11298		/* Leave virtual devices for the generic cleanup */
11299		if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11300			continue;
11301
11302		/* Push remaining network devices to init_net */
11303		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11304		if (netdev_name_in_use(&init_net, fb_name))
11305			snprintf(fb_name, IFNAMSIZ, "dev%%d");
11306		err = dev_change_net_namespace(dev, &init_net, fb_name);
11307		if (err) {
11308			pr_emerg("%s: failed to move %s to init_net: %d\n",
11309				 __func__, dev->name, err);
11310			BUG();
11311		}
11312	}
 
11313}
11314
11315static void __net_exit default_device_exit_batch(struct list_head *net_list)
11316{
11317	/* At exit all network devices most be removed from a network
11318	 * namespace.  Do this in the reverse order of registration.
11319	 * Do this across as many network namespaces as possible to
11320	 * improve batching efficiency.
11321	 */
11322	struct net_device *dev;
11323	struct net *net;
11324	LIST_HEAD(dev_kill_list);
11325
11326	rtnl_lock();
11327	list_for_each_entry(net, net_list, exit_list) {
11328		default_device_exit_net(net);
11329		cond_resched();
11330	}
11331
11332	list_for_each_entry(net, net_list, exit_list) {
11333		for_each_netdev_reverse(net, dev) {
11334			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11335				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11336			else
11337				unregister_netdevice_queue(dev, &dev_kill_list);
11338		}
11339	}
11340	unregister_netdevice_many(&dev_kill_list);
 
11341	rtnl_unlock();
11342}
11343
11344static struct pernet_operations __net_initdata default_device_ops = {
 
11345	.exit_batch = default_device_exit_batch,
11346};
11347
11348/*
11349 *	Initialize the DEV module. At boot time this walks the device list and
11350 *	unhooks any devices that fail to initialise (normally hardware not
11351 *	present) and leaves us with a valid list of present and active devices.
11352 *
11353 */
11354
11355/*
11356 *       This is called single threaded during boot, so no need
11357 *       to take the rtnl semaphore.
11358 */
11359static int __init net_dev_init(void)
11360{
11361	int i, rc = -ENOMEM;
11362
11363	BUG_ON(!dev_boot_phase);
11364
11365	if (dev_proc_init())
11366		goto out;
11367
11368	if (netdev_kobject_init())
11369		goto out;
11370
11371	INIT_LIST_HEAD(&ptype_all);
11372	for (i = 0; i < PTYPE_HASH_SIZE; i++)
11373		INIT_LIST_HEAD(&ptype_base[i]);
11374
11375	if (register_pernet_subsys(&netdev_net_ops))
11376		goto out;
11377
11378	/*
11379	 *	Initialise the packet receive queues.
11380	 */
11381
11382	for_each_possible_cpu(i) {
11383		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11384		struct softnet_data *sd = &per_cpu(softnet_data, i);
11385
11386		INIT_WORK(flush, flush_backlog);
11387
11388		skb_queue_head_init(&sd->input_pkt_queue);
11389		skb_queue_head_init(&sd->process_queue);
11390#ifdef CONFIG_XFRM_OFFLOAD
11391		skb_queue_head_init(&sd->xfrm_backlog);
11392#endif
11393		INIT_LIST_HEAD(&sd->poll_list);
 
11394		sd->output_queue_tailp = &sd->output_queue;
11395#ifdef CONFIG_RPS
11396		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
 
 
11397		sd->cpu = i;
11398#endif
11399		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
11400		spin_lock_init(&sd->defer_lock);
11401
11402		init_gro_hash(&sd->backlog);
11403		sd->backlog.poll = process_backlog;
11404		sd->backlog.weight = weight_p;
 
 
11405	}
11406
11407	dev_boot_phase = 0;
11408
11409	/* The loopback device is special if any other network devices
11410	 * is present in a network namespace the loopback device must
11411	 * be present. Since we now dynamically allocate and free the
11412	 * loopback device ensure this invariant is maintained by
11413	 * keeping the loopback device as the first device on the
11414	 * list of network devices.  Ensuring the loopback devices
11415	 * is the first device that appears and the last network device
11416	 * that disappears.
11417	 */
11418	if (register_pernet_device(&loopback_net_ops))
11419		goto out;
11420
11421	if (register_pernet_device(&default_device_ops))
11422		goto out;
11423
11424	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11425	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11426
11427	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11428				       NULL, dev_cpu_dead);
11429	WARN_ON(rc < 0);
11430	rc = 0;
11431out:
11432	return rc;
11433}
11434
11435subsys_initcall(net_dev_init);