dev.c - net/core/dev.c - Linux diff v3.5.6 - Bootlin Elixir Cross Referencer

 
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
 
  84#include <linux/mutex.h>
 
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
 
 
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
 
  99#include <linux/rtnetlink.h>
 100#include <linux/proc_fs.h>
 101#include <linux/seq_file.h>
 102#include <linux/stat.h>
 103#include <net/dst.h>
 
 104#include <net/pkt_sched.h>
 
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/kmod.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/wext.h>
 115#include <net/iw_handler.h>
 116#include <asm/current.h>
 117#include <linux/audit.h>
 118#include <linux/dmaengine.h>
 119#include <linux/err.h>
 120#include <linux/ctype.h>
 121#include <linux/if_arp.h>
 122#include <linux/if_vlan.h>
 123#include <linux/ip.h>
 124#include <net/ip.h>
 
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/net_tstamp.h>
 136#include <linux/static_key.h>
 137#include <net/flow_keys.h>
 
 
 
 
 
 
 
 
 
 
 
 
 138
 139#include "net-sysfs.h"
 140
 141/* Instead of increasing this, you should create a hash table. */
 142#define MAX_GRO_SKBS 8
 143
 144/* This should be increased if a protocol with a bigger head is added. */
 145#define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147/*
 148 *	The list of packet types we will receive (as opposed to discard)
 149 *	and the routines to invoke.
 150 *
 151 *	Why 16. Because with 16 the only overlap we get on a hash of the
 152 *	low nibble of the protocol value is RARP/SNAP/X.25.
 153 *
 154 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 155 *             sure which should go first, but I bet it won't make much
 156 *             difference if we are running VLANs.  The good news is that
 157 *             this protocol won't be in the list unless compiled in, so
 158 *             the average user (w/out VLANs) will not be adversely affected.
 159 *             --BLG
 160 *
 161 *		0800	IP
 162 *		8100    802.1Q VLAN
 163 *		0001	802.3
 164 *		0002	AX.25
 165 *		0004	802.2
 166 *		8035	RARP
 167 *		0005	SNAP
 168 *		0805	X.25
 169 *		0806	ARP
 170 *		8137	IPX
 171 *		0009	Localtalk
 172 *		86DD	IPv6
 173 */
 174
/*
 * Number of buckets in ptype_base. PTYPE_HASH_MASK is derived as SIZE - 1
 * and is ANDed with the host-order protocol value in ptype_head(), so the
 * size has to remain a power of two.
 */
#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

/* Held by dev_add_pack()/__dev_remove_pack() around ptype list updates. */
static DEFINE_SPINLOCK(ptype_lock);
/* Per-protocol handler lists, indexed via ptype_head(). */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL (see ptype_head()). */
static struct list_head ptype_all __read_mostly;	/* Taps */
 
 
 
 
 
 
 
 
 
 
 181
 182/*
 183 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 184 * semaphore.
 185 *
 186 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 187 *
 188 * Writers must hold the rtnl semaphore while they loop through the
 189 * dev_base_head list, and hold dev_base_lock for writing when they do the
 190 * actual updates.  This allows pure readers to access the list even
 191 * while a writer is preparing to update it.
 192 *
 193 * To put it another way, dev_base_lock is held for writing only to
 194 * protect against pure readers; the rtnl semaphore provides the
 195 * protection against other writers.
 196 *
 197 * See, for example usages, register_netdevice() and
 198 * unregister_netdevice(), which must be called with the rtnl
 199 * semaphore held.
 200 */
 201DEFINE_RWLOCK(dev_base_lock);
 202EXPORT_SYMBOL(dev_base_lock);
 203
 
 
 
 
 
 
 
 
 
 
 204static inline void dev_base_seq_inc(struct net *net)
 205{
 206	while (++net->dev_base_seq == 0);
 
 207}
 208
 209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 210{
 211	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 212
 213	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 214}
 215
 216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 217{
 218	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 219}
 220
 221static inline void rps_lock(struct softnet_data *sd)
 222{
 223#ifdef CONFIG_RPS
 224	spin_lock(&sd->input_pkt_queue.lock);
 225#endif
 226}
 227
 228static inline void rps_unlock(struct softnet_data *sd)
 229{
 230#ifdef CONFIG_RPS
 231	spin_unlock(&sd->input_pkt_queue.lock);
 232#endif
 233}
 234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 235/* Device list insertion */
 236static int list_netdevice(struct net_device *dev)
 237{
 238	struct net *net = dev_net(dev);
 239
 240	ASSERT_RTNL();
 241
 242	write_lock_bh(&dev_base_lock);
 243	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 244	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 245	hlist_add_head_rcu(&dev->index_hlist,
 246			   dev_index_hash(net, dev->ifindex));
 247	write_unlock_bh(&dev_base_lock);
 248
 249	dev_base_seq_inc(net);
 250
 251	return 0;
 252}
 253
 254/* Device list removal
 255 * caller must respect a RCU grace period before freeing/reusing dev
 256 */
 257static void unlist_netdevice(struct net_device *dev)
 258{
 259	ASSERT_RTNL();
 260
 261	/* Unlink dev from the device chain */
 262	write_lock_bh(&dev_base_lock);
 263	list_del_rcu(&dev->dev_list);
 264	hlist_del_rcu(&dev->name_hlist);
 265	hlist_del_rcu(&dev->index_hlist);
 266	write_unlock_bh(&dev_base_lock);
 267
 268	dev_base_seq_inc(dev_net(dev));
 269}
 270
 271/*
 272 *	Our notifier list
 273 */
 274
 275static RAW_NOTIFIER_HEAD(netdev_chain);
 276
 277/*
 278 *	Device drivers call our routines to queue packets here. We empty the
 279 *	queue in the local softnet handler.
 280 */
 281
 282DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 283EXPORT_PER_CPU_SYMBOL(softnet_data);
 284
 285#ifdef CONFIG_LOCKDEP
 286/*
 287 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 288 * according to dev->type
 289 */
/*
 * Device types that get their own lock class. netdev_lock_pos() searches
 * this table; a type that is not listed falls back to the last entry.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/*
 * Lock class names, indexed by the same position as netdev_lock_type[];
 * the two tables must be kept in the same order and length.
 */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lock class key per table entry, for the xmit and address-list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 326
 327static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 328{
 329	int i;
 330
 331	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 332		if (netdev_lock_type[i] == dev_type)
 333			return i;
 334	/* the last key is used by default */
 335	return ARRAY_SIZE(netdev_lock_type) - 1;
 336}
 337
 338static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 339						 unsigned short dev_type)
 340{
 341	int i;
 342
 343	i = netdev_lock_pos(dev_type);
 344	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 345				   netdev_lock_name[i]);
 346}
 347
 348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349{
 350	int i;
 351
 352	i = netdev_lock_pos(dev->type);
 353	lockdep_set_class_and_name(&dev->addr_list_lock,
 354				   &netdev_addr_lock_key[i],
 355				   netdev_lock_name[i]);
 356}
 357#else
/* !CONFIG_LOCKDEP: there is no lock class to assign, so this is a no-op. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}

/* !CONFIG_LOCKDEP: no-op counterpart for the address-list lock. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 365#endif
 366
 367/*******************************************************************************
 
 
 
 
 368
 369		Protocol management and registration routines
 370
 371*******************************************************************************/
 372
 373/*
 374 *	Add a protocol ID to the list. Now that the input handler is
 375 *	smarter we can dispense with all the messy stuff that used to be
 376 *	here.
 377 *
 378 *	BEWARE!!! Protocol handlers, mangling input packets,
 379 *	MUST BE last in hash buckets and checking protocol handlers
 380 *	MUST start from promiscuous ptype_all chain in net_bh.
 381 *	It is true now, do not change it.
 382 *	Explanation follows: if protocol handler, mangling packet, will
 383 *	be the first on list, it is not able to sense, that packet
 384 *	is cloned and should be copied-on-write, so that it will
 385 *	change it and subsequent readers will get broken packet.
 386 *							--ANK (980803)
 387 */
 388
 389static inline struct list_head *ptype_head(const struct packet_type *pt)
 390{
 391	if (pt->type == htons(ETH_P_ALL))
 392		return &ptype_all;
 393	else
 394		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 
 395}
 396
 397/**
 398 *	dev_add_pack - add packet handler
 399 *	@pt: packet type declaration
 400 *
 401 *	Add a protocol handler to the networking stack. The passed &packet_type
 402 *	is linked into kernel lists and may not be freed until it has been
 403 *	removed from the kernel lists.
 404 *
 405 *	This call does not sleep therefore it can not
 406 *	guarantee all CPU's that are in middle of receiving packets
 407 *	will see the new packet type (until the next received packet).
 408 */
 409
 410void dev_add_pack(struct packet_type *pt)
 411{
 412	struct list_head *head = ptype_head(pt);
 413
 414	spin_lock(&ptype_lock);
 415	list_add_rcu(&pt->list, head);
 416	spin_unlock(&ptype_lock);
 417}
 418EXPORT_SYMBOL(dev_add_pack);
 419
 420/**
 421 *	__dev_remove_pack	 - remove packet handler
 422 *	@pt: packet type declaration
 423 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is
 *	unlinked from the kernel lists before this function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed or reused until after all the CPUs have gone
 *	through a quiescent state.
 432 */
 433void __dev_remove_pack(struct packet_type *pt)
 434{
 435	struct list_head *head = ptype_head(pt);
 436	struct packet_type *pt1;
 437
 438	spin_lock(&ptype_lock);
 439
 440	list_for_each_entry(pt1, head, list) {
 441		if (pt == pt1) {
 442			list_del_rcu(&pt->list);
 443			goto out;
 444		}
 445	}
 446
 447	pr_warn("dev_remove_pack: %p not found\n", pt);
 448out:
 449	spin_unlock(&ptype_lock);
 450}
 451EXPORT_SYMBOL(__dev_remove_pack);
 452
 453/**
 454 *	dev_remove_pack	 - remove packet handler
 455 *	@pt: packet type declaration
 456 *
 457 *	Remove a protocol handler that was previously added to the kernel
 458 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 459 *	from the kernel lists and can be freed or reused once this function
 460 *	returns.
 461 *
 462 *	This call sleeps to guarantee that no CPU is looking at the packet
 463 *	type after return.
 464 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink the handler from its list first ... */
	__dev_remove_pack(pt);

	/* ... then call synchronize_net() before returning to the caller. */
	synchronize_net();
}
 471EXPORT_SYMBOL(dev_remove_pack);
 472
 473/******************************************************************************
 474
 475		      Device Boot-time Settings Routines
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 476
 477*******************************************************************************/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 478
 479/* Boot time configuration table */
 480static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 481
 482/**
 483 *	netdev_boot_setup_add	- add new setup entry
 484 *	@name: name of the device
 485 *	@map: configured settings for the device
 486 *
 487 *	Adds new setup entry to the dev_boot_setup list.  The function
 488 *	returns 0 on error and 1 on success.  This is a generic routine to
 489 *	all netdevices.
 490 */
 491static int netdev_boot_setup_add(char *name, struct ifmap *map)
 492{
 493	struct netdev_boot_setup *s;
 494	int i;
 495
 496	s = dev_boot_setup;
 497	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 498		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 499			memset(s[i].name, 0, sizeof(s[i].name));
 500			strlcpy(s[i].name, name, IFNAMSIZ);
 501			memcpy(&s[i].map, map, sizeof(s[i].map));
 502			break;
 503		}
 504	}
 505
 506	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 507}
 508
 509/**
 510 *	netdev_boot_setup_check	- check boot time settings
 511 *	@dev: the netdevice
 512 *
 513 * 	Check boot time settings for the device.
 514 *	The found settings are set for the device to be used
 515 *	later in the device probing.
 516 *	Returns 0 if no settings found, 1 if they are.
 517 */
 518int netdev_boot_setup_check(struct net_device *dev)
 519{
 520	struct netdev_boot_setup *s = dev_boot_setup;
 521	int i;
 522
 523	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 524		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 525		    !strcmp(dev->name, s[i].name)) {
 526			dev->irq 	= s[i].map.irq;
 527			dev->base_addr 	= s[i].map.base_addr;
 528			dev->mem_start 	= s[i].map.mem_start;
 529			dev->mem_end 	= s[i].map.mem_end;
 530			return 1;
 531		}
 532	}
 533	return 0;
 534}
 535EXPORT_SYMBOL(netdev_boot_setup_check);
 536
 537
 538/**
 539 *	netdev_boot_base	- get address from boot time settings
 540 *	@prefix: prefix for network device
 541 *	@unit: id for network device
 542 *
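 * 	Check boot time settings for the base address of the device named
 *	"<prefix><unit>".
 *	Returns 1 if a device with that name is already registered in
 *	init_net (so the caller should not probe for it), the configured
 *	base address if a boot time entry matches, or 0 if no settings
 *	are found.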
 547 */
 548unsigned long netdev_boot_base(const char *prefix, int unit)
 549{
 550	const struct netdev_boot_setup *s = dev_boot_setup;
 551	char name[IFNAMSIZ];
 552	int i;
 553
 554	sprintf(name, "%s%d", prefix, unit);
 555
 556	/*
 557	 * If device already registered then return base of 1
 558	 * to indicate not to probe for this interface
 559	 */
 560	if (__dev_get_by_name(&init_net, name))
 561		return 1;
 562
 563	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 564		if (!strcmp(name, s[i].name))
 565			return s[i].map.base_addr;
 566	return 0;
 567}
 568
 569/*
 570 * Saves at boot time configured settings for any netdevice.
 571 */
 572int __init netdev_boot_setup(char *str)
 573{
 574	int ints[5];
 575	struct ifmap map;
 576
 577	str = get_options(str, ARRAY_SIZE(ints), ints);
 578	if (!str || !*str)
 579		return 0;
 580
 581	/* Save settings */
 582	memset(&map, 0, sizeof(map));
 583	if (ints[0] > 0)
 584		map.irq = ints[1];
 585	if (ints[0] > 1)
 586		map.base_addr = ints[2];
 587	if (ints[0] > 2)
 588		map.mem_start = ints[3];
 589	if (ints[0] > 3)
 590		map.mem_end = ints[4];
 591
 592	/* Add new entry to the list */
 593	return netdev_boot_setup_add(str, &map);
 594}
 595
 596__setup("netdev=", netdev_boot_setup);
 597
 598/*******************************************************************************
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 599
 600			    Device Interface Subroutines
 
 
 
 
 601
 602*******************************************************************************/
 
 
 603
 604/**
 605 *	__dev_get_by_name	- find a device by its name
 606 *	@net: the applicable net namespace
 607 *	@name: name to find
 608 *
 609 *	Find an interface by name. Must be called under RTNL semaphore
 610 *	or @dev_base_lock. If the name is found a pointer to the device
 611 *	is returned. If the name is not found then %NULL is returned. The
 612 *	reference counters are not incremented so the caller must be
 613 *	careful with locks.
 614 */
 615
 616struct net_device *__dev_get_by_name(struct net *net, const char *name)
 617{
 618	struct hlist_node *p;
 619	struct net_device *dev;
 620	struct hlist_head *head = dev_name_hash(net, name);
 621
 622	hlist_for_each_entry(dev, p, head, name_hlist)
 623		if (!strncmp(dev->name, name, IFNAMSIZ))
 624			return dev;
 625
 626	return NULL;
 627}
 628EXPORT_SYMBOL(__dev_get_by_name);
 629
 630/**
 631 *	dev_get_by_name_rcu	- find a device by its name
 632 *	@net: the applicable net namespace
 633 *	@name: name to find
 634 *
 635 *	Find an interface by name.
 636 *	If the name is found a pointer to the device is returned.
 637 * 	If the name is not found then %NULL is returned.
 638 *	The reference counters are not incremented so the caller must be
 639 *	careful with locks. The caller must hold RCU lock.
 640 */
 641
 642struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 643{
 644	struct hlist_node *p;
 645	struct net_device *dev;
 646	struct hlist_head *head = dev_name_hash(net, name);
 647
 648	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 649		if (!strncmp(dev->name, name, IFNAMSIZ))
 650			return dev;
 651
 652	return NULL;
 
 653}
 654EXPORT_SYMBOL(dev_get_by_name_rcu);
 655
 656/**
 657 *	dev_get_by_name		- find a device by its name
 658 *	@net: the applicable net namespace
 659 *	@name: name to find
 660 *
 661 *	Find an interface by name. This can be called from any
 662 *	context and does its own locking. The returned handle has
 663 *	the usage count incremented and the caller must use dev_put() to
 664 *	release it when it is no longer needed. %NULL is returned if no
 665 *	matching device is found.
 666 */
 667
 668struct net_device *dev_get_by_name(struct net *net, const char *name)
 669{
 670	struct net_device *dev;
 671
 672	rcu_read_lock();
 673	dev = dev_get_by_name_rcu(net, name);
 674	if (dev)
 675		dev_hold(dev);
 676	rcu_read_unlock();
 677	return dev;
 678}
 679EXPORT_SYMBOL(dev_get_by_name);
 680
 681/**
 682 *	__dev_get_by_index - find a device by its ifindex
 683 *	@net: the applicable net namespace
 684 *	@ifindex: index of device
 685 *
 686 *	Search for an interface by index. Returns %NULL if the device
 687 *	is not found or a pointer to the device. The device has not
 688 *	had its reference counter increased so the caller must be careful
 689 *	about locking. The caller must hold either the RTNL semaphore
 690 *	or @dev_base_lock.
 691 */
 692
 693struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 694{
 695	struct hlist_node *p;
 696	struct net_device *dev;
 697	struct hlist_head *head = dev_index_hash(net, ifindex);
 698
 699	hlist_for_each_entry(dev, p, head, index_hlist)
 700		if (dev->ifindex == ifindex)
 701			return dev;
 702
 703	return NULL;
 704}
 705EXPORT_SYMBOL(__dev_get_by_index);
 706
 707/**
 708 *	dev_get_by_index_rcu - find a device by its ifindex
 709 *	@net: the applicable net namespace
 710 *	@ifindex: index of device
 711 *
 712 *	Search for an interface by index. Returns %NULL if the device
 713 *	is not found or a pointer to the device. The device has not
 714 *	had its reference counter increased so the caller must be careful
 715 *	about locking. The caller must hold RCU lock.
 716 */
 717
 718struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 719{
 720	struct hlist_node *p;
 721	struct net_device *dev;
 722	struct hlist_head *head = dev_index_hash(net, ifindex);
 723
 724	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 725		if (dev->ifindex == ifindex)
 726			return dev;
 727
 728	return NULL;
 729}
 730EXPORT_SYMBOL(dev_get_by_index_rcu);
 731
 732
 733/**
 734 *	dev_get_by_index - find a device by its ifindex
 735 *	@net: the applicable net namespace
 736 *	@ifindex: index of device
 737 *
 738 *	Search for an interface by index. Returns NULL if the device
 739 *	is not found or a pointer to the device. The device returned has
 740 *	had a reference added and the pointer is safe until the user calls
 741 *	dev_put to indicate they have finished with it.
 742 */
 743
 744struct net_device *dev_get_by_index(struct net *net, int ifindex)
 745{
 746	struct net_device *dev;
 747
 748	rcu_read_lock();
 749	dev = dev_get_by_index_rcu(net, ifindex);
 750	if (dev)
 751		dev_hold(dev);
 752	rcu_read_unlock();
 753	return dev;
 754}
 755EXPORT_SYMBOL(dev_get_by_index);
 756
 757/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 758 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 759 *	@net: the applicable net namespace
 760 *	@type: media type of device
 761 *	@ha: hardware address
 762 *
 763 *	Search for an interface by MAC address. Returns NULL if the device
 764 *	is not found or a pointer to the device.
 765 *	The caller must hold RCU or RTNL.
 766 *	The returned device has not had its ref count increased
 767 *	and the caller must therefore be careful about locking
 768 *
 769 */
 770
 771struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 772				       const char *ha)
 773{
 774	struct net_device *dev;
 775
 776	for_each_netdev_rcu(net, dev)
 777		if (dev->type == type &&
 778		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 779			return dev;
 780
 781	return NULL;
 782}
 783EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 784
 785struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 786{
 787	struct net_device *dev;
 788
 789	ASSERT_RTNL();
 790	for_each_netdev(net, dev)
 791		if (dev->type == type)
 792			return dev;
 793
 794	return NULL;
 795}
 796EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 797
 798struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 799{
 800	struct net_device *dev, *ret = NULL;
 801
 802	rcu_read_lock();
 803	for_each_netdev_rcu(net, dev)
 804		if (dev->type == type) {
 805			dev_hold(dev);
 806			ret = dev;
 807			break;
 808		}
 809	rcu_read_unlock();
 810	return ret;
 811}
 812EXPORT_SYMBOL(dev_getfirstbyhwtype);
 813
 814/**
 815 *	dev_get_by_flags_rcu - find any device with given flags
 816 *	@net: the applicable net namespace
 817 *	@if_flags: IFF_* values
 818 *	@mask: bitmask of bits in if_flags to check
 819 *
 820 *	Search for any interface with the given flags. Returns NULL if a device
 821 *	is not found or a pointer to the device. Must be called inside
 822 *	rcu_read_lock(), and result refcount is unchanged.
 823 */
 824
 825struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 826				    unsigned short mask)
 827{
 828	struct net_device *dev, *ret;
 829
 
 
 830	ret = NULL;
 831	for_each_netdev_rcu(net, dev) {
 832		if (((dev->flags ^ if_flags) & mask) == 0) {
 833			ret = dev;
 834			break;
 835		}
 836	}
 837	return ret;
 838}
 839EXPORT_SYMBOL(dev_get_by_flags_rcu);
 840
 841/**
 842 *	dev_valid_name - check if name is okay for network device
 843 *	@name: name string
 844 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 848 */
 849bool dev_valid_name(const char *name)
 850{
 851	if (*name == '\0')
 852		return false;
 853	if (strlen(name) >= IFNAMSIZ)
 854		return false;
 855	if (!strcmp(name, ".") || !strcmp(name, ".."))
 856		return false;
 857
 858	while (*name) {
 859		if (*name == '/' || isspace(*name))
 860			return false;
 861		name++;
 862	}
 863	return true;
 864}
 865EXPORT_SYMBOL(dev_valid_name);
 866
 867/**
 868 *	__dev_alloc_name - allocate a name for a device
 869 *	@net: network namespace to allocate the device name in
 870 *	@name: name format string
 871 *	@buf:  scratch buffer and result name string
 872 *
 873 *	Passed a format string - eg "lt%d" it will try and find a suitable
 874 *	id. It scans list of devices to build up a free map, then chooses
 875 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 876 *	while allocating the name and adding the device in order to avoid
 877 *	duplicates.
 878 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 879 *	Returns the number of the unit assigned or a negative errno code.
 880 */
 881
static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	/* One page is used as the in-use bitmap, so 8 candidates per byte. */
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	/* Look for a '%' within the first IFNAMSIZ-1 characters of @name. */
	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be exactly one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			/* Skip devices whose name yields no conversion. */
			if (!sscanf(d->name, name, &i))
				continue;
			/* Unit numbers outside the bitmap are not tracked. */
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		/* Lowest unit number not marked as in use above. */
		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	/* Without a '%' in @name, i is still 0 and @name is formatted as is. */
	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	/* The formatted name must not belong to an existing device. */
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
 932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 933/**
 934 *	dev_alloc_name - allocate a name for a device
 935 *	@dev: device
 936 *	@name: name format string
 937 *
 938 *	Passed a format string - eg "lt%d" it will try and find a suitable
 939 *	id. It scans list of devices to build up a free map, then chooses
 940 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 941 *	while allocating the name and adding the device in order to avoid
 942 *	duplicates.
 943 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 944 *	Returns the number of the unit assigned or a negative errno code.
 945 */
 946
 947int dev_alloc_name(struct net_device *dev, const char *name)
 948{
 949	char buf[IFNAMSIZ];
 950	struct net *net;
 951	int ret;
 952
 953	BUG_ON(!dev_net(dev));
 954	net = dev_net(dev);
 955	ret = __dev_alloc_name(net, name, buf);
 956	if (ret >= 0)
 957		strlcpy(dev->name, buf, IFNAMSIZ);
 958	return ret;
 959}
 960EXPORT_SYMBOL(dev_alloc_name);
 961
 962static int dev_get_valid_name(struct net_device *dev, const char *name)
 
 963{
 964	struct net *net;
 965
 966	BUG_ON(!dev_net(dev));
 967	net = dev_net(dev);
 968
 969	if (!dev_valid_name(name))
 970		return -EINVAL;
 971
 972	if (strchr(name, '%'))
 973		return dev_alloc_name(dev, name);
 974	else if (__dev_get_by_name(net, name))
 975		return -EEXIST;
 976	else if (dev->name != name)
 977		strlcpy(dev->name, name, IFNAMSIZ);
 978
 979	return 0;
 980}
 981
 982/**
 983 *	dev_change_name - change name of a device
 984 *	@dev: device
 985 *	@newname: name (or format string) must be at least IFNAMSIZ
 986 *
 987 *	Change name of a device, can pass format strings "eth%d".
 988 *	for wildcarding.
 989 */
 990int dev_change_name(struct net_device *dev, const char *newname)
 991{
 
 992	char oldname[IFNAMSIZ];
 993	int err = 0;
 994	int ret;
 995	struct net *net;
 996
 997	ASSERT_RTNL();
 998	BUG_ON(!dev_net(dev));
 999
1000	net = dev_net(dev);
1001	if (dev->flags & IFF_UP)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1002		return -EBUSY;
1003
1004	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 
 
 
1005		return 0;
 
1006
1007	memcpy(oldname, dev->name, IFNAMSIZ);
1008
1009	err = dev_get_valid_name(dev, newname);
1010	if (err < 0)
 
1011		return err;
 
 
 
 
 
 
 
1012
1013rollback:
1014	ret = device_rename(&dev->dev, dev->name);
1015	if (ret) {
1016		memcpy(dev->name, oldname, IFNAMSIZ);
 
 
1017		return ret;
1018	}
1019
 
 
 
 
1020	write_lock_bh(&dev_base_lock);
1021	hlist_del_rcu(&dev->name_hlist);
1022	write_unlock_bh(&dev_base_lock);
1023
1024	synchronize_rcu();
1025
1026	write_lock_bh(&dev_base_lock);
1027	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1028	write_unlock_bh(&dev_base_lock);
1029
1030	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1031	ret = notifier_to_errno(ret);
1032
1033	if (ret) {
1034		/* err >= 0 after dev_alloc_name() or stores the first errno */
1035		if (err >= 0) {
1036			err = ret;
 
1037			memcpy(dev->name, oldname, IFNAMSIZ);
 
 
 
1038			goto rollback;
1039		} else {
1040			pr_err("%s: name change rollback failed: %d\n",
1041			       dev->name, ret);
1042		}
1043	}
1044
1045	return err;
1046}
1047
1048/**
1049 *	dev_set_alias - change ifalias of a device
1050 *	@dev: device
1051 *	@alias: name up to IFALIASZ
1052 *	@len: limit of bytes to copy from info
1053 *
1054 *	Set ifalias for a device,
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058	char *new_ifalias;
1059
1060	ASSERT_RTNL();
1061
1062	if (len >= IFALIASZ)
1063		return -EINVAL;
1064
1065	if (!len) {
1066		if (dev->ifalias) {
1067			kfree(dev->ifalias);
1068			dev->ifalias = NULL;
1069		}
1070		return 0;
 
1071	}
1072
1073	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1074	if (!new_ifalias)
1075		return -ENOMEM;
1076	dev->ifalias = new_ifalias;
 
 
 
1077
1078	strlcpy(dev->ifalias, alias, len+1);
1079	return len;
1080}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1081
 
 
1082
1083/**
1084 *	netdev_features_change - device changes features
1085 *	@dev: device to cause notification
1086 *
1087 *	Called to indicate a device has changed features.
1088 */
1089void netdev_features_change(struct net_device *dev)
1090{
1091	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1092}
1093EXPORT_SYMBOL(netdev_features_change);
1094
1095/**
1096 *	netdev_state_change - device changes state
1097 *	@dev: device to cause notification
1098 *
1099 *	Called to indicate a device has changed state. This function calls
1100 *	the notifier chains for netdev_chain and sends a NEWLINK message
1101 *	to the routing socket.
1102 */
1103void netdev_state_change(struct net_device *dev)
1104{
1105	if (dev->flags & IFF_UP) {
1106		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1107		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 
 
 
 
 
1108	}
1109}
1110EXPORT_SYMBOL(netdev_state_change);
1111
/*
 * Send @event for @dev down the netdevice notifier chain and hand the
 * chain's return value back to the caller unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int chain_ret = call_netdevice_notifiers(event, dev);

	return chain_ret;
}
EXPORT_SYMBOL(netdev_bonding_change);
1117
1118/**
1119 *	dev_load 	- load a network module
1120 *	@net: the applicable net namespace
1121 *	@name: name of interface
1122 *
1123 *	If a network interface is not present and the process has suitable
1124 *	privileges this function loads the module. If module loading is not
1125 *	available in this kernel then it becomes a nop.
 
 
1126 */
1127
1128void dev_load(struct net *net, const char *name)
1129{
1130	struct net_device *dev;
1131	int no_module;
1132
1133	rcu_read_lock();
1134	dev = dev_get_by_name_rcu(net, name);
1135	rcu_read_unlock();
1136
1137	no_module = !dev;
1138	if (no_module && capable(CAP_NET_ADMIN))
1139		no_module = request_module("netdev-%s", name);
1140	if (no_module && capable(CAP_SYS_MODULE)) {
1141		if (!request_module("%s", name))
1142			pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1143				name);
1144	}
1145}
1146EXPORT_SYMBOL(dev_load);
1147
1148static int __dev_open(struct net_device *dev)
1149{
1150	const struct net_device_ops *ops = dev->netdev_ops;
1151	int ret;
1152
1153	ASSERT_RTNL();
1154
1155	if (!netif_device_present(dev))
1156		return -ENODEV;
 
 
 
 
 
 
 
 
 
 
 
1157
1158	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1159	ret = notifier_to_errno(ret);
1160	if (ret)
1161		return ret;
1162
1163	set_bit(__LINK_STATE_START, &dev->state);
1164
1165	if (ops->ndo_validate_addr)
1166		ret = ops->ndo_validate_addr(dev);
1167
1168	if (!ret && ops->ndo_open)
1169		ret = ops->ndo_open(dev);
1170
 
 
1171	if (ret)
1172		clear_bit(__LINK_STATE_START, &dev->state);
1173	else {
1174		dev->flags |= IFF_UP;
1175		net_dmaengine_get();
1176		dev_set_rx_mode(dev);
1177		dev_activate(dev);
1178		add_device_randomness(dev->dev_addr, dev->addr_len);
1179	}
1180
1181	return ret;
1182}
1183
1184/**
1185 *	dev_open	- prepare an interface for use.
1186 *	@dev:	device to open
 
1187 *
1188 *	Takes a device from down to up state. The device's private open
1189 *	function is invoked and then the multicast lists are loaded. Finally
1190 *	the device is moved into the up state and a %NETDEV_UP message is
1191 *	sent to the netdev notifier chain.
1192 *
1193 *	Calling this function on an active interface is a nop. On a failure
1194 *	a negative errno code is returned.
1195 */
1196int dev_open(struct net_device *dev)
1197{
1198	int ret;
1199
1200	if (dev->flags & IFF_UP)
1201		return 0;
1202
1203	ret = __dev_open(dev);
1204	if (ret < 0)
1205		return ret;
1206
1207	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1208	call_netdevice_notifiers(NETDEV_UP, dev);
1209
1210	return ret;
1211}
1212EXPORT_SYMBOL(dev_open);
1213
1214static int __dev_close_many(struct list_head *head)
1215{
1216	struct net_device *dev;
1217
1218	ASSERT_RTNL();
1219	might_sleep();
1220
1221	list_for_each_entry(dev, head, unreg_list) {
 
 
 
1222		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1223
1224		clear_bit(__LINK_STATE_START, &dev->state);
1225
1226		/* Synchronize to scheduled poll. We cannot touch poll list, it
1227		 * can be even on different cpu. So just clear netif_running().
1228		 *
1229		 * dev->stop() will invoke napi_disable() on all of it's
1230		 * napi_struct instances on this device.
1231		 */
1232		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1233	}
1234
1235	dev_deactivate_many(head);
1236
1237	list_for_each_entry(dev, head, unreg_list) {
1238		const struct net_device_ops *ops = dev->netdev_ops;
1239
1240		/*
1241		 *	Call the device specific close. This cannot fail.
1242		 *	Only if device is UP
1243		 *
1244		 *	We allow it to be called even after a DETACH hot-plug
1245		 *	event.
1246		 */
1247		if (ops->ndo_stop)
1248			ops->ndo_stop(dev);
1249
1250		dev->flags &= ~IFF_UP;
1251		net_dmaengine_put();
1252	}
1253
1254	return 0;
1255}
1256
1257static int __dev_close(struct net_device *dev)
1258{
1259	int retval;
1260	LIST_HEAD(single);
1261
1262	list_add(&dev->unreg_list, &single);
1263	retval = __dev_close_many(&single);
1264	list_del(&single);
1265	return retval;
1266}
1267
1268static int dev_close_many(struct list_head *head)
1269{
1270	struct net_device *dev, *tmp;
1271	LIST_HEAD(tmp_list);
1272
1273	list_for_each_entry_safe(dev, tmp, head, unreg_list)
 
1274		if (!(dev->flags & IFF_UP))
1275			list_move(&dev->unreg_list, &tmp_list);
1276
1277	__dev_close_many(head);
1278
1279	list_for_each_entry(dev, head, unreg_list) {
1280		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1281		call_netdevice_notifiers(NETDEV_DOWN, dev);
 
 
1282	}
1283
1284	/* rollback_registered_many needs the complete original list */
1285	list_splice(&tmp_list, head);
1286	return 0;
1287}
 
1288
1289/**
1290 *	dev_close - shutdown an interface.
1291 *	@dev: device to shutdown
1292 *
1293 *	This function moves an active device into down state. A
1294 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1295 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1296 *	chain.
1297 */
1298int dev_close(struct net_device *dev)
1299{
1300	if (dev->flags & IFF_UP) {
1301		LIST_HEAD(single);
1302
1303		list_add(&dev->unreg_list, &single);
1304		dev_close_many(&single);
1305		list_del(&single);
1306	}
1307	return 0;
1308}
1309EXPORT_SYMBOL(dev_close);
1310
1311
1312/**
1313 *	dev_disable_lro - disable Large Receive Offload on a device
1314 *	@dev: device
1315 *
1316 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1317 *	called under RTNL.  This is needed if received packets may be
1318 *	forwarded to another interface.
1319 */
1320void dev_disable_lro(struct net_device *dev)
1321{
1322	/*
1323	 * If we're trying to disable lro on a vlan device
1324	 * use the underlying physical device instead
1325	 */
1326	if (is_vlan_dev(dev))
1327		dev = vlan_dev_real_dev(dev);
1328
1329	dev->wanted_features &= ~NETIF_F_LRO;
1330	netdev_update_features(dev);
1331
1332	if (unlikely(dev->features & NETIF_F_LRO))
1333		netdev_WARN(dev, "failed to disable LRO!\n");
 
 
 
1334}
1335EXPORT_SYMBOL(dev_disable_lro);
1336
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1337
/*
 * While this is non-zero, register_netdevice_notifier() adds the notifier
 * to the chain but does not replay REGISTER/UP events to it.
 * NOTE(review): presumably cleared once boot-time init is done - confirm.
 */
static int dev_boot_phase = 1;
1339
1340/**
1341 *	register_netdevice_notifier - register a network notifier block
1342 *	@nb: notifier
1343 *
1344 *	Register a notifier to be called when network device events occur.
1345 *	The notifier passed is linked into the kernel structures and must
1346 *	not be reused until it has been unregistered. A negative errno code
1347 *	is returned on a failure.
1348 *
1349 * 	When registered all registration and up events are replayed
1350 *	to the new notifier to allow device to have a race free
1351 *	view of the network device list.
1352 */
1353
1354int register_netdevice_notifier(struct notifier_block *nb)
1355{
1356	struct net_device *dev;
1357	struct net_device *last;
1358	struct net *net;
1359	int err;
1360
 
 
1361	rtnl_lock();
1362	err = raw_notifier_chain_register(&netdev_chain, nb);
1363	if (err)
1364		goto unlock;
1365	if (dev_boot_phase)
1366		goto unlock;
1367	for_each_net(net) {
1368		for_each_netdev(net, dev) {
1369			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1370			err = notifier_to_errno(err);
1371			if (err)
1372				goto rollback;
1373
1374			if (!(dev->flags & IFF_UP))
1375				continue;
1376
1377			nb->notifier_call(nb, NETDEV_UP, dev);
1378		}
1379	}
1380
1381unlock:
1382	rtnl_unlock();
 
1383	return err;
1384
1385rollback:
1386	last = dev;
1387	for_each_net(net) {
1388		for_each_netdev(net, dev) {
1389			if (dev == last)
1390				goto outroll;
1391
1392			if (dev->flags & IFF_UP) {
1393				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1394				nb->notifier_call(nb, NETDEV_DOWN, dev);
1395			}
1396			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1397			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1398		}
1399	}
1400
1401outroll:
1402	raw_notifier_chain_unregister(&netdev_chain, nb);
1403	goto unlock;
1404}
1405EXPORT_SYMBOL(register_netdevice_notifier);
1406
1407/**
1408 *	unregister_netdevice_notifier - unregister a network notifier block
1409 *	@nb: notifier
1410 *
1411 *	Unregister a notifier previously registered by
1412 *	register_netdevice_notifier(). The notifier is unlinked into the
1413 *	kernel structures and may then be reused. A negative errno code
1414 *	is returned on a failure.
1415 *
1416 * 	After unregistering unregister and down device events are synthesized
1417 *	for all devices on the device list to the removed notifier to remove
1418 *	the need for special case cleanup code.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
1423	struct net_device *dev;
1424	struct net *net;
1425	int err;
1426
 
 
1427	rtnl_lock();
1428	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1429	if (err)
1430		goto unlock;
1431
1432	for_each_net(net) {
1433		for_each_netdev(net, dev) {
1434			if (dev->flags & IFF_UP) {
1435				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1436				nb->notifier_call(nb, NETDEV_DOWN, dev);
1437			}
1438			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1439			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1440		}
1441	}
1442unlock:
1443	rtnl_unlock();
 
1444	return err;
1445}
1446EXPORT_SYMBOL(unregister_netdevice_notifier);
1447
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1448/**
1449 *	call_netdevice_notifiers - call all network notifier blocks
1450 *      @val: value passed unmodified to notifier function
1451 *      @dev: net_device pointer passed unmodified to notifier function
1452 *
1453 *	Call all network notifier blocks.  Parameters and return value
1454 *	are as for raw_notifier_call_chain().
1455 */
1456
1457int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1458{
1459	ASSERT_RTNL();
1460	return raw_notifier_call_chain(&netdev_chain, val, dev);
1461}
1462EXPORT_SYMBOL(call_netdevice_notifiers);
1463
1464static struct static_key netstamp_needed __read_mostly;
1465#ifdef HAVE_JUMP_LABEL
1466/* We are not allowed to call static_key_slow_dec() from irq context
1467 * If net_disable_timestamp() is called from irq context, defer the
1468 * static_key_slow_dec() calls.
 
 
 
1469 */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1470static atomic_t netstamp_needed_deferred;
 
 
 
 
 
 
 
 
 
 
 
 
 
1471#endif
1472
1473void net_enable_timestamp(void)
1474{
1475#ifdef HAVE_JUMP_LABEL
1476	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1477
1478	if (deferred) {
1479		while (--deferred)
1480			static_key_slow_dec(&netstamp_needed);
1481		return;
 
 
1482	}
 
 
 
 
1483#endif
1484	WARN_ON(in_interrupt());
1485	static_key_slow_inc(&netstamp_needed);
1486}
1487EXPORT_SYMBOL(net_enable_timestamp);
1488
1489void net_disable_timestamp(void)
1490{
1491#ifdef HAVE_JUMP_LABEL
1492	if (in_interrupt()) {
1493		atomic_inc(&netstamp_needed_deferred);
1494		return;
 
 
 
 
 
1495	}
 
 
 
 
1496#endif
1497	static_key_slow_dec(&netstamp_needed);
1498}
1499EXPORT_SYMBOL(net_disable_timestamp);
1500
1501static inline void net_timestamp_set(struct sk_buff *skb)
1502{
1503	skb->tstamp.tv64 = 0;
1504	if (static_key_false(&netstamp_needed))
1505		__net_timestamp(skb);
1506}
1507
/*
 * net_timestamp_check - stamp @SKB if timestamping is on and it is needed
 * @COND: extra condition that must hold for the stamp to be taken
 * @SKB:  buffer to stamp; left alone if its tstamp.tv64 is already non-zero
 *
 * Nothing is evaluated beyond the static key test while netstamp_needed
 * is disabled.  The body is wrapped in do { } while (0) so the macro acts
 * as a single statement: it needs its trailing semicolon and cannot
 * capture an "else" belonging to the caller.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1513
1514static int net_hwtstamp_validate(struct ifreq *ifr)
1515{
1516	struct hwtstamp_config cfg;
1517	enum hwtstamp_tx_types tx_type;
1518	enum hwtstamp_rx_filters rx_filter;
1519	int tx_type_valid = 0;
1520	int rx_filter_valid = 0;
1521
1522	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1523		return -EFAULT;
1524
1525	if (cfg.flags) /* reserved for future extensions */
1526		return -EINVAL;
1527
1528	tx_type = cfg.tx_type;
1529	rx_filter = cfg.rx_filter;
1530
1531	switch (tx_type) {
1532	case HWTSTAMP_TX_OFF:
1533	case HWTSTAMP_TX_ON:
1534	case HWTSTAMP_TX_ONESTEP_SYNC:
1535		tx_type_valid = 1;
1536		break;
1537	}
1538
1539	switch (rx_filter) {
1540	case HWTSTAMP_FILTER_NONE:
1541	case HWTSTAMP_FILTER_ALL:
1542	case HWTSTAMP_FILTER_SOME:
1543	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1544	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1545	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1546	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1547	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1548	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1549	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1550	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1551	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1552	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1553	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1554	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1555		rx_filter_valid = 1;
1556		break;
1557	}
1558
1559	if (!tx_type_valid || !rx_filter_valid)
1560		return -ERANGE;
1561
1562	return 0;
1563}
1564
1565static inline bool is_skb_forwardable(struct net_device *dev,
1566				      struct sk_buff *skb)
1567{
1568	unsigned int len;
1569
1570	if (!(dev->flags & IFF_UP))
1571		return false;
1572
1573	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1574	if (skb->len <= len)
1575		return true;
1576
1577	/* if TSO is enabled, we don't care about the length as the packet
1578	 * could be forwarded without being segmented before
1579	 */
1580	if (skb_is_gso(skb))
1581		return true;
1582
1583	return false;
1584}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1585
1586/**
1587 * dev_forward_skb - loopback an skb to another netif
1588 *
1589 * @dev: destination network device
1590 * @skb: buffer to forward
1591 *
1592 * return values:
1593 *	NET_RX_SUCCESS	(no congestion)
1594 *	NET_RX_DROP     (packet was dropped, but freed)
1595 *
1596 * dev_forward_skb can be used for injecting an skb from the
1597 * start_xmit function of one device into the receive queue
1598 * of another device.
1599 *
1600 * The receiving device may be in another namespace, so
1601 * we have to clear all information in the skb that could
1602 * impact namespace isolation.
1603 */
1604int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1605{
1606	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1607		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1608			atomic_long_inc(&dev->rx_dropped);
1609			kfree_skb(skb);
1610			return NET_RX_DROP;
1611		}
1612	}
1613
1614	skb_orphan(skb);
1615	nf_reset(skb);
1616
1617	if (unlikely(!is_skb_forwardable(dev, skb))) {
1618		atomic_long_inc(&dev->rx_dropped);
1619		kfree_skb(skb);
1620		return NET_RX_DROP;
1621	}
1622	skb->skb_iif = 0;
1623	skb->dev = dev;
1624	skb_dst_drop(skb);
1625	skb->tstamp.tv64 = 0;
1626	skb->pkt_type = PACKET_HOST;
1627	skb->protocol = eth_type_trans(skb, dev);
1628	skb->mark = 0;
1629	secpath_reset(skb);
1630	nf_reset(skb);
1631	return netif_rx(skb);
1632}
1633EXPORT_SYMBOL_GPL(dev_forward_skb);
1634
1635static inline int deliver_skb(struct sk_buff *skb,
1636			      struct packet_type *pt_prev,
1637			      struct net_device *orig_dev)
1638{
1639	atomic_inc(&skb->users);
 
 
1640	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1641}
1642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1643static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1644{
1645	if (ptype->af_packet_priv == NULL)
1646		return false;
1647
1648	if (ptype->id_match)
1649		return ptype->id_match(ptype, skb->sk);
1650	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1651		return true;
1652
1653	return false;
1654}
1655
 
 
 
 
 
 
 
 
 
 
 
1656/*
1657 *	Support routine. Sends outgoing frames to any network
1658 *	taps currently in use.
1659 */
1660
1661static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1662{
1663	struct packet_type *ptype;
1664	struct sk_buff *skb2 = NULL;
1665	struct packet_type *pt_prev = NULL;
 
1666
1667	rcu_read_lock();
1668	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 
 
 
 
1669		/* Never send packets back to the socket
1670		 * they originated from - MvS (miquels@drinkel.ow.org)
1671		 */
1672		if ((ptype->dev == dev || !ptype->dev) &&
1673		    (!skb_loop_sk(ptype, skb))) {
1674			if (pt_prev) {
1675				deliver_skb(skb2, pt_prev, skb->dev);
1676				pt_prev = ptype;
1677				continue;
1678			}
1679
1680			skb2 = skb_clone(skb, GFP_ATOMIC);
1681			if (!skb2)
1682				break;
 
 
1683
1684			net_timestamp_set(skb2);
 
 
 
 
 
 
 
 
 
 
 
1685
1686			/* skb->nh should be correctly
1687			   set by sender, so that the second statement is
1688			   just protection against buggy protocols.
1689			 */
1690			skb_reset_mac_header(skb2);
 
 
1691
1692			if (skb_network_header(skb2) < skb2->data ||
1693			    skb2->network_header > skb2->tail) {
1694				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1695						     ntohs(skb2->protocol),
1696						     dev->name);
1697				skb_reset_network_header(skb2);
1698			}
1699
1700			skb2->transport_header = skb2->network_header;
1701			skb2->pkt_type = PACKET_OUTGOING;
1702			pt_prev = ptype;
1703		}
 
 
 
 
 
 
1704	}
1705	if (pt_prev)
1706		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1707	rcu_read_unlock();
1708}
 
1709
1710/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 
1711 * @dev: Network device
1712 * @txq: number of queues available
1713 *
1714 * If real_num_tx_queues is changed the tc mappings may no longer be
1715 * valid. To resolve this verify the tc mapping remains valid and if
1716 * not NULL the mapping. With no priorities mapping to this
1717 * offset/count pair it will no longer be used. In the worst case TC0
1718 * is invalid nothing can be done so disable priority mappings. If is
1719 * expected that drivers will fix this mapping if they can before
1720 * calling netif_set_real_num_tx_queues.
1721 */
1722static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1723{
1724	int i;
1725	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1726
1727	/* If TC0 is invalidated disable TC mapping */
1728	if (tc->offset + tc->count > txq) {
1729		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1730		dev->num_tc = 0;
1731		return;
1732	}
1733
1734	/* Invalidated prio to tc mappings set to TC0 */
1735	for (i = 1; i < TC_BITMASK + 1; i++) {
1736		int q = netdev_get_prio_tc_map(dev, i);
1737
1738		tc = &dev->tc_to_txq[q];
1739		if (tc->offset + tc->count > txq) {
1740			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1741				i, q);
1742			netdev_set_prio_tc_map(dev, i, 0);
1743		}
1744	}
1745}
1746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1747/*
1748 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1749 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1750 */
1751int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1752{
 
1753	int rc;
1754
 
 
1755	if (txq < 1 || txq > dev->num_tx_queues)
1756		return -EINVAL;
1757
1758	if (dev->reg_state == NETREG_REGISTERED ||
1759	    dev->reg_state == NETREG_UNREGISTERING) {
1760		ASSERT_RTNL();
1761
1762		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1763						  txq);
1764		if (rc)
1765			return rc;
1766
1767		if (dev->num_tc)
1768			netif_setup_tc(dev, txq);
1769
1770		if (txq < dev->real_num_tx_queues)
 
 
 
1771			qdisc_reset_all_tx_gt(dev, txq);
 
 
 
 
 
 
1772	}
1773
1774	dev->real_num_tx_queues = txq;
1775	return 0;
1776}
1777EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1778
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code: -EINVAL if @rxq is 0 or larger than
 *	dev->num_rx_queues, or the error from updating the queue kobjects
 *	of a registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int rc;

		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1811
1812static inline void __netif_reschedule(struct Qdisc *q)
 
 
 
 
 
 
 
 
 
 
 
 
 
1813{
1814	struct softnet_data *sd;
1815	unsigned long flags;
1816
1817	local_irq_save(flags);
1818	sd = &__get_cpu_var(softnet_data);
1819	q->next_sched = NULL;
1820	*sd->output_queue_tailp = q;
1821	sd->output_queue_tailp = &q->next_sched;
1822	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1823	local_irq_restore(flags);
1824}
1825
1826void __netif_schedule(struct Qdisc *q)
1827{
1828	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1829		__netif_reschedule(q);
1830}
1831EXPORT_SYMBOL(__netif_schedule);
1832
1833void dev_kfree_skb_irq(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
1834{
1835	if (atomic_dec_and_test(&skb->users)) {
1836		struct softnet_data *sd;
1837		unsigned long flags;
1838
1839		local_irq_save(flags);
1840		sd = &__get_cpu_var(softnet_data);
1841		skb->next = sd->completion_queue;
1842		sd->completion_queue = skb;
1843		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1844		local_irq_restore(flags);
 
 
 
 
 
 
 
 
 
1845	}
1846}
1847EXPORT_SYMBOL(dev_kfree_skb_irq);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1848
/*
 * Free @skb from any context: use dev_kfree_skb_irq() in hard irq
 * context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1857
1858
1859/**
1860 * netif_device_detach - mark device as removed
1861 * @dev: network device
1862 *
1863 * Mark device as removed from system and therefore no longer available.
1864 */
1865void netif_device_detach(struct net_device *dev)
1866{
1867	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1868	    netif_running(dev)) {
1869		netif_tx_stop_all_queues(dev);
1870	}
1871}
1872EXPORT_SYMBOL(netif_device_detach);
1873
1874/**
1875 * netif_device_attach - mark device as attached
1876 * @dev: network device
1877 *
1878 * Mark device as attached from system and restart if needed.
1879 */
1880void netif_device_attach(struct net_device *dev)
1881{
1882	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1883	    netif_running(dev)) {
1884		netif_tx_wake_all_queues(dev);
1885		__netdev_watchdog_up(dev);
1886	}
1887}
1888EXPORT_SYMBOL(netif_device_attach);
1889
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1890static void skb_warn_bad_offload(const struct sk_buff *skb)
1891{
1892	static const netdev_features_t null_features = 0;
1893	struct net_device *dev = skb->dev;
1894	const char *driver = "";
1895
1896	if (dev && dev->dev.parent)
1897		driver = dev_driver_string(dev->dev.parent);
1898
1899	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1900	     "gso_type=%d ip_summed=%d\n",
1901	     driver, dev ? &dev->features : &null_features,
1902	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
1903	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1904	     skb_shinfo(skb)->gso_type, skb->ip_summed);
 
 
 
 
1905}
1906
1907/*
1908 * Invalidate hardware checksum when packet is to be mangled, and
1909 * complete checksum manually on outgoing path.
1910 */
1911int skb_checksum_help(struct sk_buff *skb)
1912{
1913	__wsum csum;
1914	int ret = 0, offset;
1915
1916	if (skb->ip_summed == CHECKSUM_COMPLETE)
1917		goto out_set_summed;
1918
1919	if (unlikely(skb_shinfo(skb)->gso_size)) {
1920		skb_warn_bad_offload(skb);
1921		return -EINVAL;
1922	}
1923
 
 
 
 
 
 
 
 
 
1924	offset = skb_checksum_start_offset(skb);
1925	BUG_ON(offset >= skb_headlen(skb));
1926	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1927
1928	offset += skb->csum_offset;
1929	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1930
1931	if (skb_cloned(skb) &&
1932	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1933		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1934		if (ret)
1935			goto out;
1936	}
 
 
 
 
 
 
1937
1938	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1939out_set_summed:
 
 
 
 
 
 
1940	skb->ip_summed = CHECKSUM_NONE;
 
1941out:
1942	return ret;
1943}
1944EXPORT_SYMBOL(skb_checksum_help);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1945
1946/**
1947 *	skb_gso_segment - Perform segmentation on skb.
1948 *	@skb: buffer to segment
1949 *	@features: features for the output path (see dev->features)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1950 *
1951 *	This function segments the given skb and returns a list of segments.
1952 *
1953 *	It may return NULL if the skb requires no segmentation.  This is
1954 *	only possible when GSO is used for verifying header integrity.
 
 
1955 */
1956struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1957	netdev_features_t features)
1958{
1959	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1960	struct packet_type *ptype;
1961	__be16 type = skb->protocol;
1962	int vlan_depth = ETH_HLEN;
1963	int err;
1964
1965	while (type == htons(ETH_P_8021Q)) {
1966		struct vlan_hdr *vh;
 
 
 
1967
1968		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1969			return ERR_PTR(-EINVAL);
 
 
 
 
 
1970
1971		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1972		type = vh->h_vlan_encapsulated_proto;
1973		vlan_depth += VLAN_HLEN;
1974	}
1975
1976	skb_reset_mac_header(skb);
1977	skb->mac_len = skb->network_header - skb->mac_header;
1978	__skb_pull(skb, skb->mac_len);
1979
1980	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1981		skb_warn_bad_offload(skb);
1982
1983		if (skb_header_cloned(skb) &&
1984		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1985			return ERR_PTR(err);
1986	}
1987
1988	rcu_read_lock();
1989	list_for_each_entry_rcu(ptype,
1990			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1991		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1992			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1993				err = ptype->gso_send_check(skb);
1994				segs = ERR_PTR(err);
1995				if (err || skb_gso_ok(skb, features))
1996					break;
1997				__skb_push(skb, (skb->data -
1998						 skb_network_header(skb)));
1999			}
2000			segs = ptype->gso_segment(skb, features);
2001			break;
2002		}
2003	}
2004	rcu_read_unlock();
2005
2006	__skb_push(skb, skb->data - skb_mac_header(skb));
 
2007
2008	return segs;
2009}
2010EXPORT_SYMBOL(skb_gso_segment);
2011
2012/* Take action when hardware reception checksum errors are detected. */
2013#ifdef CONFIG_BUG
2014void netdev_rx_csum_fault(struct net_device *dev)
2015{
2016	if (net_ratelimit()) {
2017		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
 
2018		dump_stack();
2019	}
2020}
2021EXPORT_SYMBOL(netdev_rx_csum_fault);
2022#endif
2023
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when at least one page fragment of @skb cannot be used for
 * DMA by @dev, 0 otherwise.  Without CONFIG_HIGHMEM it always returns 0.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int n;

	/* Device cannot DMA from highmem: reject any highmem fragment. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
			if (PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[n])))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	/* Reject fragments whose page ends beyond the parent's DMA mask. */
	for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[n];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2056
/*
 * Private data kept in skb->cb while a segment list is chained on
 * skb->next: holds the skb's original destructor so it can be called
 * or restored later.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2062
2063static void dev_gso_skb_destructor(struct sk_buff *skb)
2064{
2065	struct dev_gso_cb *cb;
2066
2067	do {
2068		struct sk_buff *nskb = skb->next;
2069
2070		skb->next = nskb->next;
2071		nskb->next = NULL;
2072		kfree_skb(nskb);
2073	} while (skb->next);
2074
2075	cb = DEV_GSO_CB(skb);
2076	if (cb->destructor)
2077		cb->destructor(skb);
2078}
2079
2080/**
2081 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2082 *	@skb: buffer to segment
2083 *	@features: device features as applicable to this skb
2084 *
2085 *	This function segments the given skb and stores the list of segments
2086 *	in skb->next.
2087 */
2088static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2089{
2090	struct sk_buff *segs;
 
 
2091
2092	segs = skb_gso_segment(skb, features);
 
 
 
2093
2094	/* Verifying header integrity only. */
2095	if (!segs)
2096		return 0;
2097
2098	if (IS_ERR(segs))
2099		return PTR_ERR(segs);
 
 
 
 
2100
2101	skb->next = segs;
2102	DEV_GSO_CB(skb)->destructor = skb->destructor;
2103	skb->destructor = dev_gso_skb_destructor;
2104
2105	return 0;
 
 
 
 
2106}
 
2107
2108static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
 
 
2109{
2110	return ((features & NETIF_F_GEN_CSUM) ||
2111		((features & NETIF_F_V4_CSUM) &&
2112		 protocol == htons(ETH_P_IP)) ||
2113		((features & NETIF_F_V6_CSUM) &&
2114		 protocol == htons(ETH_P_IPV6)) ||
2115		((features & NETIF_F_FCOE_CRC) &&
2116		 protocol == htons(ETH_P_FCOE)));
2117}
2118
2119static netdev_features_t harmonize_features(struct sk_buff *skb,
2120	__be16 protocol, netdev_features_t features)
 
2121{
2122	if (!can_checksum_protocol(features, protocol)) {
2123		features &= ~NETIF_F_ALL_CSUM;
2124		features &= ~NETIF_F_SG;
2125	} else if (illegal_highdma(skb->dev, skb)) {
2126		features &= ~NETIF_F_SG;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2127	}
2128
2129	return features;
2130}
2131
2132netdev_features_t netif_skb_features(struct sk_buff *skb)
2133{
2134	__be16 protocol = skb->protocol;
2135	netdev_features_t features = skb->dev->features;
2136
2137	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2138		features &= ~NETIF_F_GSO_MASK;
2139
2140	if (protocol == htons(ETH_P_8021Q)) {
2141		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2142		protocol = veh->h_vlan_encapsulated_proto;
2143	} else if (!vlan_tx_tag_present(skb)) {
2144		return harmonize_features(skb, protocol, features);
2145	}
2146
2147	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2148
2149	if (protocol != htons(ETH_P_8021Q)) {
2150		return harmonize_features(skb, protocol, features);
2151	} else {
2152		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2153				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2154		return harmonize_features(skb, protocol, features);
2155	}
2156}
2157EXPORT_SYMBOL(netif_skb_features);
2158
2159/*
2160 * Returns true if either:
2161 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2162 *	2. skb is fragmented and the device does not support SG, or if
2163 *	   at least one of fragments is in highmem and device does not
2164 *	   support DMA from it.
2165 */
2166static inline int skb_needs_linearize(struct sk_buff *skb,
2167				      int features)
2168{
2169	return skb_is_nonlinear(skb) &&
2170			((skb_has_frag_list(skb) &&
2171				!(features & NETIF_F_FRAGLIST)) ||
2172			(skb_shinfo(skb)->nr_frags &&
2173				!(features & NETIF_F_SG)));
2174}
2175
2176int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2177			struct netdev_queue *txq)
2178{
2179	const struct net_device_ops *ops = dev->netdev_ops;
2180	int rc = NETDEV_TX_OK;
2181	unsigned int skb_len;
2182
2183	if (likely(!skb->next)) {
2184		netdev_features_t features;
2185
2186		/*
2187		 * If device doesn't need skb->dst, release it right now while
2188		 * its hot in this cpu cache
2189		 */
2190		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2191			skb_dst_drop(skb);
2192
2193		if (!list_empty(&ptype_all))
2194			dev_queue_xmit_nit(skb, dev);
2195
2196		features = netif_skb_features(skb);
 
 
 
 
2197
2198		if (vlan_tx_tag_present(skb) &&
2199		    !(features & NETIF_F_HW_VLAN_TX)) {
2200			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2201			if (unlikely(!skb))
2202				goto out;
2203
2204			skb->vlan_tci = 0;
 
 
 
 
2205		}
2206
2207		if (netif_needs_gso(skb, features)) {
2208			if (unlikely(dev_gso_segment(skb, features)))
2209				goto out_kfree_skb;
2210			if (skb->next)
2211				goto gso;
2212		} else {
2213			if (skb_needs_linearize(skb, features) &&
2214			    __skb_linearize(skb))
2215				goto out_kfree_skb;
2216
2217			/* If packet is not checksummed and device does not
2218			 * support checksumming for this protocol, complete
2219			 * checksumming here.
2220			 */
2221			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2222				skb_set_transport_header(skb,
2223					skb_checksum_start_offset(skb));
2224				if (!(features & NETIF_F_ALL_CSUM) &&
2225				     skb_checksum_help(skb))
2226					goto out_kfree_skb;
2227			}
2228		}
2229
2230		skb_len = skb->len;
2231		rc = ops->ndo_start_xmit(skb, dev);
2232		trace_net_dev_xmit(skb, rc, dev, skb_len);
2233		if (rc == NETDEV_TX_OK)
2234			txq_trans_update(txq);
2235		return rc;
2236	}
2237
2238gso:
2239	do {
2240		struct sk_buff *nskb = skb->next;
 
2241
2242		skb->next = nskb->next;
2243		nskb->next = NULL;
 
 
 
 
 
 
2244
2245		/*
2246		 * If device doesn't need nskb->dst, release it right now while
2247		 * its hot in this cpu cache
2248		 */
2249		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2250			skb_dst_drop(nskb);
2251
2252		skb_len = nskb->len;
2253		rc = ops->ndo_start_xmit(nskb, dev);
2254		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2255		if (unlikely(rc != NETDEV_TX_OK)) {
2256			if (rc & ~NETDEV_TX_MASK)
2257				goto out_kfree_gso_skb;
2258			nskb->next = skb->next;
2259			skb->next = nskb;
2260			return rc;
2261		}
2262		txq_trans_update(txq);
2263		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2264			return NETDEV_TX_BUSY;
2265	} while (skb->next);
2266
2267out_kfree_gso_skb:
2268	if (likely(skb->next == NULL))
2269		skb->destructor = DEV_GSO_CB(skb)->destructor;
2270out_kfree_skb:
2271	kfree_skb(skb);
2272out:
2273	return rc;
2274}
 
2275
2276static u32 hashrnd __read_mostly;
2277
2278/*
2279 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2280 * to be used as a distribution range.
2281 */
2282u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2283		  unsigned int num_tx_queues)
2284{
2285	u32 hash;
2286	u16 qoffset = 0;
2287	u16 qcount = num_tx_queues;
2288
2289	if (skb_rx_queue_recorded(skb)) {
2290		hash = skb_get_rx_queue(skb);
2291		while (unlikely(hash >= num_tx_queues))
2292			hash -= num_tx_queues;
2293		return hash;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2294	}
2295
2296	if (dev->num_tc) {
2297		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2298		qoffset = dev->tc_to_txq[tc].offset;
2299		qcount = dev->tc_to_txq[tc].count;
2300	}
2301
2302	if (skb->sk && skb->sk->sk_hash)
2303		hash = skb->sk->sk_hash;
2304	else
2305		hash = (__force u16) skb->protocol;
2306	hash = jhash_1word(hash, hashrnd);
2307
2308	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
 
 
 
 
2309}
2310EXPORT_SYMBOL(__skb_tx_hash);
2311
2312static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2313{
2314	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2315		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2316				     dev->name, queue_index,
2317				     dev->real_num_tx_queues);
2318		return 0;
2319	}
2320	return queue_index;
2321}
2322
/*
 * Pick a Tx queue for @skb from the XPS map of the current CPU.
 * Returns the queue index, or -1 when XPS is not compiled in, no map
 * exists for this CPU, or the mapped queue is out of range.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
	int queue_index = -1;
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	u32 hash;

	rcu_read_lock();

	dev_maps = rcu_dereference(dev->xps_maps);
	if (!dev_maps)
		goto unlock;

	map = rcu_dereference(dev_maps->cpu_map[raw_smp_processor_id()]);
	if (!map)
		goto unlock;

	if (map->len == 1) {
		queue_index = map->queues[0];
	} else {
		/* Several queues for this CPU: spread flows by hash. */
		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);
		queue_index = map->queues[
		    ((u64)hash * map->len) >> 32];
	}

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();
#endif
	return queue_index;
}
 
2360
2361static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2362					struct sk_buff *skb)
2363{
2364	int queue_index;
2365	const struct net_device_ops *ops = dev->netdev_ops;
2366
2367	if (dev->real_num_tx_queues == 1)
2368		queue_index = 0;
2369	else if (ops->ndo_select_queue) {
2370		queue_index = ops->ndo_select_queue(dev, skb);
2371		queue_index = dev_cap_txqueue(dev, queue_index);
2372	} else {
2373		struct sock *sk = skb->sk;
2374		queue_index = sk_tx_queue_get(sk);
2375
2376		if (queue_index < 0 || skb->ooo_okay ||
2377		    queue_index >= dev->real_num_tx_queues) {
2378			int old_index = queue_index;
2379
2380			queue_index = get_xps_queue(dev, skb);
2381			if (queue_index < 0)
2382				queue_index = skb_tx_hash(dev, skb);
2383
2384			if (queue_index != old_index && sk) {
2385				struct dst_entry *dst =
2386				    rcu_dereference_check(sk->sk_dst_cache, 1);
 
 
 
 
 
 
 
 
 
 
2387
2388				if (dst && skb_dst(skb) == dst)
2389					sk_tx_queue_set(sk, queue_index);
2390			}
2391		}
2392	}
2393
2394	skb_set_queue_mapping(skb, queue_index);
2395	return netdev_get_tx_queue(dev, queue_index);
 
 
 
 
2396}
2397
2398static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2399				 struct net_device *dev,
2400				 struct netdev_queue *txq)
2401{
2402	spinlock_t *root_lock = qdisc_lock(q);
 
2403	bool contended;
2404	int rc;
2405
2406	qdisc_skb_cb(skb)->pkt_len = skb->len;
2407	qdisc_calculate_pkt_len(skb, q);
 
 
 
 
 
 
 
 
 
 
2408	/*
2409	 * Heuristic to force contended enqueues to serialize on a
2410	 * separate lock before trying to get qdisc main lock.
2411	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2412	 * and dequeue packets faster.
2413	 */
2414	contended = qdisc_is_running(q);
2415	if (unlikely(contended))
2416		spin_lock(&q->busylock);
2417
2418	spin_lock(root_lock);
2419	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2420		kfree_skb(skb);
2421		rc = NET_XMIT_DROP;
2422	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2423		   qdisc_run_begin(q)) {
2424		/*
2425		 * This is a work-conserving queue; there are no old skbs
2426		 * waiting to be sent out; and the qdisc is not running -
2427		 * xmit the skb directly.
2428		 */
2429		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2430			skb_dst_force(skb);
2431
2432		qdisc_bstats_update(q, skb);
2433
2434		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2435			if (unlikely(contended)) {
2436				spin_unlock(&q->busylock);
2437				contended = false;
2438			}
2439			__qdisc_run(q);
2440		} else
2441			qdisc_run_end(q);
2442
 
2443		rc = NET_XMIT_SUCCESS;
2444	} else {
2445		skb_dst_force(skb);
2446		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2447		if (qdisc_run_begin(q)) {
2448			if (unlikely(contended)) {
2449				spin_unlock(&q->busylock);
2450				contended = false;
2451			}
2452			__qdisc_run(q);
 
2453		}
2454	}
2455	spin_unlock(root_lock);
 
 
2456	if (unlikely(contended))
2457		spin_unlock(&q->busylock);
2458	return rc;
2459}
2460
2461#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2462static void skb_update_prio(struct sk_buff *skb)
2463{
2464	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
 
 
 
 
 
 
 
 
 
 
 
2465
2466	if (!skb->priority && skb->sk && map) {
2467		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2468
2469		if (prioidx < map->priomap_len)
2470			skb->priority = map->priomap[prioidx];
2471	}
2472}
2473#else
2474#define skb_update_prio(skb)
2475#endif
2476
/*
 * Per-CPU depth counter of nested dev_hard_start_xmit() calls made from
 * dev_queue_xmit() on queueless devices; transmission is refused once
 * the depth exceeds RECURSION_LIMIT.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2479
2480/**
2481 *	dev_queue_xmit - transmit a buffer
2482 *	@skb: buffer to transmit
 
2483 *
2484 *	Queue a buffer for transmission to a network device. The caller must
2485 *	have set the device and priority and built the buffer before calling
2486 *	this function. The function can be called from an interrupt.
2487 *
2488 *	A negative errno code is returned on a failure. A success does not
2489 *	guarantee the frame will be transmitted as it may be dropped due
2490 *	to congestion or traffic shaping.
2491 *
2492 * -----------------------------------------------------------------------------------
2493 *      I notice this method can also return errors from the queue disciplines,
2494 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2495 *      be positive.
2496 *
2497 *      Regardless of the return value, the skb is consumed, so it is currently
2498 *      difficult to retry a send to this method.  (You can bump the ref count
2499 *      before sending to hold a reference for retry if you are careful.)
2500 *
2501 *      When calling this method, interrupts MUST be enabled.  This is because
2502 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2503 *          --BLG
2504 */
2505int dev_queue_xmit(struct sk_buff *skb)
2506{
2507	struct net_device *dev = skb->dev;
2508	struct netdev_queue *txq;
2509	struct Qdisc *q;
2510	int rc = -ENOMEM;
 
 
 
 
 
 
2511
2512	/* Disable soft irqs for various locks below. Also
2513	 * stops preemption for RCU.
2514	 */
2515	rcu_read_lock_bh();
2516
2517	skb_update_prio(skb);
2518
2519	txq = dev_pick_tx(dev, skb);
2520	q = rcu_dereference_bh(txq->qdisc);
2521
2522#ifdef CONFIG_NET_CLS_ACT
2523	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
 
 
 
 
 
 
 
2524#endif
 
 
 
 
 
 
 
 
 
 
 
2525	trace_net_dev_queue(skb);
2526	if (q->enqueue) {
2527		rc = __dev_xmit_skb(skb, q, dev, txq);
2528		goto out;
2529	}
2530
2531	/* The device has no queue. Common case for software devices:
2532	   loopback, all the sorts of tunnels...
2533
2534	   Really, it is unlikely that netif_tx_lock protection is necessary
2535	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2536	   counters.)
2537	   However, it is possible, that they rely on protection
2538	   made by us here.
2539
2540	   Check this and shot the lock. It is not prone from deadlocks.
2541	   Either shot noqueue qdisc, it is even simpler 8)
2542	 */
2543	if (dev->flags & IFF_UP) {
2544		int cpu = smp_processor_id(); /* ok because BHs are off */
2545
2546		if (txq->xmit_lock_owner != cpu) {
2547
2548			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2549				goto recursion_alert;
2550
 
 
 
 
2551			HARD_TX_LOCK(dev, txq, cpu);
2552
2553			if (!netif_xmit_stopped(txq)) {
2554				__this_cpu_inc(xmit_recursion);
2555				rc = dev_hard_start_xmit(skb, dev, txq);
2556				__this_cpu_dec(xmit_recursion);
2557				if (dev_xmit_complete(rc)) {
2558					HARD_TX_UNLOCK(dev, txq);
2559					goto out;
2560				}
2561			}
2562			HARD_TX_UNLOCK(dev, txq);
2563			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2564					     dev->name);
2565		} else {
2566			/* Recursion is detected! It is possible,
2567			 * unfortunately
2568			 */
2569recursion_alert:
2570			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2571					     dev->name);
2572		}
2573	}
2574
2575	rc = -ENETDOWN;
2576	rcu_read_unlock_bh();
2577
2578	kfree_skb(skb);
 
2579	return rc;
2580out:
2581	rcu_read_unlock_bh();
2582	return rc;
2583}
 
 
 
 
 
2584EXPORT_SYMBOL(dev_queue_xmit);
2585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2586
2587/*=======================================================================
2588			Receiver routines
2589  =======================================================================*/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2590
/* Length limit checked against the per-CPU input_pkt_queue in
 * enqueue_to_backlog(); packets arriving beyond it are dropped.
 */
int netdev_max_backlog __read_mostly = 1000;
/* Passed to net_timestamp_check() in netif_rx(), before queueing. */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not used in this part of the file; presumably the
 * packet budget of one NET_RX softirq run - confirm at the use site.
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
 
 
 
 
 
 
 
 
2595
/* Called with irq disabled.
 *
 * Append @napi to the poll list of @sd and raise NET_RX_SOFTIRQ.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2603
2604/*
2605 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2606 * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2607 * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2608 * if hash is a canonical 4-tuple hash over transport ports.
2609 */
2610void __skb_get_rxhash(struct sk_buff *skb)
2611{
2612	struct flow_keys keys;
2613	u32 hash;
2614
2615	if (!skb_flow_dissect(skb, &keys))
2616		return;
2617
2618	if (keys.ports) {
2619		if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2620			swap(keys.port16[0], keys.port16[1]);
2621		skb->l4_rxhash = 1;
2622	}
2623
2624	/* get a consistent hash (same value on both flow directions) */
2625	if ((__force u32)keys.dst < (__force u32)keys.src)
2626		swap(keys.dst, keys.src);
2627
2628	hash = jhash_3words((__force u32)keys.dst,
2629			    (__force u32)keys.src,
2630			    (__force u32)keys.ports, hashrnd);
2631	if (!hash)
2632		hash = 1;
2633
2634	skb->rxhash = hash;
2635}
2636EXPORT_SYMBOL(__skb_get_rxhash);
2637
2638#ifdef CONFIG_RPS
2639
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

/* Static key tested in netif_rx() to decide whether the RPS path
 * (get_rps_cpu()) is taken for incoming packets.
 */
struct static_key rps_needed __read_mostly;
 
 
 
2645
2646static struct rps_dev_flow *
2647set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2648	    struct rps_dev_flow *rflow, u16 next_cpu)
2649{
2650	if (next_cpu != RPS_NO_CPU) {
2651#ifdef CONFIG_RFS_ACCEL
2652		struct netdev_rx_queue *rxqueue;
2653		struct rps_dev_flow_table *flow_table;
2654		struct rps_dev_flow *old_rflow;
2655		u32 flow_id;
2656		u16 rxq_index;
2657		int rc;
2658
2659		/* Should we steer this flow to a different hardware queue? */
2660		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2661		    !(dev->features & NETIF_F_NTUPLE))
2662			goto out;
2663		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2664		if (rxq_index == skb_get_rx_queue(skb))
2665			goto out;
2666
2667		rxqueue = dev->_rx + rxq_index;
2668		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2669		if (!flow_table)
2670			goto out;
2671		flow_id = skb->rxhash & flow_table->mask;
2672		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2673							rxq_index, flow_id);
2674		if (rc < 0)
2675			goto out;
2676		old_rflow = rflow;
2677		rflow = &flow_table->flows[flow_id];
2678		rflow->filter = rc;
2679		if (old_rflow->filter == rflow->filter)
2680			old_rflow->filter = RPS_NO_FILTER;
2681	out:
2682#endif
2683		rflow->last_qtail =
2684			per_cpu(softnet_data, next_cpu).input_queue_head;
2685	}
2686
2687	rflow->cpu = next_cpu;
2688	return rflow;
2689}
2690
2691/*
2692 * get_rps_cpu is called from netif_receive_skb and returns the target
2693 * CPU from the RPS map of the receiving queue for a given skb.
2694 * rcu_read_lock must be held on entry.
2695 */
2696static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2697		       struct rps_dev_flow **rflowp)
2698{
2699	struct netdev_rx_queue *rxqueue;
2700	struct rps_map *map;
2701	struct rps_dev_flow_table *flow_table;
2702	struct rps_sock_flow_table *sock_flow_table;
2703	int cpu = -1;
2704	u16 tcpu;
 
2705
2706	if (skb_rx_queue_recorded(skb)) {
2707		u16 index = skb_get_rx_queue(skb);
 
2708		if (unlikely(index >= dev->real_num_rx_queues)) {
2709			WARN_ONCE(dev->real_num_rx_queues > 1,
2710				  "%s received packet on queue %u, but number "
2711				  "of RX queues is %u\n",
2712				  dev->name, index, dev->real_num_rx_queues);
2713			goto done;
2714		}
2715		rxqueue = dev->_rx + index;
2716	} else
2717		rxqueue = dev->_rx;
 
2718
 
2719	map = rcu_dereference(rxqueue->rps_map);
2720	if (map) {
2721		if (map->len == 1 &&
2722		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2723			tcpu = map->cpus[0];
2724			if (cpu_online(tcpu))
2725				cpu = tcpu;
2726			goto done;
2727		}
2728	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2729		goto done;
2730	}
2731
2732	skb_reset_network_header(skb);
2733	if (!skb_get_rxhash(skb))
 
2734		goto done;
2735
2736	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2737	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2738	if (flow_table && sock_flow_table) {
2739		u16 next_cpu;
2740		struct rps_dev_flow *rflow;
 
 
2741
2742		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2743		tcpu = rflow->cpu;
 
 
 
 
2744
2745		next_cpu = sock_flow_table->ents[skb->rxhash &
2746		    sock_flow_table->mask];
 
 
 
2747
2748		/*
2749		 * If the desired CPU (where last recvmsg was done) is
2750		 * different from current CPU (one in the rx-queue flow
2751		 * table entry), switch if one of the following holds:
2752		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2753		 *   - Current CPU is offline.
2754		 *   - The current CPU's queue tail has advanced beyond the
2755		 *     last packet that was enqueued using this table entry.
2756		 *     This guarantees that all previous packets for the flow
2757		 *     have been dequeued, thus preserving in order delivery.
2758		 */
2759		if (unlikely(tcpu != next_cpu) &&
2760		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2761		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2762		      rflow->last_qtail)) >= 0))
 
2763			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 
2764
2765		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2766			*rflowp = rflow;
2767			cpu = tcpu;
2768			goto done;
2769		}
2770	}
2771
2772	if (map) {
2773		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2774
 
 
2775		if (cpu_online(tcpu)) {
2776			cpu = tcpu;
2777			goto done;
2778		}
2779	}
2780
2781done:
2782	return cpu;
2783}
2784
2785#ifdef CONFIG_RFS_ACCEL
2786
2787/**
2788 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2789 * @dev: Device on which the filter was set
2790 * @rxq_index: RX queue index
2791 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2792 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2793 *
2794 * Drivers that implement ndo_rx_flow_steer() should periodically call
2795 * this function for each installed filter and remove the filters for
2796 * which it returns %true.
2797 */
2798bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2799			 u32 flow_id, u16 filter_id)
2800{
2801	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2802	struct rps_dev_flow_table *flow_table;
2803	struct rps_dev_flow *rflow;
2804	bool expire = true;
2805	int cpu;
2806
2807	rcu_read_lock();
2808	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2809	if (flow_table && flow_id <= flow_table->mask) {
2810		rflow = &flow_table->flows[flow_id];
2811		cpu = ACCESS_ONCE(rflow->cpu);
2812		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2813		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2814			   rflow->last_qtail) <
2815		     (int)(10 * flow_table->mask)))
2816			expire = false;
2817	}
2818	rcu_read_unlock();
2819	return expire;
2820}
2821EXPORT_SYMBOL(rps_may_expire_flow);
2822
2823#endif /* CONFIG_RFS_ACCEL */
2824
2825/* Called from hardirq (IPI) context */
2826static void rps_trigger_softirq(void *data)
2827{
2828	struct softnet_data *sd = data;
2829
2830	____napi_schedule(sd, &sd->backlog);
2831	sd->received_rps++;
2832}
2833
2834#endif /* CONFIG_RPS */
2835
/*
 * Check if this softnet_data structure belongs to another cpu.
 * If yes, queue it on our IPI list, raise NET_RX_SOFTIRQ and return 1.
 * If no (or RPS is not compiled in), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	/* Push sd on the head of this CPU's pending IPI list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2856
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2857/*
2858 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2859 * queue (may be a remote CPU queue).
2860 */
2861static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2862			      unsigned int *qtail)
2863{
2864	struct softnet_data *sd;
2865	unsigned long flags;
 
2866
2867	sd = &per_cpu(softnet_data, cpu);
2868
2869	local_irq_save(flags);
2870
2871	rps_lock(sd);
2872	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2873		if (skb_queue_len(&sd->input_pkt_queue)) {
 
 
 
2874enqueue:
2875			__skb_queue_tail(&sd->input_pkt_queue, skb);
2876			input_queue_tail_incr_save(sd, qtail);
2877			rps_unlock(sd);
2878			local_irq_restore(flags);
2879			return NET_RX_SUCCESS;
2880		}
2881
2882		/* Schedule NAPI for backlog device
2883		 * We can use non atomic operation since we own the queue lock
2884		 */
2885		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2886			if (!rps_ipi_queued(sd))
2887				____napi_schedule(sd, &sd->backlog);
2888		}
2889		goto enqueue;
2890	}
2891
 
2892	sd->dropped++;
2893	rps_unlock(sd);
2894
2895	local_irq_restore(flags);
2896
2897	atomic_long_inc(&skb->dev->rx_dropped);
2898	kfree_skb(skb);
2899	return NET_RX_DROP;
2900}
2901
2902/**
2903 *	netif_rx	-	post buffer to the network code
2904 *	@skb: buffer to post
2905 *
2906 *	This function receives a packet from a device driver and queues it for
2907 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2908 *	may be dropped during processing for congestion control or by the
2909 *	protocol layers.
2910 *
2911 *	return values:
2912 *	NET_RX_SUCCESS	(no congestion)
2913 *	NET_RX_DROP     (packet was dropped)
2914 *
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2915 */
 
 
 
 
 
 
2916
2917int netif_rx(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2918{
2919	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2920
2921	/* if netpoll wants it, pretend we never saw it */
2922	if (netpoll_rx(skb))
2923		return NET_RX_DROP;
2924
2925	net_timestamp_check(netdev_tstamp_prequeue, skb);
2926
2927	trace_netif_rx(skb);
 
2928#ifdef CONFIG_RPS
2929	if (static_key_false(&rps_needed)) {
2930		struct rps_dev_flow voidflow, *rflow = &voidflow;
2931		int cpu;
2932
2933		preempt_disable();
2934		rcu_read_lock();
2935
2936		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2937		if (cpu < 0)
2938			cpu = smp_processor_id();
2939
2940		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2941
2942		rcu_read_unlock();
2943		preempt_enable();
2944	} else
2945#endif
2946	{
2947		unsigned int qtail;
 
2948		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2949		put_cpu();
2950	}
2951	return ret;
2952}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2953EXPORT_SYMBOL(netif_rx);
2954
/*
 * Same as netif_rx(), but runs any pending softirqs before returning.
 * Preemption is disabled around both steps.  Returns netif_rx()'s
 * result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
2967EXPORT_SYMBOL(netif_rx_ni);
2968
2969static void net_tx_action(struct softirq_action *h)
2970{
2971	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2972
2973	if (sd->completion_queue) {
2974		struct sk_buff *clist;
2975
2976		local_irq_disable();
2977		clist = sd->completion_queue;
2978		sd->completion_queue = NULL;
2979		local_irq_enable();
2980
2981		while (clist) {
2982			struct sk_buff *skb = clist;
 
2983			clist = clist->next;
2984
2985			WARN_ON(atomic_read(&skb->users));
2986			trace_kfree_skb(skb, net_tx_action);
2987			__kfree_skb(skb);
 
 
 
 
 
 
 
2988		}
 
 
2989	}
2990
2991	if (sd->output_queue) {
2992		struct Qdisc *head;
2993
2994		local_irq_disable();
2995		head = sd->output_queue;
2996		sd->output_queue = NULL;
2997		sd->output_queue_tailp = &sd->output_queue;
2998		local_irq_enable();
2999
3000		while (head) {
3001			struct Qdisc *q = head;
3002			spinlock_t *root_lock;
3003
3004			head = head->next_sched;
3005
3006			root_lock = qdisc_lock(q);
3007			if (spin_trylock(root_lock)) {
3008				smp_mb__before_clear_bit();
3009				clear_bit(__QDISC_STATE_SCHED,
3010					  &q->state);
3011				qdisc_run(q);
3012				spin_unlock(root_lock);
3013			} else {
3014				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3015					      &q->state)) {
3016					__netif_reschedule(q);
3017				} else {
3018					smp_mb__before_clear_bit();
3019					clear_bit(__QDISC_STATE_SCHED,
3020						  &q->state);
3021				}
3022			}
 
 
 
 
 
 
 
 
3023		}
3024	}
 
 
3025}
3026
3027#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3028    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE.
 * It is only built when both the bridge and ATM LANE are configured,
 * and takes a device and an address.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
3032EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3033#endif
3034
3035#ifdef CONFIG_NET_CLS_ACT
3036/* TODO: Maybe we should just force sch_ingress to be compiled in
3037 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3038 * a compare and 2 stores extra right now if we dont have it on
3039 * but have CONFIG_NET_CLS_ACT
3040 * NOTE: This doesn't stop any functionality; if you dont have
3041 * the ingress scheduler, you just can't add policies on ingress.
3042 *
3043 */
3044static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3045{
3046	struct net_device *dev = skb->dev;
3047	u32 ttl = G_TC_RTTL(skb->tc_verd);
3048	int result = TC_ACT_OK;
3049	struct Qdisc *q;
3050
3051	if (unlikely(MAX_RED_LOOP < ttl++)) {
3052		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3053				     skb->skb_iif, dev->ifindex);
3054		return TC_ACT_SHOT;
3055	}
3056
3057	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3058	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3059
3060	q = rxq->qdisc;
3061	if (q != &noop_qdisc) {
3062		spin_lock(qdisc_lock(q));
3063		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3064			result = qdisc_enqueue_root(skb, q);
3065		spin_unlock(qdisc_lock(q));
3066	}
3067
3068	return result;
3069}
3070
3071static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3072					 struct packet_type **pt_prev,
3073					 int *ret, struct net_device *orig_dev)
3074{
3075	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
 
 
3076
3077	if (!rxq || rxq->qdisc == &noop_qdisc)
3078		goto out;
 
 
 
 
 
3079
3080	if (*pt_prev) {
3081		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3082		*pt_prev = NULL;
3083	}
3084
3085	switch (ing_filter(skb, rxq)) {
 
 
 
 
 
 
 
 
 
3086	case TC_ACT_SHOT:
3087	case TC_ACT_STOLEN:
3088		kfree_skb(skb);
3089		return NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3090	}
3091
3092out:
3093	skb->tc_verd = 0;
3094	return skb;
3095}
3096#endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3097
3098/**
3099 *	netdev_rx_handler_register - register receive handler
3100 *	@dev: device to register a handler for
3101 *	@rx_handler: receive handler to register
3102 *	@rx_handler_data: data pointer that is used by rx handler
3103 *
3104 *	Register a receive hander for a device. This handler will then be
3105 *	called from __netif_receive_skb. A negative errno code is returned
3106 *	on a failure.
3107 *
3108 *	The caller must hold the rtnl_mutex.
3109 *
3110 *	For a general description of rx_handler, see enum rx_handler_result.
3111 */
3112int netdev_rx_handler_register(struct net_device *dev,
3113			       rx_handler_func_t *rx_handler,
3114			       void *rx_handler_data)
3115{
3116	ASSERT_RTNL();
3117
3118	if (dev->rx_handler)
3119		return -EBUSY;
3120
 
 
 
 
3121	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3122	rcu_assign_pointer(dev->rx_handler, rx_handler);
3123
3124	return 0;
3125}
3126EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3127
3128/**
3129 *	netdev_rx_handler_unregister - unregister receive handler
3130 *	@dev: device to unregister a handler from
3131 *
3132 *	Unregister a receive hander from a device.
3133 *
3134 *	The caller must hold the rtnl_mutex.
3135 */
3136void netdev_rx_handler_unregister(struct net_device *dev)
3137{
3138
3139	ASSERT_RTNL();
3140	RCU_INIT_POINTER(dev->rx_handler, NULL);
 
 
 
 
 
3141	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3142}
3143EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3144
/*
 * Core receive path for one skb.
 *
 * Hands the skb to the ptype_all handlers, the ingress classifier (when
 * CONFIG_NET_CLS_ACT is set), VLAN processing, the device rx_handler and
 * finally the protocol handlers hashed in ptype_base.  Delivery is always
 * one handler behind: a matching handler is remembered in pt_prev and
 * only passed to deliver_skb() once the next match is found, so the last
 * match is invoked directly through ->func() at the end.
 *
 * Returns the result of the last delivery, or NET_RX_DROP when netpoll
 * claims the packet or no handler matched (the skb is then freed and the
 * device rx_dropped counter is incremented).
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	/* Record the incoming interface only the first time through. */
	if (!skb->skb_iif)
		skb->skb_iif = skb->dev->ifindex;
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

	/* Re-entered when VLAN processing or the rx_handler asks for it. */
another_round:

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip the taps and ingress classifier. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* Handlers on ptype_all: unbound ones and those bound to skb->dev. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	/* handle_ing() returns NULL when the skb was dropped or stolen. */
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (vlan_tx_tag_present(skb)) {
		/* Flush the pending handler before VLAN processing. */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb, !rx_handler))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	if (rx_handler) {
		/* Flush the pending handler before the rx_handler may take the skb. */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* intentional fall through to RX_HANDLER_PASS */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	/*
	 * Protocol handlers: the type must match, and the handler's device
	 * must equal null_or_dev (NULL, i.e. unbound handlers, unless exact
	 * delivery was requested), the current skb->dev, or the original
	 * receiving device.
	 */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* Last matching handler is called directly, not via deliver_skb(). */
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
3268
3269/**
3270 *	netif_receive_skb - process receive buffer from network
3271 *	@skb: buffer to process
3272 *
3273 *	netif_receive_skb() is the main receive data processing function.
3274 *	It always succeeds. The buffer may be dropped during processing
3275 *	for congestion control or by the protocol layers.
3276 *
3277 *	This function may only be called from softirq context and interrupts
3278 *	should be enabled.
3279 *
3280 *	Return values (usually ignored):
3281 *	NET_RX_SUCCESS: no congestion
3282 *	NET_RX_DROP: packet was dropped
3283 */
3284int netif_receive_skb(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3285{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3286	net_timestamp_check(netdev_tstamp_prequeue, skb);
3287
3288	if (skb_defer_rx_timestamp(skb))
3289		return NET_RX_SUCCESS;
3290
 
3291#ifdef CONFIG_RPS
3292	if (static_key_false(&rps_needed)) {
3293		struct rps_dev_flow voidflow, *rflow = &voidflow;
3294		int cpu, ret;
3295
3296		rcu_read_lock();
3297
3298		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3299
3300		if (cpu >= 0) {
3301			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3302			rcu_read_unlock();
3303			return ret;
3304		}
3305		rcu_read_unlock();
3306	}
3307#endif
3308	return __netif_receive_skb(skb);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3309}
3310EXPORT_SYMBOL(netif_receive_skb);
3311
3312/* Network device is going away, flush any packets still pending
3313 * Called with irqs disabled.
 
 
 
 
 
 
 
3314 */
3315static void flush_backlog(void *arg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3316{
3317	struct net_device *dev = arg;
3318	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3319	struct sk_buff *skb, *tmp;
 
 
 
 
3320
 
3321	rps_lock(sd);
3322	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3323		if (skb->dev == dev) {
3324			__skb_unlink(skb, &sd->input_pkt_queue);
3325			kfree_skb(skb);
3326			input_queue_head_incr(sd);
3327		}
3328	}
3329	rps_unlock(sd);
 
3330
3331	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3332		if (skb->dev == dev) {
3333			__skb_unlink(skb, &sd->process_queue);
3334			kfree_skb(skb);
3335			input_queue_head_incr(sd);
3336		}
3337	}
 
3338}
3339
3340static int napi_gro_complete(struct sk_buff *skb)
3341{
3342	struct packet_type *ptype;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3343	__be16 type = skb->protocol;
3344	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3345	int err = -ENOENT;
3346
 
 
3347	if (NAPI_GRO_CB(skb)->count == 1) {
3348		skb_shinfo(skb)->gso_size = 0;
3349		goto out;
3350	}
3351
3352	rcu_read_lock();
3353	list_for_each_entry_rcu(ptype, head, list) {
3354		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3355			continue;
3356
3357		err = ptype->gro_complete(skb);
 
 
3358		break;
3359	}
3360	rcu_read_unlock();
3361
3362	if (err) {
3363		WARN_ON(&ptype->list == head);
3364		kfree_skb(skb);
3365		return NET_RX_SUCCESS;
3366	}
3367
3368out:
3369	return netif_receive_skb(skb);
 
3370}
3371
3372inline void napi_gro_flush(struct napi_struct *napi)
 
3373{
3374	struct sk_buff *skb, *next;
 
3375
3376	for (skb = napi->gro_list; skb; skb = next) {
3377		next = skb->next;
3378		skb->next = NULL;
3379		napi_gro_complete(skb);
 
 
3380	}
3381
3382	napi->gro_count = 0;
3383	napi->gro_list = NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3384}
3385EXPORT_SYMBOL(napi_gro_flush);
3386
/*
 * Try to merge @skb into a flow held on @napi's GRO list.
 *
 * Returns GRO_NORMAL when the skb has to go through the regular receive
 * path, GRO_HELD when it was added to the GRO list as a new entry, and
 * GRO_MERGED or GRO_MERGED_FREE (depending on the ->free flag in its GRO
 * control block) when the protocol handler reported it as belonging to a
 * held flow.
 */
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
	int same_flow;
	int mac_len;
	enum gro_result ret;

	/* GRO disabled on the device, or netpoll active on this skb. */
	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb))
		goto normal;

	/* Use the first unbound handler for this protocol with ->gro_receive. */
	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		mac_len = skb->network_header - skb->mac_header;
		skb->mac_len = mac_len;
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;

		pp = ptype->gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* The walk reached the list head: no handler was found. */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* The handler returned a held entry: unlink and complete it. */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
		goto normal;

	/* Start holding this skb as a new entry at the head of the list. */
	napi->gro_count++;
	NAPI_GRO_CB(skb)->count = 1;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/*
	 * The GRO offset lies beyond the linear data: copy the missing
	 * bytes from frag0 into the tail and shrink the first fragment
	 * accordingly, dropping it when it becomes empty.
	 */
	if (skb_headlen(skb) < skb_gro_offset(skb)) {
		int grow = skb_gro_offset(skb) - skb_headlen(skb);

		BUG_ON(skb->end - skb->tail < grow);

		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

		skb->tail += grow;
		skb->data_len -= grow;

		skb_shinfo(skb)->frags[0].page_offset += grow;
		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
			skb_frag_unref(skb, 0);
			memmove(skb_shinfo(skb)->frags,
				skb_shinfo(skb)->frags + 1,
				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
		}
	}

ok:
	return ret;

normal:
	/* Not mergeable: still pull headers, then report GRO_NORMAL. */
	ret = GRO_NORMAL;
	goto pull;
}
EXPORT_SYMBOL(dev_gro_receive);
3478
3479static inline gro_result_t
3480__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3481{
3482	struct sk_buff *p;
3483	unsigned int maclen = skb->dev->hard_header_len;
3484
3485	for (p = napi->gro_list; p; p = p->next) {
3486		unsigned long diffs;
 
 
 
 
 
 
3487
3488		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3489		diffs |= p->vlan_tci ^ skb->vlan_tci;
3490		if (maclen == ETH_HLEN)
3491			diffs |= compare_ether_header(skb_mac_header(p),
3492						      skb_gro_mac_header(skb));
3493		else if (!diffs)
3494			diffs = memcmp(skb_mac_header(p),
3495				       skb_gro_mac_header(skb),
3496				       maclen);
3497		NAPI_GRO_CB(p)->same_flow = !diffs;
3498		NAPI_GRO_CB(p)->flush = 0;
3499	}
 
 
 
3500
3501	return dev_gro_receive(napi, skb);
 
 
 
 
3502}
3503
3504gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 
 
3505{
3506	switch (ret) {
3507	case GRO_NORMAL:
3508		if (netif_receive_skb(skb))
3509			ret = GRO_DROP;
3510		break;
3511
3512	case GRO_DROP:
3513		kfree_skb(skb);
3514		break;
3515
3516	case GRO_MERGED_FREE:
3517		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3518			kmem_cache_free(skbuff_head_cache, skb);
3519		else
3520			__kfree_skb(skb);
3521		break;
3522
3523	case GRO_HELD:
3524	case GRO_MERGED:
 
3525		break;
3526	}
3527
3528	return ret;
3529}
3530EXPORT_SYMBOL(napi_skb_finish);
3531
3532void skb_gro_reset_offset(struct sk_buff *skb)
3533{
3534	NAPI_GRO_CB(skb)->data_offset = 0;
3535	NAPI_GRO_CB(skb)->frag0 = NULL;
3536	NAPI_GRO_CB(skb)->frag0_len = 0;
3537
3538	if (skb->mac_header == skb->tail &&
3539	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3540		NAPI_GRO_CB(skb)->frag0 =
3541			skb_frag_address(&skb_shinfo(skb)->frags[0]);
3542		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3543	}
3544}
3545EXPORT_SYMBOL(skb_gro_reset_offset);
3546
3547gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3548{
3549	skb_gro_reset_offset(skb);
3550
3551	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
 
 
 
3552}
3553EXPORT_SYMBOL(napi_gro_receive);
3554
3555static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3556{
 
 
 
 
3557	__skb_pull(skb, skb_headlen(skb));
3558	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3559	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3560	skb->vlan_tci = 0;
3561	skb->dev = napi->dev;
3562	skb->skb_iif = 0;
3563
 
 
 
 
 
 
 
 
3564	napi->skb = skb;
3565}
3566
3567struct sk_buff *napi_get_frags(struct napi_struct *napi)
3568{
3569	struct sk_buff *skb = napi->skb;
3570
3571	if (!skb) {
3572		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3573		if (skb)
3574			napi->skb = skb;
 
 
3575	}
3576	return skb;
3577}
3578EXPORT_SYMBOL(napi_get_frags);
3579
3580gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3581			       gro_result_t ret)
 
3582{
3583	switch (ret) {
3584	case GRO_NORMAL:
3585	case GRO_HELD:
 
3586		skb->protocol = eth_type_trans(skb, skb->dev);
3587
3588		if (ret == GRO_HELD)
3589			skb_gro_pull(skb, -ETH_HLEN);
3590		else if (netif_receive_skb(skb))
3591			ret = GRO_DROP;
3592		break;
3593
3594	case GRO_DROP:
3595	case GRO_MERGED_FREE:
3596		napi_reuse_skb(napi, skb);
3597		break;
3598
 
 
 
 
 
 
 
3599	case GRO_MERGED:
 
3600		break;
3601	}
3602
3603	return ret;
3604}
3605EXPORT_SYMBOL(napi_frags_finish);
3606
 
 
 
 
3607static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3608{
3609	struct sk_buff *skb = napi->skb;
3610	struct ethhdr *eth;
3611	unsigned int hlen;
3612	unsigned int off;
3613
3614	napi->skb = NULL;
3615
3616	skb_reset_mac_header(skb);
3617	skb_gro_reset_offset(skb);
3618
3619	off = skb_gro_offset(skb);
3620	hlen = off + sizeof(*eth);
3621	eth = skb_gro_header_fast(skb, off);
3622	if (skb_gro_header_hard(skb, hlen)) {
3623		eth = skb_gro_header_slow(skb, hlen, off);
3624		if (unlikely(!eth)) {
 
 
3625			napi_reuse_skb(napi, skb);
3626			skb = NULL;
3627			goto out;
3628		}
 
 
 
 
 
3629	}
3630
3631	skb_gro_pull(skb, sizeof(*eth));
3632
3633	/*
3634	 * This works because the only protocols we care about don't require
3635	 * special handling.  We'll fix it up properly at the end.
 
3636	 */
3637	skb->protocol = eth->h_proto;
3638
3639out:
3640	return skb;
3641}
3642
3643gro_result_t napi_gro_frags(struct napi_struct *napi)
3644{
 
3645	struct sk_buff *skb = napi_frags_skb(napi);
3646
3647	if (!skb)
3648		return GRO_DROP;
3649
3650	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
 
 
 
 
 
3651}
3652EXPORT_SYMBOL(napi_gro_frags);
3653
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * Without CONFIG_RPS this only re-enables local interrupts.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remote = sd->rps_ipi_list;
	struct softnet_data *next;

	if (!remote) {
		local_irq_enable();
		return;
	}

	/* Take the list private before interrupts come back on. */
	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	for (; remote; remote = next) {
		next = remote->rps_ipi_next;

		if (cpu_online(remote->cpu))
			__smp_call_function_single(remote->cpu,
						   &remote->csd, 0);
	}
#else
	local_irq_enable();
#endif
}
3681
 
 
 
 
 
 
 
 
 
3682static int process_backlog(struct napi_struct *napi, int quota)
3683{
3684	int work = 0;
3685	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 
 
3686
3687#ifdef CONFIG_RPS
3688	/* Check if we have pending ipi, its better to send them now,
3689	 * not waiting net_rx_action() end.
3690	 */
3691	if (sd->rps_ipi_list) {
3692		local_irq_disable();
3693		net_rps_action_and_irq_enable(sd);
3694	}
3695#endif
3696	napi->weight = weight_p;
3697	local_irq_disable();
3698	while (work < quota) {
3699		struct sk_buff *skb;
3700		unsigned int qlen;
3701
3702		while ((skb = __skb_dequeue(&sd->process_queue))) {
3703			local_irq_enable();
3704			__netif_receive_skb(skb);
3705			local_irq_disable();
3706			input_queue_head_incr(sd);
3707			if (++work >= quota) {
3708				local_irq_enable();
3709				return work;
3710			}
3711		}
3712
 
3713		rps_lock(sd);
3714		qlen = skb_queue_len(&sd->input_pkt_queue);
3715		if (qlen)
3716			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3717						   &sd->process_queue);
3718
3719		if (qlen < quota - work) {
3720			/*
3721			 * Inline a custom version of __napi_complete().
3722			 * only current cpu owns and manipulates this napi,
3723			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3724			 * we can use a plain write instead of clear_bit(),
 
3725			 * and we dont need an smp_mb() memory barrier.
3726			 */
3727			list_del(&napi->poll_list);
3728			napi->state = 0;
3729
3730			quota = work + qlen;
 
 
3731		}
3732		rps_unlock(sd);
 
3733	}
3734	local_irq_enable();
3735
3736	return work;
3737}
3738
3739/**
3740 * __napi_schedule - schedule for receive
3741 * @n: entry to schedule
3742 *
3743 * The entry's receive function will be scheduled to run
 
3744 */
3745void __napi_schedule(struct napi_struct *n)
3746{
3747	unsigned long flags;
3748
3749	local_irq_save(flags);
3750	____napi_schedule(&__get_cpu_var(softnet_data), n);
3751	local_irq_restore(flags);
3752}
3753EXPORT_SYMBOL(__napi_schedule);
3754
3755void __napi_complete(struct napi_struct *n)
 
 
 
 
 
 
 
 
 
3756{
3757	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3758	BUG_ON(n->gro_list);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3759
3760	list_del(&n->poll_list);
3761	smp_mb__before_clear_bit();
3762	clear_bit(NAPI_STATE_SCHED, &n->state);
 
 
 
 
 
 
3763}
3764EXPORT_SYMBOL(__napi_complete);
3765
3766void napi_complete(struct napi_struct *n)
3767{
3768	unsigned long flags;
 
3769
3770	/*
3771	 * don't let napi dequeue from the cpu poll list
3772	 * just in case its running on a different cpu
 
 
3773	 */
3774	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3775		return;
3776
3777	napi_gro_flush(n);
3778	local_irq_save(flags);
3779	__napi_complete(n);
3780	local_irq_restore(flags);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3781}
3782EXPORT_SYMBOL(napi_complete);
3783
3784void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3785		    int (*poll)(struct napi_struct *, int), int weight)
3786{
3787	INIT_LIST_HEAD(&napi->poll_list);
3788	napi->gro_count = 0;
3789	napi->gro_list = NULL;
 
3790	napi->skb = NULL;
 
 
3791	napi->poll = poll;
 
 
 
3792	napi->weight = weight;
3793	list_add(&napi->dev_list, &dev->napi_list);
3794	napi->dev = dev;
3795#ifdef CONFIG_NETPOLL
3796	spin_lock_init(&napi->poll_lock);
3797	napi->poll_owner = -1;
3798#endif
3799	set_bit(NAPI_STATE_SCHED, &napi->state);
 
 
 
3800}
3801EXPORT_SYMBOL(netif_napi_add);
3802
3803void netif_napi_del(struct napi_struct *napi)
3804{
3805	struct sk_buff *skb, *next;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3806
 
 
 
 
 
 
3807	list_del_init(&napi->dev_list);
3808	napi_free_frags(napi);
3809
3810	for (skb = napi->gro_list; skb; skb = next) {
3811		next = skb->next;
3812		skb->next = NULL;
3813		kfree_skb(skb);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3814	}
3815
3816	napi->gro_list = NULL;
3817	napi->gro_count = 0;
 
 
 
 
3818}
3819EXPORT_SYMBOL(netif_napi_del);
3820
3821static void net_rx_action(struct softirq_action *h)
3822{
3823	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3824	unsigned long time_limit = jiffies + 2;
 
3825	int budget = netdev_budget;
3826	void *have;
 
3827
3828	local_irq_disable();
 
 
3829
3830	while (!list_empty(&sd->poll_list)) {
3831		struct napi_struct *n;
3832		int work, weight;
3833
3834		/* If softirq window is exhuasted then punt.
 
 
 
 
 
 
 
 
 
3835		 * Allow this to run for 2 jiffies since which will allow
3836		 * an average latency of 1.5/HZ.
3837		 */
3838		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3839			goto softnet_break;
 
 
 
 
3840
3841		local_irq_enable();
3842
3843		/* Even though interrupts have been re-enabled, this
3844		 * access is safe because interrupts can only add new
3845		 * entries to the tail of this list, and only ->poll()
3846		 * calls can remove this head entry from the list.
3847		 */
3848		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3849
3850		have = netpoll_poll_lock(n);
 
 
 
3851
3852		weight = n->weight;
 
3853
3854		/* This NAPI_STATE_SCHED test is for avoiding a race
3855		 * with netpoll's poll_napi().  Only the entity which
3856		 * obtains the lock and sees NAPI_STATE_SCHED set will
3857		 * actually make the ->poll() call.  Therefore we avoid
3858		 * accidentally calling ->poll() when NAPI is not scheduled.
3859		 */
3860		work = 0;
3861		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3862			work = n->poll(n, weight);
3863			trace_napi_poll(n);
3864		}
3865
3866		WARN_ON_ONCE(work > weight);
 
3867
3868		budget -= work;
 
3869
3870		local_irq_disable();
 
3871
3872		/* Drivers must not modify the NAPI state if they
3873		 * consume the entire weight.  In such cases this code
3874		 * still "owns" the NAPI instance and therefore can
3875		 * move the instance around on the list at-will.
3876		 */
3877		if (unlikely(work == weight)) {
3878			if (unlikely(napi_disable_pending(n))) {
3879				local_irq_enable();
3880				napi_complete(n);
3881				local_irq_disable();
3882			} else
3883				list_move_tail(&n->poll_list, &sd->poll_list);
3884		}
3885
3886		netpoll_poll_unlock(have);
3887	}
3888out:
3889	net_rps_action_and_irq_enable(sd);
3890
3891#ifdef CONFIG_NET_DMA
3892	/*
3893	 * There may not be any more sk_buffs coming right now, so push
3894	 * any pending DMA copies to hardware
3895	 */
3896	dma_issue_pending_all();
3897#endif
3898
3899	return;
 
 
 
3900
3901softnet_break:
3902	sd->time_squeeze++;
3903	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3904	goto out;
3905}
3906
3907static gifconf_func_t *gifconf_list[NPROTO];
3908
3909/**
3910 *	register_gifconf	-	register a SIOCGIF handler
3911 *	@family: Address family
3912 *	@gifconf: Function handler
3913 *
3914 *	Register protocol dependent address dumping routines. The handler
3915 *	that is passed must not be freed or reused until it has been replaced
3916 *	by another handler.
3917 */
3918int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
 
3919{
3920	if (family >= NPROTO)
3921		return -EINVAL;
3922	gifconf_list[family] = gifconf;
3923	return 0;
 
 
 
 
3924}
3925EXPORT_SYMBOL(register_gifconf);
3926
 
 
 
 
 
 
 
 
 
3927
3928/*
3929 *	Map an interface index to its name (SIOCGIFNAME)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3930 */
 
 
 
3931
3932/*
3933 *	We need this ioctl for efficient implementation of the
3934 *	if_indextoname() function required by the IPv6 API.  Without
3935 *	it, we would have to search all the interfaces to find a
3936 *	match.  --pb
 
 
 
 
 
3937 */
 
 
 
3938
3939static int dev_ifname(struct net *net, struct ifreq __user *arg)
 
 
 
 
 
 
 
 
 
 
 
 
 
3940{
3941	struct net_device *dev;
3942	struct ifreq ifr;
3943
3944	/*
3945	 *	Fetch the caller's info block.
3946	 */
3947
3948	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3949		return -EFAULT;
3950
3951	rcu_read_lock();
3952	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3953	if (!dev) {
3954		rcu_read_unlock();
3955		return -ENODEV;
3956	}
3957
3958	strcpy(ifr.ifr_name, dev->name);
3959	rcu_read_unlock();
 
 
 
 
 
 
 
 
3960
3961	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3962		return -EFAULT;
3963	return 0;
3964}
3965
3966/*
3967 *	Perform a SIOCGIFCONF call. This structure will change
3968 *	size eventually, and there is nothing I can do about it.
3969 *	Thus we will need a 'compatibility mode'.
 
 
 
 
 
 
 
 
 
 
 
 
 
3970 */
 
 
 
 
 
 
3971
3972static int dev_ifconf(struct net *net, char __user *arg)
 
 
 
 
 
 
 
 
 
 
 
 
 
3973{
3974	struct ifconf ifc;
3975	struct net_device *dev;
3976	char __user *pos;
3977	int len;
3978	int total;
3979	int i;
3980
3981	/*
3982	 *	Fetch the caller's info block.
3983	 */
3984
3985	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3986		return -EFAULT;
3987
3988	pos = ifc.ifc_buf;
3989	len = ifc.ifc_len;
3990
3991	/*
3992	 *	Loop over the interfaces, and write an info block for each.
3993	 */
3994
3995	total = 0;
3996	for_each_netdev(net, dev) {
3997		for (i = 0; i < NPROTO; i++) {
3998			if (gifconf_list[i]) {
3999				int done;
4000				if (!pos)
4001					done = gifconf_list[i](dev, NULL, 0);
4002				else
4003					done = gifconf_list[i](dev, pos + total,
4004							       len - total);
4005				if (done < 0)
4006					return -EFAULT;
4007				total += done;
4008			}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4009		}
4010	}
4011
4012	/*
4013	 *	All done.  Write the updated control block back to the caller.
4014	 */
4015	ifc.ifc_len = total;
 
 
 
4016
4017	/*
4018	 * 	Both BSD and Solaris return 0 here, so we do too.
4019	 */
4020	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4021}
4022
4023#ifdef CONFIG_PROC_FS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4024
/*
 * A seq_file position packs a hash bucket number into the high bits and
 * an offset within that bucket into the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)

/* Bucket number stored in position x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Offset within the bucket stored in position x. */
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
/* Build a position from bucket b and offset o. */
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
 
 
 
4030
4031static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4032{
4033	struct net *net = seq_file_net(seq);
4034	struct net_device *dev;
4035	struct hlist_node *p;
4036	struct hlist_head *h;
4037	unsigned int count = 0, offset = get_offset(*pos);
4038
4039	h = &net->dev_name_head[get_bucket(*pos)];
4040	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4041		if (++count == offset)
4042			return dev;
4043	}
4044
4045	return NULL;
4046}
 
4047
4048static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
 
4049{
4050	struct net_device *dev;
4051	unsigned int bucket;
 
 
4052
4053	do {
4054		dev = dev_from_same_bucket(seq, pos);
4055		if (dev)
4056			return dev;
4057
4058		bucket = get_bucket(*pos) + 1;
4059		*pos = set_bucket_offset(bucket, 1);
4060	} while (bucket < NETDEV_HASHENTRIES);
4061
4062	return NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4063}
 
4064
4065/*
4066 *	This is invoked by the /proc filesystem handler to display a device
4067 *	in detail.
 
 
 
 
 
 
4068 */
4069void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4070	__acquires(RCU)
4071{
4072	rcu_read_lock();
4073	if (!*pos)
4074		return SEQ_START_TOKEN;
4075
4076	if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
 
 
4077		return NULL;
4078
4079	return dev_from_bucket(seq, pos);
 
 
4080}
 
4081
4082void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 
 
 
 
 
 
 
 
 
 
4083{
4084	++*pos;
4085	return dev_from_bucket(seq, pos);
 
 
 
 
 
 
 
 
4086}
 
4087
4088void dev_seq_stop(struct seq_file *seq, void *v)
4089	__releases(RCU)
4090{
4091	rcu_read_unlock();
 
 
 
 
 
 
 
 
 
4092}
4093
4094static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
 
 
4095{
4096	struct rtnl_link_stats64 temp;
4097	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 
 
 
 
 
 
 
4098
4099	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4100		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4101		   dev->name, stats->rx_bytes, stats->rx_packets,
4102		   stats->rx_errors,
4103		   stats->rx_dropped + stats->rx_missed_errors,
4104		   stats->rx_fifo_errors,
4105		   stats->rx_length_errors + stats->rx_over_errors +
4106		    stats->rx_crc_errors + stats->rx_frame_errors,
4107		   stats->rx_compressed, stats->multicast,
4108		   stats->tx_bytes, stats->tx_packets,
4109		   stats->tx_errors, stats->tx_dropped,
4110		   stats->tx_fifo_errors, stats->collisions,
4111		   stats->tx_carrier_errors +
4112		    stats->tx_aborted_errors +
4113		    stats->tx_window_errors +
4114		    stats->tx_heartbeat_errors,
4115		   stats->tx_compressed);
4116}
4117
4118/*
4119 *	Called from the PROCfs module. This now uses the new arbitrary sized
4120 *	/proc/net interface to create /proc/net/dev
4121 */
4122static int dev_seq_show(struct seq_file *seq, void *v)
4123{
4124	if (v == SEQ_START_TOKEN)
4125		seq_puts(seq, "Inter-|   Receive                            "
4126			      "                    |  Transmit\n"
4127			      " face |bytes    packets errs drop fifo frame "
4128			      "compressed multicast|bytes    packets errs "
4129			      "drop fifo colls carrier compressed\n");
4130	else
4131		dev_seq_printf_stats(seq, v);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4132	return 0;
4133}
 
4134
4135static struct softnet_data *softnet_get_online(loff_t *pos)
4136{
4137	struct softnet_data *sd = NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4138
4139	while (*pos < nr_cpu_ids)
4140		if (cpu_online(*pos)) {
4141			sd = &per_cpu(softnet_data, *pos);
 
4142			break;
4143		} else
4144			++*pos;
4145	return sd;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4146}
 
4147
4148static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4149{
4150	return softnet_get_online(pos);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4151}
4152
4153static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4154{
4155	++*pos;
4156	return softnet_get_online(pos);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4157}
4158
/* seq_file ->stop handler: the softnet walk has nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4162
4163static int softnet_seq_show(struct seq_file *seq, void *v)
 
4164{
4165	struct softnet_data *sd = v;
 
 
 
 
4166
4167	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4168		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4169		   0, 0, 0, 0, /* was fastroute */
4170		   sd->cpu_collision, sd->received_rps);
 
4171	return 0;
4172}
4173
4174static const struct seq_operations dev_seq_ops = {
4175	.start = dev_seq_start,
4176	.next  = dev_seq_next,
4177	.stop  = dev_seq_stop,
4178	.show  = dev_seq_show,
4179};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4180
4181static int dev_seq_open(struct inode *inode, struct file *file)
4182{
4183	return seq_open_net(inode, file, &dev_seq_ops,
4184			    sizeof(struct seq_net_private));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4185}
 
4186
4187static const struct file_operations dev_seq_fops = {
4188	.owner	 = THIS_MODULE,
4189	.open    = dev_seq_open,
4190	.read    = seq_read,
4191	.llseek  = seq_lseek,
4192	.release = seq_release_net,
4193};
 
 
 
 
 
4194
4195static const struct seq_operations softnet_seq_ops = {
4196	.start = softnet_seq_start,
4197	.next  = softnet_seq_next,
4198	.stop  = softnet_seq_stop,
4199	.show  = softnet_seq_show,
4200};
 
4201
4202static int softnet_seq_open(struct inode *inode, struct file *file)
 
 
 
 
 
 
 
4203{
4204	return seq_open(file, &softnet_seq_ops);
 
 
 
 
 
 
4205}
 
4206
4207static const struct file_operations softnet_seq_fops = {
4208	.owner	 = THIS_MODULE,
4209	.open    = softnet_seq_open,
4210	.read    = seq_read,
4211	.llseek  = seq_lseek,
4212	.release = seq_release,
4213};
4214
4215static void *ptype_get_idx(loff_t pos)
 
 
 
 
 
 
 
4216{
4217	struct packet_type *pt = NULL;
4218	loff_t i = 0;
4219	int t;
4220
4221	list_for_each_entry_rcu(pt, &ptype_all, list) {
4222		if (i == pos)
4223			return pt;
4224		++i;
4225	}
4226
4227	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4228		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4229			if (i == pos)
4230				return pt;
4231			++i;
4232		}
4233	}
4234	return NULL;
4235}
4236
4237static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4238	__acquires(RCU)
 
4239{
4240	rcu_read_lock();
4241	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
 
4242}
4243
4244static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 
 
4245{
4246	struct packet_type *pt;
4247	struct list_head *nxt;
4248	int hash;
4249
4250	++*pos;
4251	if (v == SEQ_START_TOKEN)
4252		return ptype_get_idx(0);
4253
4254	pt = v;
4255	nxt = pt->list.next;
4256	if (pt->type == htons(ETH_P_ALL)) {
4257		if (nxt != &ptype_all)
4258			goto found;
4259		hash = 0;
4260		nxt = ptype_base[0].next;
4261	} else
4262		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4263
4264	while (nxt == &ptype_base[hash]) {
4265		if (++hash >= PTYPE_HASH_SIZE)
4266			return NULL;
4267		nxt = ptype_base[hash].next;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4268	}
4269found:
4270	return list_entry(nxt, struct packet_type, list);
 
 
 
 
 
 
 
 
 
4271}
4272
4273static void ptype_seq_stop(struct seq_file *seq, void *v)
4274	__releases(RCU)
 
 
4275{
4276	rcu_read_unlock();
4277}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4278
4279static int ptype_seq_show(struct seq_file *seq, void *v)
 
 
 
 
 
 
 
 
 
 
 
4280{
4281	struct packet_type *pt = v;
4282
4283	if (v == SEQ_START_TOKEN)
4284		seq_puts(seq, "Type Device      Function\n");
4285	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4286		if (pt->type == htons(ETH_P_ALL))
4287			seq_puts(seq, "ALL ");
4288		else
4289			seq_printf(seq, "%04x", ntohs(pt->type));
4290
4291		seq_printf(seq, " %-8s %pF\n",
4292			   pt->dev ? pt->dev->name : "", pt->func);
 
 
 
4293	}
4294
4295	return 0;
4296}
4297
4298static const struct seq_operations ptype_seq_ops = {
4299	.start = ptype_seq_start,
4300	.next  = ptype_seq_next,
4301	.stop  = ptype_seq_stop,
4302	.show  = ptype_seq_show,
4303};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4304
4305static int ptype_seq_open(struct inode *inode, struct file *file)
4306{
4307	return seq_open_net(inode, file, &ptype_seq_ops,
4308			sizeof(struct seq_net_private));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4309}
4310
4311static const struct file_operations ptype_seq_fops = {
4312	.owner	 = THIS_MODULE,
4313	.open    = ptype_seq_open,
4314	.read    = seq_read,
4315	.llseek  = seq_lseek,
4316	.release = seq_release_net,
4317};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4318
 
 
4319
4320static int __net_init dev_proc_net_init(struct net *net)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4321{
4322	int rc = -ENOMEM;
 
 
 
4323
4324	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4325		goto out;
4326	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4327		goto out_dev;
4328	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4329		goto out_softnet;
4330
4331	if (wext_proc_init(net))
4332		goto out_ptype;
4333	rc = 0;
4334out:
4335	return rc;
4336out_ptype:
4337	proc_net_remove(net, "ptype");
4338out_softnet:
4339	proc_net_remove(net, "softnet_stat");
4340out_dev:
4341	proc_net_remove(net, "dev");
4342	goto out;
 
4343}
4344
4345static void __net_exit dev_proc_net_exit(struct net *net)
 
4346{
4347	wext_proc_exit(net);
 
4348
4349	proc_net_remove(net, "ptype");
4350	proc_net_remove(net, "softnet_stat");
4351	proc_net_remove(net, "dev");
 
4352}
4353
4354static struct pernet_operations __net_initdata dev_proc_ops = {
4355	.init = dev_proc_net_init,
4356	.exit = dev_proc_net_exit,
4357};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4358
4359static int __init dev_proc_init(void)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4360{
4361	return register_pernet_subsys(&dev_proc_ops);
 
 
 
 
 
 
 
 
 
 
 
4362}
4363#else
4364#define dev_proc_init() 0
4365#endif	/* CONFIG_PROC_FS */
 
 
 
 
 
 
 
 
 
 
 
 
 
4366
 
 
 
 
 
 
4367
4368/**
4369 *	netdev_set_master	-	set up master pointer
4370 *	@slave: slave device
4371 *	@master: new master device
 
4372 *
4373 *	Changes the master device of the slave. Pass %NULL to break the
4374 *	bonding. The caller must hold the RTNL semaphore. On a failure
4375 *	a negative errno code is returned. On success the reference counts
4376 *	are adjusted and the function returns zero.
4377 */
4378int netdev_set_master(struct net_device *slave, struct net_device *master)
 
 
 
4379{
4380	struct net_device *old = slave->master;
4381
4382	ASSERT_RTNL();
 
 
 
 
4383
4384	if (master) {
4385		if (old)
4386			return -EBUSY;
4387		dev_hold(master);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4388	}
 
4389
4390	slave->master = master;
 
 
4391
4392	if (old)
4393		dev_put(old);
4394	return 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4395}
4396EXPORT_SYMBOL(netdev_set_master);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4397
4398/**
4399 *	netdev_set_bond_master	-	set up bonding master/slave pair
4400 *	@slave: slave device
4401 *	@master: new master device
4402 *
4403 *	Changes the master device of the slave. Pass %NULL to break the
4404 *	bonding. The caller must hold the RTNL semaphore. On a failure
4405 *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4406 *	to the routing socket and the function returns zero.
4407 */
4408int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
 
4409{
4410	int err;
 
 
4411
4412	ASSERT_RTNL();
4413
4414	err = netdev_set_master(slave, master);
4415	if (err)
4416		return err;
4417	if (master)
4418		slave->flags |= IFF_SLAVE;
4419	else
4420		slave->flags &= ~IFF_SLAVE;
4421
4422	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4423	return 0;
4424}
4425EXPORT_SYMBOL(netdev_set_bond_master);
4426
4427static void dev_change_rx_flags(struct net_device *dev, int flags)
4428{
4429	const struct net_device_ops *ops = dev->netdev_ops;
4430
4431	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4432		ops->ndo_change_rx_flags(dev, flags);
4433}
4434
4435static int __dev_set_promiscuity(struct net_device *dev, int inc)
4436{
4437	unsigned int old_flags = dev->flags;
4438	uid_t uid;
4439	gid_t gid;
4440
4441	ASSERT_RTNL();
4442
4443	dev->flags |= IFF_PROMISC;
4444	dev->promiscuity += inc;
4445	if (dev->promiscuity == 0) {
4446		/*
4447		 * Avoid overflow.
4448		 * If inc causes overflow, untouch promisc and return error.
4449		 */
4450		if (inc < 0)
4451			dev->flags &= ~IFF_PROMISC;
4452		else {
4453			dev->promiscuity -= inc;
4454			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4455				dev->name);
4456			return -EOVERFLOW;
4457		}
4458	}
4459	if (dev->flags != old_flags) {
4460		pr_info("device %s %s promiscuous mode\n",
4461			dev->name,
4462			dev->flags & IFF_PROMISC ? "entered" : "left");
4463		if (audit_enabled) {
4464			current_uid_gid(&uid, &gid);
4465			audit_log(current->audit_context, GFP_ATOMIC,
4466				AUDIT_ANOM_PROMISCUOUS,
4467				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4468				dev->name, (dev->flags & IFF_PROMISC),
4469				(old_flags & IFF_PROMISC),
4470				audit_get_loginuid(current),
4471				uid, gid,
4472				audit_get_sessionid(current));
 
4473		}
4474
4475		dev_change_rx_flags(dev, IFF_PROMISC);
4476	}
 
 
4477	return 0;
4478}
4479
4480/**
4481 *	dev_set_promiscuity	- update promiscuity count on a device
4482 *	@dev: device
4483 *	@inc: modifier
4484 *
4485 *	Add or remove promiscuity from a device. While the count in the device
4486 *	remains above zero the interface remains promiscuous. Once it hits zero
4487 *	the device reverts back to normal filtering operation. A negative inc
4488 *	value is used to drop promiscuity on the device.
4489 *	Return 0 if successful or a negative errno code on error.
4490 */
4491int dev_set_promiscuity(struct net_device *dev, int inc)
4492{
4493	unsigned int old_flags = dev->flags;
4494	int err;
4495
4496	err = __dev_set_promiscuity(dev, inc);
4497	if (err < 0)
4498		return err;
4499	if (dev->flags != old_flags)
4500		dev_set_rx_mode(dev);
4501	return err;
4502}
4503EXPORT_SYMBOL(dev_set_promiscuity);
4504
4505/**
4506 *	dev_set_allmulti	- update allmulti count on a device
4507 *	@dev: device
4508 *	@inc: modifier
4509 *
4510 *	Add or remove reception of all multicast frames to a device. While the
4511 *	count in the device remains above zero the interface remains listening
4512 *	to all interfaces. Once it hits zero the device reverts back to normal
4513 *	filtering operation. A negative @inc value is used to drop the counter
4514 *	when releasing a resource needing all multicasts.
4515 *	Return 0 if successful or a negative errno code on error.
4516 */
4517
4518int dev_set_allmulti(struct net_device *dev, int inc)
4519{
4520	unsigned int old_flags = dev->flags;
4521
4522	ASSERT_RTNL();
4523
4524	dev->flags |= IFF_ALLMULTI;
4525	dev->allmulti += inc;
4526	if (dev->allmulti == 0) {
4527		/*
4528		 * Avoid overflow.
4529		 * If inc causes overflow, untouch allmulti and return error.
4530		 */
4531		if (inc < 0)
4532			dev->flags &= ~IFF_ALLMULTI;
4533		else {
4534			dev->allmulti -= inc;
4535			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4536				dev->name);
4537			return -EOVERFLOW;
4538		}
4539	}
4540	if (dev->flags ^ old_flags) {
4541		dev_change_rx_flags(dev, IFF_ALLMULTI);
4542		dev_set_rx_mode(dev);
 
 
 
4543	}
4544	return 0;
4545}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4546EXPORT_SYMBOL(dev_set_allmulti);
4547
4548/*
4549 *	Upload unicast and multicast address lists to device and
4550 *	configure RX filtering. When the device doesn't support unicast
4551 *	filtering it is put in promiscuous mode while unicast addresses
4552 *	are present.
4553 */
4554void __dev_set_rx_mode(struct net_device *dev)
4555{
4556	const struct net_device_ops *ops = dev->netdev_ops;
4557
4558	/* dev_open will call this function so the list will stay sane. */
4559	if (!(dev->flags&IFF_UP))
4560		return;
4561
4562	if (!netif_device_present(dev))
4563		return;
4564
4565	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4566		/* Unicast addresses changes may only happen under the rtnl,
4567		 * therefore calling __dev_set_promiscuity here is safe.
4568		 */
4569		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4570			__dev_set_promiscuity(dev, 1);
4571			dev->uc_promisc = true;
4572		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4573			__dev_set_promiscuity(dev, -1);
4574			dev->uc_promisc = false;
4575		}
4576	}
4577
4578	if (ops->ndo_set_rx_mode)
4579		ops->ndo_set_rx_mode(dev);
4580}
4581
/* Run __dev_set_rx_mode() with the device address lock held (BH disabled). */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4588
4589/**
4590 *	dev_get_flags - get flags reported to userspace
4591 *	@dev: device
4592 *
4593 *	Get the combination of flag bits exported through APIs to userspace.
4594 */
4595unsigned int dev_get_flags(const struct net_device *dev)
4596{
4597	unsigned int flags;
4598
4599	flags = (dev->flags & ~(IFF_PROMISC |
4600				IFF_ALLMULTI |
4601				IFF_RUNNING |
4602				IFF_LOWER_UP |
4603				IFF_DORMANT)) |
4604		(dev->gflags & (IFF_PROMISC |
4605				IFF_ALLMULTI));
4606
4607	if (netif_running(dev)) {
4608		if (netif_oper_up(dev))
4609			flags |= IFF_RUNNING;
4610		if (netif_carrier_ok(dev))
4611			flags |= IFF_LOWER_UP;
4612		if (netif_dormant(dev))
4613			flags |= IFF_DORMANT;
4614	}
4615
4616	return flags;
4617}
4618EXPORT_SYMBOL(dev_get_flags);
4619
4620int __dev_change_flags(struct net_device *dev, unsigned int flags)
 
4621{
4622	unsigned int old_flags = dev->flags;
4623	int ret;
4624
4625	ASSERT_RTNL();
4626
4627	/*
4628	 *	Set the flags on our device.
4629	 */
4630
4631	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4632			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4633			       IFF_AUTOMEDIA)) |
4634		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4635				    IFF_ALLMULTI));
4636
4637	/*
4638	 *	Load in the correct multicast list now the flags have changed.
4639	 */
4640
4641	if ((old_flags ^ flags) & IFF_MULTICAST)
4642		dev_change_rx_flags(dev, IFF_MULTICAST);
4643
4644	dev_set_rx_mode(dev);
4645
4646	/*
4647	 *	Have we downed the interface. We handle IFF_UP ourselves
4648	 *	according to user attempts to set it, rather than blindly
4649	 *	setting it.
4650	 */
4651
4652	ret = 0;
4653	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4654		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4655
4656		if (!ret)
4657			dev_set_rx_mode(dev);
4658	}
4659
4660	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4661		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 
4662
4663		dev->gflags ^= IFF_PROMISC;
4664		dev_set_promiscuity(dev, inc);
 
 
 
4665	}
4666
4667	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4668	   is important. Some (broken) drivers set IFF_PROMISC, when
4669	   IFF_ALLMULTI is requested not asking us and not reporting.
4670	 */
4671	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4672		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4673
4674		dev->gflags ^= IFF_ALLMULTI;
4675		dev_set_allmulti(dev, inc);
4676	}
4677
4678	return ret;
4679}
4680
4681void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
 
4682{
4683	unsigned int changes = dev->flags ^ old_flags;
4684
 
 
 
4685	if (changes & IFF_UP) {
4686		if (dev->flags & IFF_UP)
4687			call_netdevice_notifiers(NETDEV_UP, dev);
4688		else
4689			call_netdevice_notifiers(NETDEV_DOWN, dev);
4690	}
4691
4692	if (dev->flags & IFF_UP &&
4693	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4694		call_netdevice_notifiers(NETDEV_CHANGE, dev);
 
 
 
 
 
 
 
 
4695}
4696
4697/**
4698 *	dev_change_flags - change device settings
4699 *	@dev: device
4700 *	@flags: device state flags
 
4701 *
4702 *	Change settings on device based state flags. The flags are
4703 *	in the userspace exported format.
4704 */
4705int dev_change_flags(struct net_device *dev, unsigned int flags)
 
4706{
4707	int ret;
4708	unsigned int changes, old_flags = dev->flags;
4709
4710	ret = __dev_change_flags(dev, flags);
4711	if (ret < 0)
4712		return ret;
4713
4714	changes = old_flags ^ dev->flags;
4715	if (changes)
4716		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4717
4718	__dev_notify_flags(dev, old_flags);
4719	return ret;
4720}
4721EXPORT_SYMBOL(dev_change_flags);
4722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4723/**
4724 *	dev_set_mtu - Change maximum transfer unit
4725 *	@dev: device
4726 *	@new_mtu: new transfer unit
 
4727 *
4728 *	Change the maximum transfer size of the network device.
4729 */
4730int dev_set_mtu(struct net_device *dev, int new_mtu)
 
4731{
4732	const struct net_device_ops *ops = dev->netdev_ops;
4733	int err;
4734
4735	if (new_mtu == dev->mtu)
4736		return 0;
4737
4738	/*	MTU must be positive.	 */
4739	if (new_mtu < 0)
4740		return -EINVAL;
4741
4742	if (!netif_device_present(dev))
4743		return -ENODEV;
4744
4745	err = 0;
4746	if (ops->ndo_change_mtu)
4747		err = ops->ndo_change_mtu(dev, new_mtu);
4748	else
4749		dev->mtu = new_mtu;
 
 
4750
4751	if (!err && dev->flags & IFF_UP)
4752		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4753	return err;
4754}
4755EXPORT_SYMBOL(dev_set_mtu);
4756
4757/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4758 *	dev_set_group - Change group this device belongs to
4759 *	@dev: device
4760 *	@new_group: group this device should belong to
4761 */
4762void dev_set_group(struct net_device *dev, int new_group)
4763{
4764	dev->group = new_group;
4765}
4766EXPORT_SYMBOL(dev_set_group);
4767
4768/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4769 *	dev_set_mac_address - Change Media Access Control Address
4770 *	@dev: device
4771 *	@sa: new address
 
4772 *
4773 *	Change the hardware (MAC) address of the device
4774 */
4775int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 
4776{
4777	const struct net_device_ops *ops = dev->netdev_ops;
4778	int err;
4779
4780	if (!ops->ndo_set_mac_address)
4781		return -EOPNOTSUPP;
4782	if (sa->sa_family != dev->type)
4783		return -EINVAL;
4784	if (!netif_device_present(dev))
4785		return -ENODEV;
 
 
 
4786	err = ops->ndo_set_mac_address(dev, sa);
4787	if (!err)
4788		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 
 
4789	add_device_randomness(dev->dev_addr, dev->addr_len);
4790	return err;
4791}
4792EXPORT_SYMBOL(dev_set_mac_address);
4793
4794/*
4795 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
 
 
 
 
4796 */
4797static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4798{
4799	int err;
4800	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4801
4802	if (!dev)
 
 
4803		return -ENODEV;
 
 
 
4804
4805	switch (cmd) {
4806	case SIOCGIFFLAGS:	/* Get interface flags */
4807		ifr->ifr_flags = (short) dev_get_flags(dev);
4808		return 0;
4809
4810	case SIOCGIFMETRIC:	/* Get the metric on the interface
4811				   (currently unused) */
4812		ifr->ifr_metric = 0;
4813		return 0;
 
 
4814
4815	case SIOCGIFMTU:	/* Get the MTU of a device */
4816		ifr->ifr_mtu = dev->mtu;
4817		return 0;
 
 
4818
4819	case SIOCGIFHWADDR:
4820		if (!dev->addr_len)
4821			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4822		else
4823			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4824			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4825		ifr->ifr_hwaddr.sa_family = dev->type;
4826		return 0;
 
 
 
 
 
4827
4828	case SIOCGIFSLAVE:
4829		err = -EINVAL;
4830		break;
 
 
 
 
 
4831
4832	case SIOCGIFMAP:
4833		ifr->ifr_map.mem_start = dev->mem_start;
4834		ifr->ifr_map.mem_end   = dev->mem_end;
4835		ifr->ifr_map.base_addr = dev->base_addr;
4836		ifr->ifr_map.irq       = dev->irq;
4837		ifr->ifr_map.dma       = dev->dma;
4838		ifr->ifr_map.port      = dev->if_port;
4839		return 0;
 
 
 
 
 
 
 
 
 
4840
4841	case SIOCGIFINDEX:
4842		ifr->ifr_ifindex = dev->ifindex;
4843		return 0;
 
 
4844
4845	case SIOCGIFTXQLEN:
4846		ifr->ifr_qlen = dev->tx_queue_len;
4847		return 0;
4848
4849	default:
4850		/* dev_ioctl() should ensure this case
4851		 * is never reached
4852		 */
4853		WARN_ON(1);
4854		err = -ENOTTY;
4855		break;
4856
 
 
 
 
 
 
 
 
4857	}
 
4858	return err;
4859}
 
4860
4861/*
4862 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
 
 
 
4863 */
4864static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4865{
4866	int err;
4867	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4868	const struct net_device_ops *ops;
4869
4870	if (!dev)
4871		return -ENODEV;
 
4872
4873	ops = dev->netdev_ops;
 
 
4874
4875	switch (cmd) {
4876	case SIOCSIFFLAGS:	/* Set interface flags */
4877		return dev_change_flags(dev, ifr->ifr_flags);
 
 
 
 
 
 
 
 
4878
4879	case SIOCSIFMETRIC:	/* Set the metric on the interface
4880				   (currently unused) */
4881		return -EOPNOTSUPP;
 
 
 
 
 
4882
4883	case SIOCSIFMTU:	/* Set the MTU of a device */
4884		return dev_set_mtu(dev, ifr->ifr_mtu);
4885
4886	case SIOCSIFHWADDR:
4887		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
 
 
 
 
 
 
 
 
 
 
 
 
 
4888
4889	case SIOCSIFHWBROADCAST:
4890		if (ifr->ifr_hwaddr.sa_family != dev->type)
4891			return -EINVAL;
4892		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4893		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4894		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4895		return 0;
 
 
 
 
4896
4897	case SIOCSIFMAP:
4898		if (ops->ndo_set_config) {
4899			if (!netif_device_present(dev))
4900				return -ENODEV;
4901			return ops->ndo_set_config(dev, &ifr->ifr_map);
 
 
 
4902		}
4903		return -EOPNOTSUPP;
 
 
4904
4905	case SIOCADDMULTI:
4906		if (!ops->ndo_set_rx_mode ||
4907		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4908			return -EINVAL;
4909		if (!netif_device_present(dev))
4910			return -ENODEV;
4911		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4912
4913	case SIOCDELMULTI:
4914		if (!ops->ndo_set_rx_mode ||
4915		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4916			return -EINVAL;
4917		if (!netif_device_present(dev))
4918			return -ENODEV;
4919		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4920
4921	case SIOCSIFTXQLEN:
4922		if (ifr->ifr_qlen < 0)
4923			return -EINVAL;
4924		dev->tx_queue_len = ifr->ifr_qlen;
4925		return 0;
4926
4927	case SIOCSIFNAME:
4928		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4929		return dev_change_name(dev, ifr->ifr_newname);
 
4930
4931	case SIOCSHWTSTAMP:
4932		err = net_hwtstamp_validate(ifr);
4933		if (err)
4934			return err;
4935		/* fall through */
4936
4937	/*
4938	 *	Unknown or private ioctl
4939	 */
4940	default:
4941		if ((cmd >= SIOCDEVPRIVATE &&
4942		    cmd <= SIOCDEVPRIVATE + 15) ||
4943		    cmd == SIOCBONDENSLAVE ||
4944		    cmd == SIOCBONDRELEASE ||
4945		    cmd == SIOCBONDSETHWADDR ||
4946		    cmd == SIOCBONDSLAVEINFOQUERY ||
4947		    cmd == SIOCBONDINFOQUERY ||
4948		    cmd == SIOCBONDCHANGEACTIVE ||
4949		    cmd == SIOCGMIIPHY ||
4950		    cmd == SIOCGMIIREG ||
4951		    cmd == SIOCSMIIREG ||
4952		    cmd == SIOCBRADDIF ||
4953		    cmd == SIOCBRDELIF ||
4954		    cmd == SIOCSHWTSTAMP ||
4955		    cmd == SIOCWANDEV) {
4956			err = -EOPNOTSUPP;
4957			if (ops->ndo_do_ioctl) {
4958				if (netif_device_present(dev))
4959					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4960				else
4961					err = -ENODEV;
4962			}
4963		} else
4964			err = -EINVAL;
4965
4966	}
4967	return err;
4968}
4969
4970/*
4971 *	This function handles all "interface"-type I/O control requests. The actual
4972 *	'doing' part of this is dev_ifsioc above.
4973 */
 
 
4974
4975/**
4976 *	dev_ioctl	-	network device ioctl
4977 *	@net: the applicable net namespace
4978 *	@cmd: command to issue
4979 *	@arg: pointer to a struct ifreq in user space
4980 *
4981 *	Issue ioctl functions to devices. This is normally called by the
4982 *	user space syscall interfaces but can sometimes be useful for
4983 *	other purposes. The return value is the return from the syscall if
4984 *	positive or a negative errno code on error.
4985 */
4986
4987int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 
 
4988{
4989	struct ifreq ifr;
4990	int ret;
4991	char *colon;
4992
4993	/* One special case: SIOCGIFCONF takes ifconf argument
4994	   and requires shared lock, because it sleeps writing
4995	   to user space.
4996	 */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4997
4998	if (cmd == SIOCGIFCONF) {
4999		rtnl_lock();
5000		ret = dev_ifconf(net, (char __user *) arg);
5001		rtnl_unlock();
5002		return ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5003	}
5004	if (cmd == SIOCGIFNAME)
5005		return dev_ifname(net, (struct ifreq __user *)arg);
5006
5007	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5008		return -EFAULT;
 
 
 
 
 
 
5009
5010	ifr.ifr_name[IFNAMSIZ-1] = 0;
5011
5012	colon = strchr(ifr.ifr_name, ':');
5013	if (colon)
5014		*colon = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5015
5016	/*
5017	 *	See which interface the caller is talking about.
5018	 */
 
 
 
5019
5020	switch (cmd) {
5021	/*
5022	 *	These ioctl calls:
5023	 *	- can be done by all.
5024	 *	- atomic and do not require locking.
5025	 *	- return a value
5026	 */
5027	case SIOCGIFFLAGS:
5028	case SIOCGIFMETRIC:
5029	case SIOCGIFMTU:
5030	case SIOCGIFHWADDR:
5031	case SIOCGIFSLAVE:
5032	case SIOCGIFMAP:
5033	case SIOCGIFINDEX:
5034	case SIOCGIFTXQLEN:
5035		dev_load(net, ifr.ifr_name);
5036		rcu_read_lock();
5037		ret = dev_ifsioc_locked(net, &ifr, cmd);
5038		rcu_read_unlock();
5039		if (!ret) {
5040			if (colon)
5041				*colon = ':';
5042			if (copy_to_user(arg, &ifr,
5043					 sizeof(struct ifreq)))
5044				ret = -EFAULT;
5045		}
5046		return ret;
5047
5048	case SIOCETHTOOL:
5049		dev_load(net, ifr.ifr_name);
5050		rtnl_lock();
5051		ret = dev_ethtool(net, &ifr);
5052		rtnl_unlock();
5053		if (!ret) {
5054			if (colon)
5055				*colon = ':';
5056			if (copy_to_user(arg, &ifr,
5057					 sizeof(struct ifreq)))
5058				ret = -EFAULT;
 
5059		}
5060		return ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5061
5062	/*
5063	 *	These ioctl calls:
5064	 *	- require superuser power.
5065	 *	- require strict serialization.
5066	 *	- return a value
5067	 */
5068	case SIOCGMIIPHY:
5069	case SIOCGMIIREG:
5070	case SIOCSIFNAME:
5071		if (!capable(CAP_NET_ADMIN))
5072			return -EPERM;
5073		dev_load(net, ifr.ifr_name);
5074		rtnl_lock();
5075		ret = dev_ifsioc(net, &ifr, cmd);
5076		rtnl_unlock();
5077		if (!ret) {
5078			if (colon)
5079				*colon = ':';
5080			if (copy_to_user(arg, &ifr,
5081					 sizeof(struct ifreq)))
5082				ret = -EFAULT;
5083		}
5084		return ret;
5085
5086	/*
5087	 *	These ioctl calls:
5088	 *	- require superuser power.
5089	 *	- require strict serialization.
5090	 *	- do not return a value
5091	 */
5092	case SIOCSIFFLAGS:
5093	case SIOCSIFMETRIC:
5094	case SIOCSIFMTU:
5095	case SIOCSIFMAP:
5096	case SIOCSIFHWADDR:
5097	case SIOCSIFSLAVE:
5098	case SIOCADDMULTI:
5099	case SIOCDELMULTI:
5100	case SIOCSIFHWBROADCAST:
5101	case SIOCSIFTXQLEN:
5102	case SIOCSMIIREG:
5103	case SIOCBONDENSLAVE:
5104	case SIOCBONDRELEASE:
5105	case SIOCBONDSETHWADDR:
5106	case SIOCBONDCHANGEACTIVE:
5107	case SIOCBRADDIF:
5108	case SIOCBRDELIF:
5109	case SIOCSHWTSTAMP:
5110		if (!capable(CAP_NET_ADMIN))
5111			return -EPERM;
5112		/* fall through */
5113	case SIOCBONDSLAVEINFOQUERY:
5114	case SIOCBONDINFOQUERY:
5115		dev_load(net, ifr.ifr_name);
5116		rtnl_lock();
5117		ret = dev_ifsioc(net, &ifr, cmd);
5118		rtnl_unlock();
5119		return ret;
5120
5121	case SIOCGIFMEM:
5122		/* Get the per device memory space. We can add this but
5123		 * currently do not support it */
5124	case SIOCSIFMEM:
5125		/* Set the per device memory buffer space.
5126		 * Not applicable in our case */
5127	case SIOCSIFLINK:
5128		return -ENOTTY;
5129
5130	/*
5131	 *	Unknown or private ioctl.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5132	 */
5133	default:
5134		if (cmd == SIOCWANDEV ||
5135		    (cmd >= SIOCDEVPRIVATE &&
5136		     cmd <= SIOCDEVPRIVATE + 15)) {
5137			dev_load(net, ifr.ifr_name);
5138			rtnl_lock();
5139			ret = dev_ifsioc(net, &ifr, cmd);
5140			rtnl_unlock();
5141			if (!ret && copy_to_user(arg, &ifr,
5142						 sizeof(struct ifreq)))
5143				ret = -EFAULT;
5144			return ret;
5145		}
5146		/* Take care of Wireless Extensions */
5147		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5148			return wext_handle_ioctl(net, &ifr, cmd, arg);
5149		return -ENOTTY;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5150	}
 
 
 
 
 
 
 
 
 
5151}
5152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5153
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.  Candidates come from
 *	a single static counter that restarts at 1 once it is no longer
 *	positive; values already in use in @net are skipped.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5172
5173/* Delayed registration/unregisteration */
5174static LIST_HEAD(net_todo_list);
 
5175
5176static void net_set_todo(struct net_device *dev)
5177{
5178	list_add_tail(&dev->todo_list, &net_todo_list);
 
5179}
5180
5181static void rollback_registered_many(struct list_head *head)
5182{
5183	struct net_device *dev, *tmp;
 
5184
5185	BUG_ON(dev_boot_phase);
5186	ASSERT_RTNL();
5187
5188	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5189		/* Some devices call without registering
5190		 * for initialization unwind. Remove those
5191		 * devices and proceed with the remaining.
5192		 */
5193		if (dev->reg_state == NETREG_UNINITIALIZED) {
5194			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5195				 dev->name, dev);
5196
5197			WARN_ON(1);
5198			list_del(&dev->unreg_list);
5199			continue;
5200		}
5201		dev->dismantle = true;
5202		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5203	}
5204
5205	/* If device is running, close it first. */
5206	dev_close_many(head);
 
 
5207
5208	list_for_each_entry(dev, head, unreg_list) {
5209		/* And unlink it from device chain. */
5210		unlist_netdevice(dev);
5211
5212		dev->reg_state = NETREG_UNREGISTERING;
5213	}
 
5214
5215	synchronize_net();
5216
5217	list_for_each_entry(dev, head, unreg_list) {
 
 
5218		/* Shutdown queueing discipline. */
5219		dev_shutdown(dev);
5220
 
5221
5222		/* Notify protocols, that we are about to destroy
5223		   this device. They should clean all the things.
5224		*/
5225		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5226
5227		if (!dev->rtnl_link_ops ||
5228		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5229			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
 
5230
5231		/*
5232		 *	Flush the unicast and multicast chains
5233		 */
5234		dev_uc_flush(dev);
5235		dev_mc_flush(dev);
5236
 
 
 
5237		if (dev->netdev_ops->ndo_uninit)
5238			dev->netdev_ops->ndo_uninit(dev);
5239
5240		/* Notifier chain MUST detach us from master device. */
5241		WARN_ON(dev->master);
 
 
 
 
5242
5243		/* Remove entries from kobject tree */
5244		netdev_unregister_kobject(dev);
 
 
 
 
5245	}
5246
5247	/* Process any work delayed until the end of the batch */
5248	dev = list_first_entry(head, struct net_device, unreg_list);
5249	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5250
5251	synchronize_net();
5252
5253	list_for_each_entry(dev, head, unreg_list)
5254		dev_put(dev);
5255}
5256
5257static void rollback_registered(struct net_device *dev)
5258{
5259	LIST_HEAD(single);
5260
5261	list_add(&dev->unreg_list, &single);
5262	rollback_registered_many(&single);
5263	list_del(&single);
5264}
5265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5266static netdev_features_t netdev_fix_features(struct net_device *dev,
5267	netdev_features_t features)
5268{
5269	/* Fix illegal checksum combinations */
5270	if ((features & NETIF_F_HW_CSUM) &&
5271	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5272		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5273		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5274	}
5275
5276	/* Fix illegal SG+CSUM combinations. */
5277	if ((features & NETIF_F_SG) &&
5278	    !(features & NETIF_F_ALL_CSUM)) {
5279		netdev_dbg(dev,
5280			"Dropping NETIF_F_SG since no checksum feature.\n");
5281		features &= ~NETIF_F_SG;
5282	}
5283
5284	/* TSO requires that SG is present as well. */
5285	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5286		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5287		features &= ~NETIF_F_ALL_TSO;
5288	}
5289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5290	/* TSO ECN requires that TSO is present as well. */
5291	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5292		features &= ~NETIF_F_TSO_ECN;
5293
5294	/* Software GSO depends on SG. */
5295	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5296		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5297		features &= ~NETIF_F_GSO;
5298	}
5299
5300	/* UFO needs SG and checksumming */
5301	if (features & NETIF_F_UFO) {
5302		/* maybe split UFO into V4 and V6? */
5303		if (!((features & NETIF_F_GEN_CSUM) ||
5304		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5305			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5306			netdev_dbg(dev,
5307				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5308			features &= ~NETIF_F_UFO;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5309		}
5310
5311		if (!(features & NETIF_F_SG)) {
5312			netdev_dbg(dev,
5313				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5314			features &= ~NETIF_F_UFO;
5315		}
5316	}
5317
5318	return features;
5319}
5320
5321int __netdev_update_features(struct net_device *dev)
5322{
 
5323	netdev_features_t features;
5324	int err = 0;
 
5325
5326	ASSERT_RTNL();
5327
5328	features = netdev_get_wanted_features(dev);
5329
5330	if (dev->netdev_ops->ndo_fix_features)
5331		features = dev->netdev_ops->ndo_fix_features(dev, features);
5332
5333	/* driver might be less strict about feature dependencies */
5334	features = netdev_fix_features(dev, features);
5335
 
 
 
 
5336	if (dev->features == features)
5337		return 0;
5338
5339	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5340		&dev->features, &features);
5341
5342	if (dev->netdev_ops->ndo_set_features)
5343		err = dev->netdev_ops->ndo_set_features(dev, features);
 
 
5344
5345	if (unlikely(err < 0)) {
5346		netdev_err(dev,
5347			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5348			err, &features, &dev->features);
 
 
 
5349		return -1;
5350	}
5351
5352	if (!err)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5353		dev->features = features;
 
5354
5355	return 1;
5356}
5357
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculates the dev->features set and calls
 *	netdev_features_change() whenever __netdev_update_features()
 *	returns a non-zero value.  Should be called after driver or
 *	hardware dependent conditions that influence the features
 *	might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5372
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculates the dev->features set and always calls
 *	netdev_features_change(), whether or not the set changed.
 *	Use this instead of netdev_update_features() when
 *	dev->vlan_features might also have changed, so that the change
 *	can be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: the notification is sent
	 * unconditionally.
	 */
	(void)__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5389
5390/**
5391 *	netif_stacked_transfer_operstate -	transfer operstate
5392 *	@rootdev: the root or lower level device to transfer state from
5393 *	@dev: the device to transfer operstate to
5394 *
5395 *	Transfer operational state from root to device. This is normally
5396 *	called when a stacking relationship exists between the root
5397 *	device and the device(a leaf device).
5398 */
5399void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5400					struct net_device *dev)
5401{
5402	if (rootdev->operstate == IF_OPER_DORMANT)
5403		netif_dormant_on(dev);
5404	else
5405		netif_dormant_off(dev);
5406
5407	if (netif_carrier_ok(rootdev)) {
5408		if (!netif_carrier_ok(dev))
5409			netif_carrier_on(dev);
5410	} else {
5411		if (netif_carrier_ok(dev))
5412			netif_carrier_off(dev);
5413	}
 
 
5414}
5415EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5416
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int count = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(!count);

	queues = kcalloc(count, sizeof(*queues), GFP_KERNEL);
	if (!queues) {
		pr_err("netdev: Unable to allocate %u rx queues\n", count);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (q = queues; q < queues + count; q++)
		q->dev = dev;

	return 0;
}
#endif
5437
5438static void netdev_init_one_queue(struct net_device *dev,
5439				  struct netdev_queue *queue, void *_unused)
5440{
5441	/* Initialize queue lock */
5442	spin_lock_init(&queue->_xmit_lock);
5443	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5444	queue->xmit_lock_owner = -1;
5445	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5446	queue->dev = dev;
5447#ifdef CONFIG_BQL
5448	dql_init(&queue->dql, HZ);
5449#endif
5450}
5451
 
 
 
 
 
5452static int netif_alloc_netdev_queues(struct net_device *dev)
5453{
5454	unsigned int count = dev->num_tx_queues;
5455	struct netdev_queue *tx;
 
5456
5457	BUG_ON(count < 1);
 
5458
5459	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5460	if (!tx) {
5461		pr_err("netdev: Unable to allocate %u tx queues\n", count);
5462		return -ENOMEM;
5463	}
5464	dev->_tx = tx;
5465
5466	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5467	spin_lock_init(&dev->tx_global_lock);
5468
5469	return 0;
5470}
5471
 
 
 
 
 
 
 
 
 
 
 
 
5472/**
5473 *	register_netdevice	- register a network device
5474 *	@dev: device to register
5475 *
5476 *	Take a completed network device structure and add it to the kernel
5477 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5478 *	chain. 0 is returned on success. A negative errno code is returned
5479 *	on a failure to set up the device, or if the name is a duplicate.
5480 *
5481 *	Callers must hold the rtnl semaphore. You may want
5482 *	register_netdev() instead of this.
5483 *
5484 *	BUGS:
5485 *	The locking appears insufficient to guarantee two parallel registers
5486 *	will not get the same name.
5487 */
5488
5489int register_netdevice(struct net_device *dev)
5490{
5491	int ret;
5492	struct net *net = dev_net(dev);
5493
 
 
5494	BUG_ON(dev_boot_phase);
5495	ASSERT_RTNL();
5496
5497	might_sleep();
5498
5499	/* When net_device's are persistent, this will be fatal. */
5500	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5501	BUG_ON(!net);
5502
 
 
 
 
5503	spin_lock_init(&dev->addr_list_lock);
5504	netdev_set_addr_lockdep_class(dev);
5505
5506	dev->iflink = -1;
5507
5508	ret = dev_get_valid_name(dev, dev->name);
5509	if (ret < 0)
5510		goto out;
5511
 
 
 
 
 
5512	/* Init, if this function is available */
5513	if (dev->netdev_ops->ndo_init) {
5514		ret = dev->netdev_ops->ndo_init(dev);
5515		if (ret) {
5516			if (ret > 0)
5517				ret = -EIO;
5518			goto out;
5519		}
5520	}
5521
5522	dev->ifindex = dev_new_index(net);
5523	if (dev->iflink == -1)
5524		dev->iflink = dev->ifindex;
 
 
 
 
 
 
 
 
 
 
 
5525
5526	/* Transfer changeable features to wanted_features and enable
5527	 * software offloads (GSO and GRO).
5528	 */
5529	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5530	dev->features |= NETIF_F_SOFT_FEATURES;
 
 
 
 
 
 
5531	dev->wanted_features = dev->features & dev->hw_features;
5532
5533	/* Turn on no cache copy if HW is doing checksum */
5534	if (!(dev->flags & IFF_LOOPBACK)) {
5535		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5536		if (dev->features & NETIF_F_ALL_CSUM) {
5537			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5538			dev->features |= NETIF_F_NOCACHE_COPY;
5539		}
5540	}
 
 
 
 
 
 
 
 
 
5541
5542	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5543	 */
5544	dev->vlan_features |= NETIF_F_HIGHDMA;
5545
 
 
 
 
 
 
 
 
5546	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5547	ret = notifier_to_errno(ret);
5548	if (ret)
5549		goto err_uninit;
5550
5551	ret = netdev_register_kobject(dev);
5552	if (ret)
 
5553		goto err_uninit;
 
5554	dev->reg_state = NETREG_REGISTERED;
5555
5556	__netdev_update_features(dev);
5557
5558	/*
5559	 *	Default initial state at registry is that the
5560	 *	device is present.
5561	 */
5562
5563	set_bit(__LINK_STATE_PRESENT, &dev->state);
5564
 
 
5565	dev_init_scheduler(dev);
5566	dev_hold(dev);
5567	list_netdevice(dev);
5568	add_device_randomness(dev->dev_addr, dev->addr_len);
5569
 
 
 
 
 
 
 
5570	/* Notify protocols, that a new device appeared. */
5571	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5572	ret = notifier_to_errno(ret);
5573	if (ret) {
5574		rollback_registered(dev);
 
 
5575		dev->reg_state = NETREG_UNREGISTERED;
 
 
 
 
 
 
 
5576	}
5577	/*
5578	 *	Prevent userspace races by waiting until the network
5579	 *	device is fully setup before sending notifications.
5580	 */
5581	if (!dev->rtnl_link_ops ||
5582	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5583		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5584
5585out:
5586	return ret;
5587
5588err_uninit:
5589	if (dev->netdev_ops->ndo_uninit)
5590		dev->netdev_ops->ndo_uninit(dev);
 
 
 
 
5591	goto out;
5592}
5593EXPORT_SYMBOL(register_netdevice);
5594
5595/**
5596 *	init_dummy_netdev	- init a dummy network device for NAPI
5597 *	@dev: device to init
5598 *
5599 *	This takes a network device structure and initialize the minimum
5600 *	amount of fields so it can be used to schedule NAPI polls without
5601 *	registering a full blown interface. This is to be used by drivers
5602 *	that need to tie several hardware interfaces to a single NAPI
5603 *	poll scheduler due to HW limitations.
5604 */
5605int init_dummy_netdev(struct net_device *dev)
5606{
5607	/* Clear everything. Note we don't initialize spinlocks
5608	 * are they aren't supposed to be taken by any of the
5609	 * NAPI code and this dummy netdev is supposed to be
5610	 * only ever used for NAPI polls
5611	 */
5612	memset(dev, 0, sizeof(struct net_device));
5613
5614	/* make sure we BUG if trying to hit standard
5615	 * register/unregister code path
5616	 */
5617	dev->reg_state = NETREG_DUMMY;
5618
5619	/* NAPI wants this */
5620	INIT_LIST_HEAD(&dev->napi_list);
5621
5622	/* a dummy interface is started by default */
5623	set_bit(__LINK_STATE_PRESENT, &dev->state);
5624	set_bit(__LINK_STATE_START, &dev->state);
5625
 
 
 
5626	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5627	 * because users of this 'device' dont need to change
5628	 * its refcount.
5629	 */
5630
5631	return 0;
5632}
5633EXPORT_SYMBOL_GPL(init_dummy_netdev);
5634
5635
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore for the duration of the call; the device name is
 *	validated inside register_netdevice().
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
5659
5660int netdev_refcnt_read(const struct net_device *dev)
5661{
5662	int i, refcnt = 0;
5663
5664	for_each_possible_cpu(i)
5665		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5666	return refcnt;
5667}
5668EXPORT_SYMBOL(netdev_refcnt_read);
5669
5670/*
5671 * netdev_wait_allrefs - wait until all references are gone.
 
5672 *
5673 * This is called when unregistering network devices.
5674 *
5675 * Any protocol or device that holds a reference should register
5676 * for netdevice notification, and cleanup and put back the
5677 * reference if they receive an UNREGISTER event.
5678 * We can get stuck here if buggy protocols don't correctly
5679 * call dev_put.
5680 */
5681static void netdev_wait_allrefs(struct net_device *dev)
5682{
5683	unsigned long rebroadcast_time, warning_time;
5684	int refcnt;
5685
5686	linkwatch_forget_dev(dev);
5687
5688	rebroadcast_time = warning_time = jiffies;
5689	refcnt = netdev_refcnt_read(dev);
5690
5691	while (refcnt != 0) {
5692		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5693			rtnl_lock();
5694
5695			/* Rebroadcast unregister notification */
5696			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5697			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5698			 * should have already handle it the first time */
 
 
5699
5700			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5701				     &dev->state)) {
5702				/* We must not have linkwatch events
5703				 * pending on unregister. If this
5704				 * happens, we simply run the queue
5705				 * unscheduled, resulting in a noop
5706				 * for this device.
5707				 */
5708				linkwatch_run_queue();
5709			}
5710
5711			__rtnl_unlock();
5712
5713			rebroadcast_time = jiffies;
5714		}
5715
5716		msleep(250);
5717
5718		refcnt = netdev_refcnt_read(dev);
5719
5720		if (time_after(jiffies, warning_time + 10 * HZ)) {
5721			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5722				 dev->name, refcnt);
5723			warning_time = jiffies;
5724		}
5725	}
5726}
5727
5728/* The sequence is:
5729 *
5730 *	rtnl_lock();
5731 *	...
5732 *	register_netdevice(x1);
5733 *	register_netdevice(x2);
5734 *	...
5735 *	unregister_netdevice(y1);
5736 *	unregister_netdevice(y2);
5737 *      ...
5738 *	rtnl_unlock();
5739 *	free_netdev(y1);
5740 *	free_netdev(y2);
5741 *
5742 * We are invoked by rtnl_unlock().
5743 * This allows us to deal with problems:
5744 * 1) We can delete sysfs objects which invoke hotplug
5745 *    without deadlocking with linkwatch via keventd.
5746 * 2) Since we run with the RTNL semaphore not held, we can sleep
5747 *    safely in order to wait for the netdev refcnt to drop to zero.
5748 *
5749 * We must not return until all unregister events added during
5750 * the interval the lock was held have been completed.
5751 */
5752void netdev_run_todo(void)
5753{
5754	struct list_head list;
 
 
 
 
 
 
 
 
 
 
 
 
 
5755
5756	/* Snapshot list, allow later requests */
5757	list_replace_init(&net_todo_list, &list);
5758
5759	__rtnl_unlock();
5760
5761	/* Wait for rcu callbacks to finish before attempting to drain
5762	 * the device list.  This usually avoids a 250ms wait.
5763	 */
5764	if (!list_empty(&list))
5765		rcu_barrier();
5766
5767	while (!list_empty(&list)) {
5768		struct net_device *dev
5769			= list_first_entry(&list, struct net_device, todo_list);
5770		list_del(&dev->todo_list);
5771
5772		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5773			pr_err("network todo '%s' but state %d\n",
5774			       dev->name, dev->reg_state);
5775			dump_stack();
5776			continue;
5777		}
5778
5779		dev->reg_state = NETREG_UNREGISTERED;
5780
5781		on_each_cpu(flush_backlog, dev, 1);
5782
5783		netdev_wait_allrefs(dev);
5784
5785		/* paranoia */
5786		BUG_ON(netdev_refcnt_read(dev));
 
 
5787		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5788		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 
5789		WARN_ON(dev->dn_ptr);
 
 
 
 
 
5790
5791		if (dev->destructor)
5792			dev->destructor(dev);
 
 
 
5793
5794		/* Free network device */
5795		kobject_put(&dev->dev.kobj);
5796	}
5797}
5798
5799/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5800 * fields in the same order, with only the type differing.
 
 
5801 */
5802void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5803			     const struct net_device_stats *netdev_stats)
5804{
5805#if BITS_PER_LONG == 64
5806	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5807	memcpy(stats64, netdev_stats, sizeof(*stats64));
 
 
 
5808#else
5809	size_t i, n = sizeof(*stats64) / sizeof(u64);
5810	const unsigned long *src = (const unsigned long *)netdev_stats;
5811	u64 *dst = (u64 *)stats64;
5812
5813	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5814		     sizeof(*stats64) / sizeof(u64));
5815	for (i = 0; i < n; i++)
5816		dst[i] = src[i];
 
 
 
5817#endif
5818}
5819EXPORT_SYMBOL(netdev_stats_to_stats64);
5820
5821/**
5822 *	dev_get_stats	- get network device statistics
5823 *	@dev: device to get statistics from
5824 *	@storage: place to store stats
5825 *
5826 *	Get network statistics from device. Return @storage.
5827 *	The device driver may provide its own method by setting
5828 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5829 *	otherwise the internal statistics structure is used.
5830 */
5831struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5832					struct rtnl_link_stats64 *storage)
5833{
5834	const struct net_device_ops *ops = dev->netdev_ops;
5835
5836	if (ops->ndo_get_stats64) {
5837		memset(storage, 0, sizeof(*storage));
5838		ops->ndo_get_stats64(dev, storage);
5839	} else if (ops->ndo_get_stats) {
5840		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5841	} else {
5842		netdev_stats_to_stats64(storage, &dev->stats);
5843	}
5844	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
 
 
5845	return storage;
5846}
5847EXPORT_SYMBOL(dev_get_stats);
5848
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published through
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5866
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5867/**
5868 *	alloc_netdev_mqs - allocate network device
5869 *	@sizeof_priv:	size of private data to allocate space for
5870 *	@name:		device name format string
5871 *	@setup:		callback to initialize device
5872 *	@txqs:		the number of TX subqueues to allocate
5873 *	@rxqs:		the number of RX subqueues to allocate
5874 *
5875 *	Allocates a struct net_device with private data area for driver use
5876 *	and performs basic initialization.  Also allocates subquue structs
5877 *	for each queue on the device.
 
5878 */
5879struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
5880		void (*setup)(struct net_device *),
5881		unsigned int txqs, unsigned int rxqs)
5882{
5883	struct net_device *dev;
5884	size_t alloc_size;
5885	struct net_device *p;
5886
5887	BUG_ON(strlen(name) >= sizeof(dev->name));
5888
5889	if (txqs < 1) {
5890		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5891		return NULL;
5892	}
5893
5894#ifdef CONFIG_RPS
5895	if (rxqs < 1) {
5896		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5897		return NULL;
5898	}
5899#endif
5900
5901	alloc_size = sizeof(struct net_device);
5902	if (sizeof_priv) {
5903		/* ensure 32-byte alignment of private area */
5904		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5905		alloc_size += sizeof_priv;
5906	}
5907	/* ensure 32-byte alignment of whole construct */
5908	alloc_size += NETDEV_ALIGN - 1;
5909
5910	p = kzalloc(alloc_size, GFP_KERNEL);
5911	if (!p) {
5912		pr_err("alloc_netdev: Unable to allocate device\n");
5913		return NULL;
5914	}
5915
5916	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5917	dev->padded = (char *)dev - (char *)p;
5918
5919	dev->pcpu_refcnt = alloc_percpu(int);
5920	if (!dev->pcpu_refcnt)
5921		goto free_p;
5922
5923	if (dev_addr_init(dev))
5924		goto free_pcpu;
5925
5926	dev_mc_init(dev);
5927	dev_uc_init(dev);
5928
5929	dev_net_set(dev, &init_net);
5930
5931	dev->gso_max_size = GSO_MAX_SIZE;
5932	dev->gso_max_segs = GSO_MAX_SEGS;
 
 
 
 
 
 
5933
5934	INIT_LIST_HEAD(&dev->napi_list);
5935	INIT_LIST_HEAD(&dev->unreg_list);
 
5936	INIT_LIST_HEAD(&dev->link_watch_list);
5937	dev->priv_flags = IFF_XMIT_DST_RELEASE;
 
 
 
 
 
 
 
 
5938	setup(dev);
5939
 
 
 
 
 
5940	dev->num_tx_queues = txqs;
5941	dev->real_num_tx_queues = txqs;
5942	if (netif_alloc_netdev_queues(dev))
5943		goto free_all;
5944
5945#ifdef CONFIG_RPS
5946	dev->num_rx_queues = rxqs;
5947	dev->real_num_rx_queues = rxqs;
5948	if (netif_alloc_rx_queues(dev))
5949		goto free_all;
5950#endif
5951
5952	strcpy(dev->name, name);
 
5953	dev->group = INIT_NETDEV_GROUP;
 
 
 
 
 
5954	return dev;
5955
5956free_all:
5957	free_netdev(dev);
5958	return NULL;
5959
5960free_pcpu:
5961	free_percpu(dev->pcpu_refcnt);
5962	kfree(dev->_tx);
5963#ifdef CONFIG_RPS
5964	kfree(dev->_rx);
5965#endif
5966
5967free_p:
5968	kfree(p);
5969	return NULL;
5970}
5971EXPORT_SYMBOL(alloc_netdev_mqs);
5972
5973/**
5974 *	free_netdev - free network device
5975 *	@dev: device
5976 *
5977 *	This function does the last stage of destroying an allocated device
5978 * 	interface. The reference to the device object is released.
5979 *	If this is the last reference then it will be freed.
 
5980 */
5981void free_netdev(struct net_device *dev)
5982{
5983	struct napi_struct *p, *n;
5984
5985	release_net(dev_net(dev));
5986
5987	kfree(dev->_tx);
5988#ifdef CONFIG_RPS
5989	kfree(dev->_rx);
5990#endif
5991
5992	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5993
5994	/* Flush device addresses */
5995	dev_addr_flush(dev);
5996
5997	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5998		netif_napi_del(p);
5999
6000	free_percpu(dev->pcpu_refcnt);
6001	dev->pcpu_refcnt = NULL;
 
 
6002
6003	/*  Compatibility with error handling in drivers */
6004	if (dev->reg_state == NETREG_UNINITIALIZED) {
6005		kfree((char *)dev - dev->padded);
6006		return;
6007	}
6008
6009	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6010	dev->reg_state = NETREG_RELEASED;
6011
6012	/* will free via device release */
6013	put_device(&dev->dev);
6014}
6015EXPORT_SYMBOL(free_netdev);
6016
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.  The expedited RCU
 *	grace period is used when the RTNL is locked, the normal one
 *	otherwise.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6032
6033/**
6034 *	unregister_netdevice_queue - remove device from the kernel
6035 *	@dev: device
6036 *	@head: list
6037 *
6038 *	This function shuts down a device interface and removes it
6039 *	from the kernel tables.
6040 *	If head not NULL, device is queued to be unregistered later.
6041 *
6042 *	Callers must hold the rtnl semaphore.  You may want
6043 *	unregister_netdev() instead of this.
6044 */
6045
6046void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6047{
6048	ASSERT_RTNL();
6049
6050	if (head) {
6051		list_move_tail(&dev->unreg_list, head);
6052	} else {
6053		rollback_registered(dev);
6054		/* Finish processing unregister after unlock */
6055		net_set_todo(dev);
6056	}
6057}
6058EXPORT_SYMBOL(unregister_netdevice_queue);
6059
6060/**
6061 *	unregister_netdevice_many - unregister many devices
6062 *	@head: list of devices
 
 
 
6063 */
6064void unregister_netdevice_many(struct list_head *head)
6065{
6066	struct net_device *dev;
6067
6068	if (!list_empty(head)) {
6069		rollback_registered_many(head);
6070		list_for_each_entry(dev, head, unreg_list)
6071			net_set_todo(dev);
 
6072	}
6073}
6074EXPORT_SYMBOL(unregister_netdevice_many);
6075
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	It simply calls unregister_netdevice() with the rtnl semaphore
 *	taken around the call.  In general you want to use this and not
 *	unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6094
6095/**
6096 *	dev_change_net_namespace - move device to different nethost namespace
6097 *	@dev: device
6098 *	@net: network namespace
6099 *	@pat: If not NULL name pattern to try if the current device name
6100 *	      is already taken in the destination network namespace.
6101 *
6102 *	This function shuts down a device interface and moves it
6103 *	to a new network namespace. On success 0 is returned, on
6104 *	a failure a netagive errno code is returned.
6105 *
6106 *	Callers must hold the rtnl semaphore.
6107 */
6108
6109int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6110{
6111	int err;
 
6112
6113	ASSERT_RTNL();
6114
6115	/* Don't allow namespace local devices to be moved. */
6116	err = -EINVAL;
6117	if (dev->features & NETIF_F_NETNS_LOCAL)
6118		goto out;
6119
6120	/* Ensure the device has been registrered */
6121	err = -EINVAL;
6122	if (dev->reg_state != NETREG_REGISTERED)
6123		goto out;
6124
6125	/* Get out if there is nothing todo */
6126	err = 0;
6127	if (net_eq(dev_net(dev), net))
6128		goto out;
6129
6130	/* Pick the destination device name, and ensure
6131	 * we can use it in the destination network namespace.
6132	 */
6133	err = -EEXIST;
6134	if (__dev_get_by_name(net, dev->name)) {
6135		/* We get here if we can't use the current device name */
6136		if (!pat)
6137			goto out;
6138		if (dev_get_valid_name(dev, pat) < 0)
 
6139			goto out;
6140	}
6141
6142	/*
6143	 * And now a mini version of register_netdevice unregister_netdevice.
6144	 */
6145
6146	/* If device is running close it first. */
6147	dev_close(dev);
6148
6149	/* And unlink it from device chain */
6150	err = -ENODEV;
6151	unlist_netdevice(dev);
6152
6153	synchronize_net();
6154
6155	/* Shutdown queueing discipline. */
6156	dev_shutdown(dev);
6157
6158	/* Notify protocols, that we are about to destroy
6159	   this device. They should clean all the things.
6160
6161	   Note that dev->reg_state stays at NETREG_REGISTERED.
6162	   This is wanted because this way 8021q and macvlan know
6163	   the device is just moving and can keep their slaves up.
6164	*/
6165	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6166	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6167	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
 
 
 
 
 
 
 
 
 
6168
6169	/*
6170	 *	Flush the unicast and multicast chains
6171	 */
6172	dev_uc_flush(dev);
6173	dev_mc_flush(dev);
6174
 
 
 
 
 
 
 
6175	/* Actually switch the network namespace */
6176	dev_net_set(dev, net);
 
6177
6178	/* If there is an ifindex conflict assign a new one */
6179	if (__dev_get_by_index(net, dev->ifindex)) {
6180		int iflink = (dev->iflink == dev->ifindex);
6181		dev->ifindex = dev_new_index(net);
6182		if (iflink)
6183			dev->iflink = dev->ifindex;
6184	}
6185
6186	/* Fixup kobjects */
6187	err = device_rename(&dev->dev, dev->name);
6188	WARN_ON(err);
6189
 
 
 
 
 
 
6190	/* Add the device back in the hashes */
6191	list_netdevice(dev);
6192
6193	/* Notify protocols, that a new device appeared. */
6194	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6195
6196	/*
6197	 *	Prevent userspace races by waiting until the network
6198	 *	device is fully setup before sending notifications.
6199	 */
6200	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6201
6202	synchronize_net();
6203	err = 0;
6204out:
6205	return err;
6206}
6207EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6208
6209static int dev_cpu_callback(struct notifier_block *nfb,
6210			    unsigned long action,
6211			    void *ocpu)
6212{
6213	struct sk_buff **list_skb;
6214	struct sk_buff *skb;
6215	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6216	struct softnet_data *sd, *oldsd;
6217
6218	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6219		return NOTIFY_OK;
6220
6221	local_irq_disable();
6222	cpu = smp_processor_id();
6223	sd = &per_cpu(softnet_data, cpu);
6224	oldsd = &per_cpu(softnet_data, oldcpu);
6225
6226	/* Find end of our completion_queue. */
6227	list_skb = &sd->completion_queue;
6228	while (*list_skb)
6229		list_skb = &(*list_skb)->next;
6230	/* Append completion queue from offline CPU. */
6231	*list_skb = oldsd->completion_queue;
6232	oldsd->completion_queue = NULL;
6233
6234	/* Append output queue from offline CPU. */
6235	if (oldsd->output_queue) {
6236		*sd->output_queue_tailp = oldsd->output_queue;
6237		sd->output_queue_tailp = oldsd->output_queue_tailp;
6238		oldsd->output_queue = NULL;
6239		oldsd->output_queue_tailp = &oldsd->output_queue;
6240	}
6241	/* Append NAPI poll list from offline CPU. */
6242	if (!list_empty(&oldsd->poll_list)) {
6243		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6244		raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 
 
 
 
 
 
 
 
 
6245	}
6246
6247	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6248	local_irq_enable();
6249
 
 
 
 
 
 
 
6250	/* Process offline CPU's input_pkt_queue */
6251	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6252		netif_rx(skb);
6253		input_queue_head_incr(oldsd);
6254	}
6255	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6256		netif_rx(skb);
6257		input_queue_head_incr(oldsd);
6258	}
6259
6260	return NOTIFY_OK;
6261}
6262
6263
6264/**
6265 *	netdev_increment_features - increment feature set by one
6266 *	@all: current feature set
6267 *	@one: new feature set
6268 *	@mask: mask feature set
6269 *
6270 *	Computes a new feature set after adding a device with feature set
6271 *	@one to the master device with current feature set @all.  Will not
6272 *	enable anything that is off in @mask. Returns the new feature set.
6273 */
6274netdev_features_t netdev_increment_features(netdev_features_t all,
6275	netdev_features_t one, netdev_features_t mask)
6276{
6277	if (mask & NETIF_F_GEN_CSUM)
6278		mask |= NETIF_F_ALL_CSUM;
6279	mask |= NETIF_F_VLAN_CHALLENGED;
6280
6281	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6282	all &= one | ~NETIF_F_ALL_FOR_ALL;
6283
6284	/* If one device supports hw checksumming, set for all. */
6285	if (all & NETIF_F_GEN_CSUM)
6286		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6287
6288	return all;
6289}
6290EXPORT_SYMBOL(netdev_increment_features);
6291
6292static struct hlist_head *netdev_create_hash(void)
6293{
6294	int i;
6295	struct hlist_head *hash;
6296
6297	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6298	if (hash != NULL)
6299		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6300			INIT_HLIST_HEAD(&hash[i]);
6301
6302	return hash;
6303}
6304
6305/* Initialize per network namespace state */
6306static int __net_init netdev_init(struct net *net)
6307{
 
 
 
6308	if (net != &init_net)
6309		INIT_LIST_HEAD(&net->dev_base_head);
6310
6311	net->dev_name_head = netdev_create_hash();
6312	if (net->dev_name_head == NULL)
6313		goto err_name;
6314
6315	net->dev_index_head = netdev_create_hash();
6316	if (net->dev_index_head == NULL)
6317		goto err_idx;
6318
 
 
6319	return 0;
6320
6321err_idx:
6322	kfree(net->dev_name_head);
6323err_name:
6324	return -ENOMEM;
6325}
6326
6327/**
6328 *	netdev_drivername - network driver for the device
6329 *	@dev: network device
6330 *
6331 *	Determine network driver for device.
6332 */
6333const char *netdev_drivername(const struct net_device *dev)
6334{
6335	const struct device_driver *driver;
6336	const struct device *parent;
6337	const char *empty = "";
6338
6339	parent = dev->dev.parent;
6340	if (!parent)
6341		return empty;
6342
6343	driver = parent->driver;
6344	if (driver && driver->name)
6345		return driver->name;
6346	return empty;
6347}
6348
6349int __netdev_printk(const char *level, const struct net_device *dev,
6350			   struct va_format *vaf)
6351{
6352	int r;
6353
6354	if (dev && dev->dev.parent)
6355		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6356			       netdev_name(dev), vaf);
6357	else if (dev)
6358		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6359	else
6360		r = printk("%s(NULL net_device): %pV", level, vaf);
6361
6362	return r;
 
 
 
6363}
6364EXPORT_SYMBOL(__netdev_printk);
6365
6366int netdev_printk(const char *level, const struct net_device *dev,
6367		  const char *format, ...)
6368{
6369	struct va_format vaf;
6370	va_list args;
6371	int r;
6372
6373	va_start(args, format);
6374
6375	vaf.fmt = format;
6376	vaf.va = &args;
6377
6378	r = __netdev_printk(level, dev, &vaf);
6379	va_end(args);
6380
6381	return r;
6382}
6383EXPORT_SYMBOL(netdev_printk);
6384
6385#define define_netdev_printk_level(func, level)			\
6386int func(const struct net_device *dev, const char *fmt, ...)	\
6387{								\
6388	int r;							\
6389	struct va_format vaf;					\
6390	va_list args;						\
6391								\
6392	va_start(args, fmt);					\
6393								\
6394	vaf.fmt = fmt;						\
6395	vaf.va = &args;						\
6396								\
6397	r = __netdev_printk(level, dev, &vaf);			\
6398	va_end(args);						\
6399								\
6400	return r;						\
6401}								\
6402EXPORT_SYMBOL(func);
6403
6404define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6405define_netdev_printk_level(netdev_alert, KERN_ALERT);
6406define_netdev_printk_level(netdev_crit, KERN_CRIT);
6407define_netdev_printk_level(netdev_err, KERN_ERR);
6408define_netdev_printk_level(netdev_warn, KERN_WARNING);
6409define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6410define_netdev_printk_level(netdev_info, KERN_INFO);
6411
6412static void __net_exit netdev_exit(struct net *net)
6413{
6414	kfree(net->dev_name_head);
6415	kfree(net->dev_index_head);
 
 
6416}
6417
/*
 * Per network namespace hooks for the device core: netdev_init() on setup,
 * netdev_exit() on teardown.  Registered from net_dev_init() with
 * register_pernet_subsys().
 */
6418static struct pernet_operations __net_initdata netdev_net_ops = {
6419	.init = netdev_init,
6420	.exit = netdev_exit,
6421};
6422
6423static void __net_exit default_device_exit(struct net *net)
6424{
6425	struct net_device *dev, *aux;
6426	/*
6427	 * Push all migratable network devices back to the
6428	 * initial network namespace
6429	 */
6430	rtnl_lock();
6431	for_each_netdev_safe(net, dev, aux) {
6432		int err;
6433		char fb_name[IFNAMSIZ];
6434
6435		/* Ignore unmoveable devices (i.e. loopback) */
6436		if (dev->features & NETIF_F_NETNS_LOCAL)
6437			continue;
6438
6439		/* Leave virtual devices for the generic cleanup */
6440		if (dev->rtnl_link_ops)
6441			continue;
6442
6443		/* Push remaining network devices to init_net */
6444		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
 
 
6445		err = dev_change_net_namespace(dev, &init_net, fb_name);
6446		if (err) {
6447			pr_emerg("%s: failed to move %s to init_net: %d\n",
6448				 __func__, dev->name, err);
6449			BUG();
6450		}
6451	}
6452	rtnl_unlock();
6453}
6454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6455static void __net_exit default_device_exit_batch(struct list_head *net_list)
6456{
6457	/* At exit all network devices most be removed from a network
6458	 * namespace.  Do this in the reverse order of registration.
6459	 * Do this across as many network namespaces as possible to
6460	 * improve batching efficiency.
6461	 */
6462	struct net_device *dev;
6463	struct net *net;
6464	LIST_HEAD(dev_kill_list);
6465
6466	rtnl_lock();
 
 
 
 
 
 
 
 
 
 
 
6467	list_for_each_entry(net, net_list, exit_list) {
6468		for_each_netdev_reverse(net, dev) {
6469			if (dev->rtnl_link_ops)
6470				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6471			else
6472				unregister_netdevice_queue(dev, &dev_kill_list);
6473		}
6474	}
6475	unregister_netdevice_many(&dev_kill_list);
6476	list_del(&dev_kill_list);
6477	rtnl_unlock();
6478}
6479
/*
 * Namespace exit hooks for network devices: default_device_exit() runs per
 * namespace, default_device_exit_batch() runs on a list of namespaces.
 * Registered from net_dev_init() with register_pernet_device().
 */
6480static struct pernet_operations __net_initdata default_device_ops = {
6481	.exit = default_device_exit,
6482	.exit_batch = default_device_exit_batch,
6483};
6484
6485/*
6486 *	Initialize the DEV module. At boot time this walks the device list and
6487 *	unhooks any devices that fail to initialise (normally hardware not
6488 *	present) and leaves us with a valid list of present and active devices.
6489 *
6490 */
6491
6492/*
6493 *       This is called single threaded during boot, so no need
6494 *       to take the rtnl semaphore.
6495 */
6496static int __init net_dev_init(void)
6497{
6498	int i, rc = -ENOMEM;
6499
6500	BUG_ON(!dev_boot_phase);
6501
6502	if (dev_proc_init())
6503		goto out;
6504
6505	if (netdev_kobject_init())
6506		goto out;
6507
6508	INIT_LIST_HEAD(&ptype_all);
6509	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6510		INIT_LIST_HEAD(&ptype_base[i]);
6511
 
 
6512	if (register_pernet_subsys(&netdev_net_ops))
6513		goto out;
6514
6515	/*
6516	 *	Initialise the packet receive queues.
6517	 */
6518
6519	for_each_possible_cpu(i) {
 
6520		struct softnet_data *sd = &per_cpu(softnet_data, i);
6521
6522		memset(sd, 0, sizeof(*sd));
 
6523		skb_queue_head_init(&sd->input_pkt_queue);
6524		skb_queue_head_init(&sd->process_queue);
6525		sd->completion_queue = NULL;
 
 
6526		INIT_LIST_HEAD(&sd->poll_list);
6527		sd->output_queue = NULL;
6528		sd->output_queue_tailp = &sd->output_queue;
6529#ifdef CONFIG_RPS
6530		sd->csd.func = rps_trigger_softirq;
6531		sd->csd.info = sd;
6532		sd->csd.flags = 0;
6533		sd->cpu = i;
6534#endif
6535
 
6536		sd->backlog.poll = process_backlog;
6537		sd->backlog.weight = weight_p;
6538		sd->backlog.gro_list = NULL;
6539		sd->backlog.gro_count = 0;
6540	}
6541
6542	dev_boot_phase = 0;
6543
6544	/* The loopback device is special if any other network devices
6545	 * is present in a network namespace the loopback device must
6546	 * be present. Since we now dynamically allocate and free the
6547	 * loopback device ensure this invariant is maintained by
6548	 * keeping the loopback device as the first device on the
6549	 * list of network devices.  Ensuring the loopback devices
6550	 * is the first device that appears and the last network device
6551	 * that disappears.
6552	 */
6553	if (register_pernet_device(&loopback_net_ops))
6554		goto out;
6555
6556	if (register_pernet_device(&default_device_ops))
6557		goto out;
6558
6559	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6560	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6561
6562	hotcpu_notifier(dev_cpu_callback, 0);
6563	dst_init();
6564	dev_mcast_init();
6565	rc = 0;
6566out:
6567	return rc;
6568}
6569
6570subsys_initcall(net_dev_init);
6571
6572static int __init initialize_hashrnd(void)
6573{
6574	get_random_bytes(&hashrnd, sizeof(hashrnd));
6575	return 0;
6576}
6577
6578late_initcall_sync(initialize_hashrnd);
6579

    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
 
 
 
 
 
    4 *
    5 *	Derived from the non IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitops.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/mm.h>
   81#include <linux/mutex.h>
   82#include <linux/rwsem.h>
   83#include <linux/string.h>
   84#include <linux/mm.h>
   85#include <linux/socket.h>
   86#include <linux/sockios.h>
   87#include <linux/errno.h>
   88#include <linux/interrupt.h>
   89#include <linux/if_ether.h>
   90#include <linux/netdevice.h>
   91#include <linux/etherdevice.h>
   92#include <linux/ethtool.h>
 
   93#include <linux/skbuff.h>
   94#include <linux/bpf.h>
   95#include <linux/bpf_trace.h>
   96#include <net/net_namespace.h>
   97#include <net/sock.h>
   98#include <net/busy_poll.h>
   99#include <linux/rtnetlink.h>
 
 
  100#include <linux/stat.h>
  101#include <net/dst.h>
  102#include <net/dst_metadata.h>
  103#include <net/pkt_sched.h>
  104#include <net/pkt_cls.h>
  105#include <net/checksum.h>
  106#include <net/xfrm.h>
  107#include <linux/highmem.h>
  108#include <linux/init.h>
 
  109#include <linux/module.h>
  110#include <linux/netpoll.h>
  111#include <linux/rcupdate.h>
  112#include <linux/delay.h>
 
  113#include <net/iw_handler.h>
  114#include <asm/current.h>
  115#include <linux/audit.h>
  116#include <linux/dmaengine.h>
  117#include <linux/err.h>
  118#include <linux/ctype.h>
  119#include <linux/if_arp.h>
  120#include <linux/if_vlan.h>
  121#include <linux/ip.h>
  122#include <net/ip.h>
  123#include <net/mpls.h>
  124#include <linux/ipv6.h>
  125#include <linux/in.h>
  126#include <linux/jhash.h>
  127#include <linux/random.h>
  128#include <trace/events/napi.h>
  129#include <trace/events/net.h>
  130#include <trace/events/skb.h>
 
  131#include <linux/inetdevice.h>
  132#include <linux/cpu_rmap.h>
 
  133#include <linux/static_key.h>
  134#include <linux/hashtable.h>
  135#include <linux/vmalloc.h>
  136#include <linux/if_macvlan.h>
  137#include <linux/errqueue.h>
  138#include <linux/hrtimer.h>
  139#include <linux/netfilter_ingress.h>
  140#include <linux/crash_dump.h>
  141#include <linux/sctp.h>
  142#include <net/udp_tunnel.h>
  143#include <linux/net_namespace.h>
  144#include <linux/indirect_call_wrapper.h>
  145#include <net/devlink.h>
  146#include <linux/pm_runtime.h>
  147
  148#include "net-sysfs.h"
  149
 
  150#define MAX_GRO_SKBS 8
  151
  152/* This should be increased if a protocol with a bigger head is added. */
  153#define GRO_MAX_HEAD (MAX_HEADER + 128)
  154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * ptype_lock is held around every packet_type list insertion and removal
 * (dev_add_pack(), __dev_remove_pack()); offload_lock does the same for
 * offload_base (dev_add_offload(), __dev_remove_offload()).
 *
 * ptype_base holds the protocol-specific handlers that are not bound to a
 * device, bucketed by ntohs(type) & PTYPE_HASH_MASK; ptype_all holds the
 * ETH_P_ALL handlers that are not bound to a device (see ptype_head()).
 * offload_base is the list of registered packet_offload handlers, kept in
 * ascending priority order by dev_add_offload().
 */
  155static DEFINE_SPINLOCK(ptype_lock);
  156static DEFINE_SPINLOCK(offload_lock);
  157struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  158struct list_head ptype_all __read_mostly;	/* Taps */
  159static struct list_head offload_base __read_mostly;
  160
  161static int netif_rx_internal(struct sk_buff *skb);
  162static int call_netdevice_notifiers_info(unsigned long val,
  163					 struct netdev_notifier_info *info);
  164static int call_netdevice_notifiers_extack(unsigned long val,
  165					   struct net_device *dev,
  166					   struct netlink_ext_ack *extack);
  167static struct napi_struct *napi_by_id(unsigned int napi_id);
  168
  169/*
  170 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  171 * semaphore.
  172 *
  173 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
  174 *
  175 * Writers must hold the rtnl semaphore while they loop through the
  176 * dev_base_head list, and hold dev_base_lock for writing when they do the
  177 * actual updates.  This allows pure readers to access the list even
  178 * while a writer is preparing to update it.
  179 *
  180 * To put it another way, dev_base_lock is held for writing only to
  181 * protect against pure readers; the rtnl semaphore provides the
  182 * protection against other writers.
  183 *
  184 * See, for example usages, register_netdevice() and
  185 * unregister_netdevice(), which must be called with the rtnl
  186 * semaphore held.
  187 */
  188DEFINE_RWLOCK(dev_base_lock);
  189EXPORT_SYMBOL(dev_base_lock);
  190
  191static DEFINE_MUTEX(ifalias_mutex);
  192
  193/* protects napi_hash addition/deletion and napi_gen_id */
  194static DEFINE_SPINLOCK(napi_hash_lock);
  195
  196static unsigned int napi_gen_id = NR_CPUS;
  197static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  198
  199static DECLARE_RWSEM(devnet_rename_sem);
  200
  201static inline void dev_base_seq_inc(struct net *net)
  202{
  203	while (++net->dev_base_seq == 0)
  204		;
  205}
  206
  207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  208{
  209	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  210
  211	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  212}
  213
  214static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  215{
  216	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  217}
  218
  219static inline void rps_lock(struct softnet_data *sd)
  220{
  221#ifdef CONFIG_RPS
  222	spin_lock(&sd->input_pkt_queue.lock);
  223#endif
  224}
  225
  226static inline void rps_unlock(struct softnet_data *sd)
  227{
  228#ifdef CONFIG_RPS
  229	spin_unlock(&sd->input_pkt_queue.lock);
  230#endif
  231}
  232
  233static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
  234						       const char *name)
  235{
  236	struct netdev_name_node *name_node;
  237
  238	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
  239	if (!name_node)
  240		return NULL;
  241	INIT_HLIST_NODE(&name_node->hlist);
  242	name_node->dev = dev;
  243	name_node->name = name;
  244	return name_node;
  245}
  246
  247static struct netdev_name_node *
  248netdev_name_node_head_alloc(struct net_device *dev)
  249{
  250	struct netdev_name_node *name_node;
  251
  252	name_node = netdev_name_node_alloc(dev, dev->name);
  253	if (!name_node)
  254		return NULL;
  255	INIT_LIST_HEAD(&name_node->list);
  256	return name_node;
  257}
  258
  259static void netdev_name_node_free(struct netdev_name_node *name_node)
  260{
  261	kfree(name_node);
  262}
  263
  264static void netdev_name_node_add(struct net *net,
  265				 struct netdev_name_node *name_node)
  266{
  267	hlist_add_head_rcu(&name_node->hlist,
  268			   dev_name_hash(net, name_node->name));
  269}
  270
  271static void netdev_name_node_del(struct netdev_name_node *name_node)
  272{
  273	hlist_del_rcu(&name_node->hlist);
  274}
  275
  276static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
  277							const char *name)
  278{
  279	struct hlist_head *head = dev_name_hash(net, name);
  280	struct netdev_name_node *name_node;
  281
  282	hlist_for_each_entry(name_node, head, hlist)
  283		if (!strcmp(name_node->name, name))
  284			return name_node;
  285	return NULL;
  286}
  287
  288static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
  289							    const char *name)
  290{
  291	struct hlist_head *head = dev_name_hash(net, name);
  292	struct netdev_name_node *name_node;
  293
  294	hlist_for_each_entry_rcu(name_node, head, hlist)
  295		if (!strcmp(name_node->name, name))
  296			return name_node;
  297	return NULL;
  298}
  299
  300int netdev_name_node_alt_create(struct net_device *dev, const char *name)
  301{
  302	struct netdev_name_node *name_node;
  303	struct net *net = dev_net(dev);
  304
  305	name_node = netdev_name_node_lookup(net, name);
  306	if (name_node)
  307		return -EEXIST;
  308	name_node = netdev_name_node_alloc(dev, name);
  309	if (!name_node)
  310		return -ENOMEM;
  311	netdev_name_node_add(net, name_node);
  312	/* The node that holds dev->name acts as a head of per-device list. */
  313	list_add_tail(&name_node->list, &dev->name_node->list);
  314
  315	return 0;
  316}
  317EXPORT_SYMBOL(netdev_name_node_alt_create);
  318
  319static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
  320{
  321	list_del(&name_node->list);
  322	netdev_name_node_del(name_node);
  323	kfree(name_node->name);
  324	netdev_name_node_free(name_node);
  325}
  326
  327int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  328{
  329	struct netdev_name_node *name_node;
  330	struct net *net = dev_net(dev);
  331
  332	name_node = netdev_name_node_lookup(net, name);
  333	if (!name_node)
  334		return -ENOENT;
  335	/* lookup might have found our primary name or a name belonging
  336	 * to another device.
  337	 */
  338	if (name_node == dev->name_node || name_node->dev != dev)
  339		return -EINVAL;
  340
  341	__netdev_name_node_alt_destroy(name_node);
  342
  343	return 0;
  344}
  345EXPORT_SYMBOL(netdev_name_node_alt_destroy);
  346
  347static void netdev_name_node_alt_flush(struct net_device *dev)
  348{
  349	struct netdev_name_node *name_node, *tmp;
  350
  351	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
  352		__netdev_name_node_alt_destroy(name_node);
  353}
  354
  355/* Device list insertion */
  356static void list_netdevice(struct net_device *dev)
  357{
  358	struct net *net = dev_net(dev);
  359
  360	ASSERT_RTNL();
  361
  362	write_lock_bh(&dev_base_lock);
  363	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  364	netdev_name_node_add(net, dev->name_node);
  365	hlist_add_head_rcu(&dev->index_hlist,
  366			   dev_index_hash(net, dev->ifindex));
  367	write_unlock_bh(&dev_base_lock);
  368
  369	dev_base_seq_inc(net);
 
 
  370}
  371
  372/* Device list removal
  373 * caller must respect a RCU grace period before freeing/reusing dev
  374 */
  375static void unlist_netdevice(struct net_device *dev)
  376{
  377	ASSERT_RTNL();
  378
  379	/* Unlink dev from the device chain */
  380	write_lock_bh(&dev_base_lock);
  381	list_del_rcu(&dev->dev_list);
  382	netdev_name_node_del(dev->name_node);
  383	hlist_del_rcu(&dev->index_hlist);
  384	write_unlock_bh(&dev_base_lock);
  385
  386	dev_base_seq_inc(dev_net(dev));
  387}
  388
  389/*
  390 *	Our notifier list
  391 */
  392
  393static RAW_NOTIFIER_HEAD(netdev_chain);
  394
  395/*
  396 *	Device drivers call our routines to queue packets here. We empty the
  397 *	queue in the local softnet handler.
  398 */
  399
  400DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  401EXPORT_PER_CPU_SYMBOL(softnet_data);
  402
  403#ifdef CONFIG_LOCKDEP
  404/*
  405 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  406 * according to dev->type
  407 */
/*
 * Device types (ARPHRD_*) that get a lockdep class of their own.
 * netdev_lock_pos() maps a device type to its index in this table and
 * falls back to the last entry for types that are not listed.
 */
  408static const unsigned short netdev_lock_type[] = {
  409	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
  410	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
  411	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
  412	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
  413	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
  414	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
  415	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
  416	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
  417	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
  418	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
  419	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
  420	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
  421	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
  422	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
  423	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
  424
/*
 * Lock class names, indexed in parallel with netdev_lock_type[]: entry i
 * names the class used for device type netdev_lock_type[i].  The two
 * tables must be kept in the same order.
 */
  425static const char *const netdev_lock_name[] = {
  426	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
  427	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
  428	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
  429	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
  430	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
  431	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
  432	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
  433	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
  434	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
  435	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
  436	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
  437	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
  438	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
  439	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
  440	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
  441
/* One lockdep key per table entry, for the xmit lock and the address-list lock. */
  442static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  443static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
  444
  445static inline unsigned short netdev_lock_pos(unsigned short dev_type)
  446{
  447	int i;
  448
  449	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
  450		if (netdev_lock_type[i] == dev_type)
  451			return i;
  452	/* the last key is used by default */
  453	return ARRAY_SIZE(netdev_lock_type) - 1;
  454}
  455
  456static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  457						 unsigned short dev_type)
  458{
  459	int i;
  460
  461	i = netdev_lock_pos(dev_type);
  462	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
  463				   netdev_lock_name[i]);
  464}
  465
  466static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  467{
  468	int i;
  469
  470	i = netdev_lock_pos(dev->type);
  471	lockdep_set_class_and_name(&dev->addr_list_lock,
  472				   &netdev_addr_lock_key[i],
  473				   netdev_lock_name[i]);
  474}
  475#else
/* Without CONFIG_LOCKDEP there are no lock classes to assign: empty stub. */
  476static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  477						 unsigned short dev_type)
  478{
  479}
  480
/* Without CONFIG_LOCKDEP there are no lock classes to assign: empty stub. */
  481static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  482{
  483}
  484#endif
  485
  486/*******************************************************************************
  487 *
  488 *		Protocol management and registration routines
  489 *
  490 *******************************************************************************/
  491
 
 
 
  492
  493/*
  494 *	Add a protocol ID to the list. Now that the input handler is
  495 *	smarter we can dispense with all the messy stuff that used to be
  496 *	here.
  497 *
  498 *	BEWARE!!! Protocol handlers, mangling input packets,
  499 *	MUST BE last in hash buckets and checking protocol handlers
  500 *	MUST start from promiscuous ptype_all chain in net_bh.
  501 *	It is true now, do not change it.
  502 *	Explanation follows: if protocol handler, mangling packet, will
  503 *	be the first on list, it is not able to sense, that packet
  504 *	is cloned and should be copied-on-write, so that it will
  505 *	change it and subsequent readers will get broken packet.
  506 *							--ANK (980803)
  507 */
  508
  509static inline struct list_head *ptype_head(const struct packet_type *pt)
  510{
  511	if (pt->type == htons(ETH_P_ALL))
  512		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
  513	else
  514		return pt->dev ? &pt->dev->ptype_specific :
  515				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  516}
  517
  518/**
  519 *	dev_add_pack - add packet handler
  520 *	@pt: packet type declaration
  521 *
  522 *	Add a protocol handler to the networking stack. The passed &packet_type
  523 *	is linked into kernel lists and may not be freed until it has been
  524 *	removed from the kernel lists.
  525 *
  526 *	This call does not sleep therefore it can not
  527 *	guarantee all CPU's that are in middle of receiving packets
  528 *	will see the new packet type (until the next received packet).
  529 */
  530
  531void dev_add_pack(struct packet_type *pt)
  532{
  533	struct list_head *head = ptype_head(pt);
  534
  535	spin_lock(&ptype_lock);
  536	list_add_rcu(&pt->list, head);
  537	spin_unlock(&ptype_lock);
  538}
  539EXPORT_SYMBOL(dev_add_pack);
  540
  541/**
  542 *	__dev_remove_pack	 - remove packet handler
  543 *	@pt: packet type declaration
  544 *
  545 *	Remove a protocol handler that was previously added to the kernel
  546 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  547 *	from the kernel lists and can be freed or reused once this function
  548 *	returns.
  549 *
  550 *      The packet type might still be in use by receivers
  551 *	and must not be freed until after all the CPU's have gone
  552 *	through a quiescent state.
  553 */
  554void __dev_remove_pack(struct packet_type *pt)
  555{
  556	struct list_head *head = ptype_head(pt);
  557	struct packet_type *pt1;
  558
  559	spin_lock(&ptype_lock);
  560
  561	list_for_each_entry(pt1, head, list) {
  562		if (pt == pt1) {
  563			list_del_rcu(&pt->list);
  564			goto out;
  565		}
  566	}
  567
  568	pr_warn("dev_remove_pack: %p not found\n", pt);
  569out:
  570	spin_unlock(&ptype_lock);
  571}
  572EXPORT_SYMBOL(__dev_remove_pack);
  573
  574/**
  575 *	dev_remove_pack	 - remove packet handler
  576 *	@pt: packet type declaration
  577 *
  578 *	Remove a protocol handler that was previously added to the kernel
  579 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  580 *	from the kernel lists and can be freed or reused once this function
  581 *	returns.
  582 *
  583 *	This call sleeps to guarantee that no CPU is looking at the packet
  584 *	type after return.
  585 */
  586void dev_remove_pack(struct packet_type *pt)
  587{
  588	__dev_remove_pack(pt);
  589
  590	synchronize_net();
  591}
  592EXPORT_SYMBOL(dev_remove_pack);
  593
 
  594
  595/**
  596 *	dev_add_offload - register offload handlers
  597 *	@po: protocol offload declaration
  598 *
  599 *	Add protocol offload handlers to the networking stack. The passed
  600 *	&proto_offload is linked into kernel lists and may not be freed until
  601 *	it has been removed from the kernel lists.
  602 *
  603 *	This call does not sleep therefore it can not
  604 *	guarantee all CPU's that are in middle of receiving packets
  605 *	will see the new offload handlers (until the next received packet).
  606 */
  607void dev_add_offload(struct packet_offload *po)
  608{
  609	struct packet_offload *elem;
  610
  611	spin_lock(&offload_lock);
  612	list_for_each_entry(elem, &offload_base, list) {
  613		if (po->priority < elem->priority)
  614			break;
  615	}
  616	list_add_rcu(&po->list, elem->list.prev);
  617	spin_unlock(&offload_lock);
  618}
  619EXPORT_SYMBOL(dev_add_offload);
  620
  621/**
  622 *	__dev_remove_offload	 - remove offload handler
  623 *	@po: packet offload declaration
  624 *
  625 *	Remove a protocol offload handler that was previously added to the
  626 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
  627 *	is removed from the kernel lists and can be freed or reused once this
  628 *	function returns.
  629 *
  630 *      The packet type might still be in use by receivers
  631 *	and must not be freed until after all the CPU's have gone
  632 *	through a quiescent state.
  633 */
  634static void __dev_remove_offload(struct packet_offload *po)
  635{
  636	struct list_head *head = &offload_base;
  637	struct packet_offload *po1;
  638
  639	spin_lock(&offload_lock);
  640
  641	list_for_each_entry(po1, head, list) {
  642		if (po == po1) {
  643			list_del_rcu(&po->list);
  644			goto out;
  645		}
  646	}
  647
  648	pr_warn("dev_remove_offload: %p not found\n", po);
  649out:
  650	spin_unlock(&offload_lock);
  651}
  652
  653/**
  654 *	dev_remove_offload	 - remove packet offload handler
  655 *	@po: packet offload declaration
  656 *
  657 *	Remove a packet offload handler that was previously added to the kernel
  658 *	offload handlers by dev_add_offload(). The passed &offload_type is
  659 *	removed from the kernel lists and can be freed or reused once this
  660 *	function returns.
  661 *
  662 *	This call sleeps to guarantee that no CPU is looking at the packet
  663 *	type after return.
  664 */
  665void dev_remove_offload(struct packet_offload *po)
  666{
  667	__dev_remove_offload(po);
  668
  669	synchronize_net();
  670}
  671EXPORT_SYMBOL(dev_remove_offload);
  672
  673/******************************************************************************
  674 *
  675 *		      Device Boot-time Settings Routines
  676 *
  677 ******************************************************************************/
  678
/*
 * Boot time configuration table.
 *
 * Holds up to NETDEV_BOOT_SETUP_MAX entries, each pairing a device name with
 * its struct ifmap settings. Entries are filled by netdev_boot_setup_add();
 * a slot whose name starts with '\0' or ' ' is treated as free.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
  681
  682/**
  683 *	netdev_boot_setup_add	- add new setup entry
  684 *	@name: name of the device
  685 *	@map: configured settings for the device
  686 *
  687 *	Adds new setup entry to the dev_boot_setup list.  The function
  688 *	returns 0 on error and 1 on success.  This is a generic routine to
  689 *	all netdevices.
  690 */
  691static int netdev_boot_setup_add(char *name, struct ifmap *map)
  692{
  693	struct netdev_boot_setup *s;
  694	int i;
  695
  696	s = dev_boot_setup;
  697	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  698		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
  699			memset(s[i].name, 0, sizeof(s[i].name));
  700			strlcpy(s[i].name, name, IFNAMSIZ);
  701			memcpy(&s[i].map, map, sizeof(s[i].map));
  702			break;
  703		}
  704	}
  705
  706	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
  707}
  708
  709/**
  710 * netdev_boot_setup_check	- check boot time settings
  711 * @dev: the netdevice
  712 *
  713 * Check boot time settings for the device.
  714 * The found settings are set for the device to be used
  715 * later in the device probing.
  716 * Returns 0 if no settings found, 1 if they are.
  717 */
  718int netdev_boot_setup_check(struct net_device *dev)
  719{
  720	struct netdev_boot_setup *s = dev_boot_setup;
  721	int i;
  722
  723	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  724		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
  725		    !strcmp(dev->name, s[i].name)) {
  726			dev->irq = s[i].map.irq;
  727			dev->base_addr = s[i].map.base_addr;
  728			dev->mem_start = s[i].map.mem_start;
  729			dev->mem_end = s[i].map.mem_end;
  730			return 1;
  731		}
  732	}
  733	return 0;
  734}
  735EXPORT_SYMBOL(netdev_boot_setup_check);
  736
  737
  738/**
  739 * netdev_boot_base	- get address from boot time settings
  740 * @prefix: prefix for network device
  741 * @unit: id for network device
  742 *
  743 * Check boot time settings for the base address of device.
  744 * The found settings are set for the device to be used
  745 * later in the device probing.
  746 * Returns 0 if no settings found.
  747 */
  748unsigned long netdev_boot_base(const char *prefix, int unit)
  749{
  750	const struct netdev_boot_setup *s = dev_boot_setup;
  751	char name[IFNAMSIZ];
  752	int i;
  753
  754	sprintf(name, "%s%d", prefix, unit);
  755
  756	/*
  757	 * If device already registered then return base of 1
  758	 * to indicate not to probe for this interface
  759	 */
  760	if (__dev_get_by_name(&init_net, name))
  761		return 1;
  762
  763	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
  764		if (!strcmp(name, s[i].name))
  765			return s[i].map.base_addr;
  766	return 0;
  767}
  768
  769/*
  770 * Saves at boot time configured settings for any netdevice.
  771 */
  772int __init netdev_boot_setup(char *str)
  773{
  774	int ints[5];
  775	struct ifmap map;
  776
  777	str = get_options(str, ARRAY_SIZE(ints), ints);
  778	if (!str || !*str)
  779		return 0;
  780
  781	/* Save settings */
  782	memset(&map, 0, sizeof(map));
  783	if (ints[0] > 0)
  784		map.irq = ints[1];
  785	if (ints[0] > 1)
  786		map.base_addr = ints[2];
  787	if (ints[0] > 2)
  788		map.mem_start = ints[3];
  789	if (ints[0] > 3)
  790		map.mem_end = ints[4];
  791
  792	/* Add new entry to the list */
  793	return netdev_boot_setup_add(str, &map);
  794}
  795
  796__setup("netdev=", netdev_boot_setup);
  797
  798/*******************************************************************************
  799 *
  800 *			    Device Interface Subroutines
  801 *
  802 *******************************************************************************/
  803
  804/**
  805 *	dev_get_iflink	- get 'iflink' value of a interface
  806 *	@dev: targeted interface
  807 *
  808 *	Indicates the ifindex the interface is linked to.
  809 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
  810 */
  811
  812int dev_get_iflink(const struct net_device *dev)
  813{
  814	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  815		return dev->netdev_ops->ndo_get_iflink(dev);
  816
  817	return dev->ifindex;
  818}
  819EXPORT_SYMBOL(dev_get_iflink);
  820
  821/**
  822 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  823 *	@dev: targeted interface
  824 *	@skb: The packet.
  825 *
  826 *	For better visibility of tunnel traffic OVS needs to retrieve
  827 *	egress tunnel information for a packet. Following API allows
  828 *	user to get this info.
  829 */
  830int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  831{
  832	struct ip_tunnel_info *info;
  833
  834	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  835		return -EINVAL;
  836
  837	info = skb_tunnel_info_unclone(skb);
  838	if (!info)
  839		return -ENOMEM;
  840	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  841		return -EINVAL;
  842
  843	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  844}
  845EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  846
  847/**
  848 *	__dev_get_by_name	- find a device by its name
  849 *	@net: the applicable net namespace
  850 *	@name: name to find
  851 *
  852 *	Find an interface by name. Must be called under RTNL semaphore
  853 *	or @dev_base_lock. If the name is found a pointer to the device
  854 *	is returned. If the name is not found then %NULL is returned. The
  855 *	reference counters are not incremented so the caller must be
  856 *	careful with locks.
  857 */
  858
  859struct net_device *__dev_get_by_name(struct net *net, const char *name)
  860{
  861	struct netdev_name_node *node_name;
 
 
  862
  863	node_name = netdev_name_node_lookup(net, name);
  864	return node_name ? node_name->dev : NULL;
 
 
 
  865}
  866EXPORT_SYMBOL(__dev_get_by_name);
  867
  868/**
  869 * dev_get_by_name_rcu	- find a device by its name
  870 * @net: the applicable net namespace
  871 * @name: name to find
  872 *
  873 * Find an interface by name.
  874 * If the name is found a pointer to the device is returned.
  875 * If the name is not found then %NULL is returned.
  876 * The reference counters are not incremented so the caller must be
  877 * careful with locks. The caller must hold RCU lock.
  878 */
  879
  880struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  881{
  882	struct netdev_name_node *node_name;
 
 
 
 
 
 
  883
  884	node_name = netdev_name_node_lookup_rcu(net, name);
  885	return node_name ? node_name->dev : NULL;
  886}
  887EXPORT_SYMBOL(dev_get_by_name_rcu);
  888
  889/**
  890 *	dev_get_by_name		- find a device by its name
  891 *	@net: the applicable net namespace
  892 *	@name: name to find
  893 *
  894 *	Find an interface by name. This can be called from any
  895 *	context and does its own locking. The returned handle has
  896 *	the usage count incremented and the caller must use dev_put() to
  897 *	release it when it is no longer needed. %NULL is returned if no
  898 *	matching device is found.
  899 */
  900
  901struct net_device *dev_get_by_name(struct net *net, const char *name)
  902{
  903	struct net_device *dev;
  904
  905	rcu_read_lock();
  906	dev = dev_get_by_name_rcu(net, name);
  907	if (dev)
  908		dev_hold(dev);
  909	rcu_read_unlock();
  910	return dev;
  911}
  912EXPORT_SYMBOL(dev_get_by_name);
  913
  914/**
  915 *	__dev_get_by_index - find a device by its ifindex
  916 *	@net: the applicable net namespace
  917 *	@ifindex: index of device
  918 *
  919 *	Search for an interface by index. Returns %NULL if the device
  920 *	is not found or a pointer to the device. The device has not
  921 *	had its reference counter increased so the caller must be careful
  922 *	about locking. The caller must hold either the RTNL semaphore
  923 *	or @dev_base_lock.
  924 */
  925
  926struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  927{
 
  928	struct net_device *dev;
  929	struct hlist_head *head = dev_index_hash(net, ifindex);
  930
  931	hlist_for_each_entry(dev, head, index_hlist)
  932		if (dev->ifindex == ifindex)
  933			return dev;
  934
  935	return NULL;
  936}
  937EXPORT_SYMBOL(__dev_get_by_index);
  938
  939/**
  940 *	dev_get_by_index_rcu - find a device by its ifindex
  941 *	@net: the applicable net namespace
  942 *	@ifindex: index of device
  943 *
  944 *	Search for an interface by index. Returns %NULL if the device
  945 *	is not found or a pointer to the device. The device has not
  946 *	had its reference counter increased so the caller must be careful
  947 *	about locking. The caller must hold RCU lock.
  948 */
  949
  950struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  951{
 
  952	struct net_device *dev;
  953	struct hlist_head *head = dev_index_hash(net, ifindex);
  954
  955	hlist_for_each_entry_rcu(dev, head, index_hlist)
  956		if (dev->ifindex == ifindex)
  957			return dev;
  958
  959	return NULL;
  960}
  961EXPORT_SYMBOL(dev_get_by_index_rcu);
  962
  963
  964/**
  965 *	dev_get_by_index - find a device by its ifindex
  966 *	@net: the applicable net namespace
  967 *	@ifindex: index of device
  968 *
  969 *	Search for an interface by index. Returns NULL if the device
  970 *	is not found or a pointer to the device. The device returned has
  971 *	had a reference added and the pointer is safe until the user calls
  972 *	dev_put to indicate they have finished with it.
  973 */
  974
  975struct net_device *dev_get_by_index(struct net *net, int ifindex)
  976{
  977	struct net_device *dev;
  978
  979	rcu_read_lock();
  980	dev = dev_get_by_index_rcu(net, ifindex);
  981	if (dev)
  982		dev_hold(dev);
  983	rcu_read_unlock();
  984	return dev;
  985}
  986EXPORT_SYMBOL(dev_get_by_index);
  987
  988/**
  989 *	dev_get_by_napi_id - find a device by napi_id
  990 *	@napi_id: ID of the NAPI struct
  991 *
  992 *	Search for an interface by NAPI ID. Returns %NULL if the device
  993 *	is not found or a pointer to the device. The device has not had
  994 *	its reference counter increased so the caller must be careful
  995 *	about locking. The caller must hold RCU lock.
  996 */
  997
  998struct net_device *dev_get_by_napi_id(unsigned int napi_id)
  999{
 1000	struct napi_struct *napi;
 1001
 1002	WARN_ON_ONCE(!rcu_read_lock_held());
 1003
 1004	if (napi_id < MIN_NAPI_ID)
 1005		return NULL;
 1006
 1007	napi = napi_by_id(napi_id);
 1008
 1009	return napi ? napi->dev : NULL;
 1010}
 1011EXPORT_SYMBOL(dev_get_by_napi_id);
 1012
 1013/**
 1014 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 1015 *	@net: network namespace
 1016 *	@name: a pointer to the buffer where the name will be stored.
 1017 *	@ifindex: the ifindex of the interface to get the name from.
 1018 */
 1019int netdev_get_name(struct net *net, char *name, int ifindex)
 1020{
 1021	struct net_device *dev;
 1022	int ret;
 1023
 1024	down_read(&devnet_rename_sem);
 1025	rcu_read_lock();
 1026
 1027	dev = dev_get_by_index_rcu(net, ifindex);
 1028	if (!dev) {
 1029		ret = -ENODEV;
 1030		goto out;
 1031	}
 1032
 1033	strcpy(name, dev->name);
 1034
 1035	ret = 0;
 1036out:
 1037	rcu_read_unlock();
 1038	up_read(&devnet_rename_sem);
 1039	return ret;
 1040}
 1041
 1042/**
 1043 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 1044 *	@net: the applicable net namespace
 1045 *	@type: media type of device
 1046 *	@ha: hardware address
 1047 *
 1048 *	Search for an interface by MAC address. Returns NULL if the device
 1049 *	is not found or a pointer to the device.
 1050 *	The caller must hold RCU or RTNL.
 1051 *	The returned device has not had its ref count increased
 1052 *	and the caller must therefore be careful about locking
 1053 *
 1054 */
 1055
 1056struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 1057				       const char *ha)
 1058{
 1059	struct net_device *dev;
 1060
 1061	for_each_netdev_rcu(net, dev)
 1062		if (dev->type == type &&
 1063		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 1064			return dev;
 1065
 1066	return NULL;
 1067}
 1068EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 1069
 1070struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1071{
 1072	struct net_device *dev;
 1073
 1074	ASSERT_RTNL();
 1075	for_each_netdev(net, dev)
 1076		if (dev->type == type)
 1077			return dev;
 1078
 1079	return NULL;
 1080}
 1081EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 1082
 1083struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1084{
 1085	struct net_device *dev, *ret = NULL;
 1086
 1087	rcu_read_lock();
 1088	for_each_netdev_rcu(net, dev)
 1089		if (dev->type == type) {
 1090			dev_hold(dev);
 1091			ret = dev;
 1092			break;
 1093		}
 1094	rcu_read_unlock();
 1095	return ret;
 1096}
 1097EXPORT_SYMBOL(dev_getfirstbyhwtype);
 1098
 1099/**
 1100 *	__dev_get_by_flags - find any device with given flags
 1101 *	@net: the applicable net namespace
 1102 *	@if_flags: IFF_* values
 1103 *	@mask: bitmask of bits in if_flags to check
 1104 *
 1105 *	Search for any interface with the given flags. Returns NULL if a device
 1106 *	is not found or a pointer to the device. Must be called inside
 1107 *	rtnl_lock(), and result refcount is unchanged.
 1108 */
 1109
 1110struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 1111				      unsigned short mask)
 1112{
 1113	struct net_device *dev, *ret;
 1114
 1115	ASSERT_RTNL();
 1116
 1117	ret = NULL;
 1118	for_each_netdev(net, dev) {
 1119		if (((dev->flags ^ if_flags) & mask) == 0) {
 1120			ret = dev;
 1121			break;
 1122		}
 1123	}
 1124	return ret;
 1125}
 1126EXPORT_SYMBOL(__dev_get_by_flags);
 1127
 1128/**
 1129 *	dev_valid_name - check if name is okay for network device
 1130 *	@name: name string
 1131 *
 1132 *	Network device names need to be valid file names to
 1133 *	to allow sysfs to work.  We also disallow any kind of
 1134 *	whitespace.
 1135 */
 1136bool dev_valid_name(const char *name)
 1137{
 1138	if (*name == '\0')
 1139		return false;
 1140	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 1141		return false;
 1142	if (!strcmp(name, ".") || !strcmp(name, ".."))
 1143		return false;
 1144
 1145	while (*name) {
 1146		if (*name == '/' || *name == ':' || isspace(*name))
 1147			return false;
 1148		name++;
 1149	}
 1150	return true;
 1151}
 1152EXPORT_SYMBOL(dev_valid_name);
 1153
 1154/**
 1155 *	__dev_alloc_name - allocate a name for a device
 1156 *	@net: network namespace to allocate the device name in
 1157 *	@name: name format string
 1158 *	@buf:  scratch buffer and result name string
 1159 *
 1160 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1161 *	id. It scans list of devices to build up a free map, then chooses
 1162 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1163 *	while allocating the name and adding the device in order to avoid
 1164 *	duplicates.
 1165 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1166 *	Returns the number of the unit assigned or a negative errno code.
 1167 */
 1168
 1169static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 1170{
 1171	int i = 0;
 1172	const char *p;
 1173	const int max_netdevices = 8*PAGE_SIZE;
 1174	unsigned long *inuse;
 1175	struct net_device *d;
 1176
 1177	if (!dev_valid_name(name))
 1178		return -EINVAL;
 1179
 1180	p = strchr(name, '%');
 1181	if (p) {
 1182		/*
 1183		 * Verify the string as this thing may have come from
 1184		 * the user.  There must be either one "%d" and no other "%"
 1185		 * characters.
 1186		 */
 1187		if (p[1] != 'd' || strchr(p + 2, '%'))
 1188			return -EINVAL;
 1189
 1190		/* Use one page as a bit array of possible slots */
 1191		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 1192		if (!inuse)
 1193			return -ENOMEM;
 1194
 1195		for_each_netdev(net, d) {
 1196			if (!sscanf(d->name, name, &i))
 1197				continue;
 1198			if (i < 0 || i >= max_netdevices)
 1199				continue;
 1200
 1201			/*  avoid cases where sscanf is not exact inverse of printf */
 1202			snprintf(buf, IFNAMSIZ, name, i);
 1203			if (!strncmp(buf, d->name, IFNAMSIZ))
 1204				set_bit(i, inuse);
 1205		}
 1206
 1207		i = find_first_zero_bit(inuse, max_netdevices);
 1208		free_page((unsigned long) inuse);
 1209	}
 1210
 1211	snprintf(buf, IFNAMSIZ, name, i);
 
 1212	if (!__dev_get_by_name(net, buf))
 1213		return i;
 1214
 1215	/* It is possible to run out of possible slots
 1216	 * when the name is long and there isn't enough space left
 1217	 * for the digits, or if all bits are used.
 1218	 */
 1219	return -ENFILE;
 1220}
 1221
 1222static int dev_alloc_name_ns(struct net *net,
 1223			     struct net_device *dev,
 1224			     const char *name)
 1225{
 1226	char buf[IFNAMSIZ];
 1227	int ret;
 1228
 1229	BUG_ON(!net);
 1230	ret = __dev_alloc_name(net, name, buf);
 1231	if (ret >= 0)
 1232		strlcpy(dev->name, buf, IFNAMSIZ);
 1233	return ret;
 1234}
 1235
 1236/**
 1237 *	dev_alloc_name - allocate a name for a device
 1238 *	@dev: device
 1239 *	@name: name format string
 1240 *
 1241 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1242 *	id. It scans list of devices to build up a free map, then chooses
 1243 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1244 *	while allocating the name and adding the device in order to avoid
 1245 *	duplicates.
 1246 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1247 *	Returns the number of the unit assigned or a negative errno code.
 1248 */
 1249
 1250int dev_alloc_name(struct net_device *dev, const char *name)
 1251{
 1252	return dev_alloc_name_ns(dev_net(dev), dev, name);
 
 
 
 
 
 
 
 
 
 1253}
 1254EXPORT_SYMBOL(dev_alloc_name);
 1255
 1256static int dev_get_valid_name(struct net *net, struct net_device *dev,
 1257			      const char *name)
 1258{
 1259	BUG_ON(!net);
 
 
 
 1260
 1261	if (!dev_valid_name(name))
 1262		return -EINVAL;
 1263
 1264	if (strchr(name, '%'))
 1265		return dev_alloc_name_ns(net, dev, name);
 1266	else if (__dev_get_by_name(net, name))
 1267		return -EEXIST;
 1268	else if (dev->name != name)
 1269		strlcpy(dev->name, name, IFNAMSIZ);
 1270
 1271	return 0;
 1272}
 1273
 1274/**
 1275 *	dev_change_name - change name of a device
 1276 *	@dev: device
 1277 *	@newname: name (or format string) must be at least IFNAMSIZ
 1278 *
 1279 *	Change name of a device, can pass format strings "eth%d".
 1280 *	for wildcarding.
 1281 */
 1282int dev_change_name(struct net_device *dev, const char *newname)
 1283{
 1284	unsigned char old_assign_type;
 1285	char oldname[IFNAMSIZ];
 1286	int err = 0;
 1287	int ret;
 1288	struct net *net;
 1289
 1290	ASSERT_RTNL();
 1291	BUG_ON(!dev_net(dev));
 1292
 1293	net = dev_net(dev);
 1294
 1295	/* Some auto-enslaved devices e.g. failover slaves are
 1296	 * special, as userspace might rename the device after
 1297	 * the interface had been brought up and running since
 1298	 * the point kernel initiated auto-enslavement. Allow
 1299	 * live name change even when these slave devices are
 1300	 * up and running.
 1301	 *
 1302	 * Typically, users of these auto-enslaving devices
 1303	 * don't actually care about slave name change, as
 1304	 * they are supposed to operate on master interface
 1305	 * directly.
 1306	 */
 1307	if (dev->flags & IFF_UP &&
 1308	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
 1309		return -EBUSY;
 1310
 1311	down_write(&devnet_rename_sem);
 1312
 1313	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
 1314		up_write(&devnet_rename_sem);
 1315		return 0;
 1316	}
 1317
 1318	memcpy(oldname, dev->name, IFNAMSIZ);
 1319
 1320	err = dev_get_valid_name(net, dev, newname);
 1321	if (err < 0) {
 1322		up_write(&devnet_rename_sem);
 1323		return err;
 1324	}
 1325
 1326	if (oldname[0] && !strchr(oldname, '%'))
 1327		netdev_info(dev, "renamed from %s\n", oldname);
 1328
 1329	old_assign_type = dev->name_assign_type;
 1330	dev->name_assign_type = NET_NAME_RENAMED;
 1331
 1332rollback:
 1333	ret = device_rename(&dev->dev, dev->name);
 1334	if (ret) {
 1335		memcpy(dev->name, oldname, IFNAMSIZ);
 1336		dev->name_assign_type = old_assign_type;
 1337		up_write(&devnet_rename_sem);
 1338		return ret;
 1339	}
 1340
 1341	up_write(&devnet_rename_sem);
 1342
 1343	netdev_adjacent_rename_links(dev, oldname);
 1344
 1345	write_lock_bh(&dev_base_lock);
 1346	netdev_name_node_del(dev->name_node);
 1347	write_unlock_bh(&dev_base_lock);
 1348
 1349	synchronize_rcu();
 1350
 1351	write_lock_bh(&dev_base_lock);
 1352	netdev_name_node_add(net, dev->name_node);
 1353	write_unlock_bh(&dev_base_lock);
 1354
 1355	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 1356	ret = notifier_to_errno(ret);
 1357
 1358	if (ret) {
 1359		/* err >= 0 after dev_alloc_name() or stores the first errno */
 1360		if (err >= 0) {
 1361			err = ret;
 1362			down_write(&devnet_rename_sem);
 1363			memcpy(dev->name, oldname, IFNAMSIZ);
 1364			memcpy(oldname, newname, IFNAMSIZ);
 1365			dev->name_assign_type = old_assign_type;
 1366			old_assign_type = NET_NAME_RENAMED;
 1367			goto rollback;
 1368		} else {
 1369			pr_err("%s: name change rollback failed: %d\n",
 1370			       dev->name, ret);
 1371		}
 1372	}
 1373
 1374	return err;
 1375}
 1376
 1377/**
 1378 *	dev_set_alias - change ifalias of a device
 1379 *	@dev: device
 1380 *	@alias: name up to IFALIASZ
 1381 *	@len: limit of bytes to copy from info
 1382 *
 1383 *	Set ifalias for a device,
 1384 */
 1385int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1386{
 1387	struct dev_ifalias *new_alias = NULL;
 
 
 1388
 1389	if (len >= IFALIASZ)
 1390		return -EINVAL;
 1391
 1392	if (len) {
 1393		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1394		if (!new_alias)
 1395			return -ENOMEM;
 1396
 1397		memcpy(new_alias->ifalias, alias, len);
 1398		new_alias->ifalias[len] = 0;
 1399	}
 1400
 1401	mutex_lock(&ifalias_mutex);
 1402	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
 1403					mutex_is_locked(&ifalias_mutex));
 1404	mutex_unlock(&ifalias_mutex);
 1405
 1406	if (new_alias)
 1407		kfree_rcu(new_alias, rcuhead);
 1408
 
 1409	return len;
 1410}
 1411EXPORT_SYMBOL(dev_set_alias);
 1412
 1413/**
 1414 *	dev_get_alias - get ifalias of a device
 1415 *	@dev: device
 1416 *	@name: buffer to store name of ifalias
 1417 *	@len: size of buffer
 1418 *
 1419 *	get ifalias for a device.  Caller must make sure dev cannot go
 1420 *	away,  e.g. rcu read lock or own a reference count to device.
 1421 */
 1422int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1423{
 1424	const struct dev_ifalias *alias;
 1425	int ret = 0;
 1426
 1427	rcu_read_lock();
 1428	alias = rcu_dereference(dev->ifalias);
 1429	if (alias)
 1430		ret = snprintf(name, len, "%s", alias->ifalias);
 1431	rcu_read_unlock();
 1432
 1433	return ret;
 1434}
 1435
 1436/**
 1437 *	netdev_features_change - device changes features
 1438 *	@dev: device to cause notification
 1439 *
 1440 *	Called to indicate a device has changed features.
 1441 */
 1442void netdev_features_change(struct net_device *dev)
 1443{
 1444	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 1445}
 1446EXPORT_SYMBOL(netdev_features_change);
 1447
 1448/**
 1449 *	netdev_state_change - device changes state
 1450 *	@dev: device to cause notification
 1451 *
 1452 *	Called to indicate a device has changed state. This function calls
 1453 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1454 *	to the routing socket.
 1455 */
 1456void netdev_state_change(struct net_device *dev)
 1457{
 1458	if (dev->flags & IFF_UP) {
 1459		struct netdev_notifier_change_info change_info = {
 1460			.info.dev = dev,
 1461		};
 1462
 1463		call_netdevice_notifiers_info(NETDEV_CHANGE,
 1464					      &change_info.info);
 1465		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
 1466	}
 1467}
 1468EXPORT_SYMBOL(netdev_state_change);
 1469
 
 
 
 
 
 
 1470/**
 1471 * netdev_notify_peers - notify network peers about existence of @dev
 1472 * @dev: network device
 
 1473 *
 1474 * Generate traffic such that interested network peers are aware of
 1475 * @dev, such as by generating a gratuitous ARP. This may be used when
 1476 * a device wants to inform the rest of the network about some sort of
 1477 * reconfiguration such as a failover event or virtual machine
 1478 * migration.
 1479 */
 1480void netdev_notify_peers(struct net_device *dev)
 
 1481{
 1482	rtnl_lock();
 1483	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 1484	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
 1485	rtnl_unlock();
 
 
 
 
 
 
 
 
 
 
 
 1486}
 1487EXPORT_SYMBOL(netdev_notify_peers);
 1488
 1489static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1490{
 1491	const struct net_device_ops *ops = dev->netdev_ops;
 1492	int ret;
 1493
 1494	ASSERT_RTNL();
 1495
 1496	if (!netif_device_present(dev)) {
 1497		/* may be detached because parent is runtime-suspended */
 1498		if (dev->dev.parent)
 1499			pm_runtime_resume(dev->dev.parent);
 1500		if (!netif_device_present(dev))
 1501			return -ENODEV;
 1502	}
 1503
 1504	/* Block netpoll from trying to do any rx path servicing.
 1505	 * If we don't do this there is a chance ndo_poll_controller
 1506	 * or ndo_poll may be running while we open the device
 1507	 */
 1508	netpoll_poll_disable(dev);
 1509
 1510	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1511	ret = notifier_to_errno(ret);
 1512	if (ret)
 1513		return ret;
 1514
 1515	set_bit(__LINK_STATE_START, &dev->state);
 1516
 1517	if (ops->ndo_validate_addr)
 1518		ret = ops->ndo_validate_addr(dev);
 1519
 1520	if (!ret && ops->ndo_open)
 1521		ret = ops->ndo_open(dev);
 1522
 1523	netpoll_poll_enable(dev);
 1524
 1525	if (ret)
 1526		clear_bit(__LINK_STATE_START, &dev->state);
 1527	else {
 1528		dev->flags |= IFF_UP;
 
 1529		dev_set_rx_mode(dev);
 1530		dev_activate(dev);
 1531		add_device_randomness(dev->dev_addr, dev->addr_len);
 1532	}
 1533
 1534	return ret;
 1535}
 1536
 1537/**
 1538 *	dev_open	- prepare an interface for use.
 1539 *	@dev: device to open
 1540 *	@extack: netlink extended ack
 1541 *
 1542 *	Takes a device from down to up state. The device's private open
 1543 *	function is invoked and then the multicast lists are loaded. Finally
 1544 *	the device is moved into the up state and a %NETDEV_UP message is
 1545 *	sent to the netdev notifier chain.
 1546 *
 1547 *	Calling this function on an active interface is a nop. On a failure
 1548 *	a negative errno code is returned.
 1549 */
 1550int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1551{
 1552	int ret;
 1553
 1554	if (dev->flags & IFF_UP)
 1555		return 0;
 1556
 1557	ret = __dev_open(dev, extack);
 1558	if (ret < 0)
 1559		return ret;
 1560
 1561	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1562	call_netdevice_notifiers(NETDEV_UP, dev);
 1563
 1564	return ret;
 1565}
 1566EXPORT_SYMBOL(dev_open);
 1567
 1568static void __dev_close_many(struct list_head *head)
 1569{
 1570	struct net_device *dev;
 1571
 1572	ASSERT_RTNL();
 1573	might_sleep();
 1574
 1575	list_for_each_entry(dev, head, close_list) {
 1576		/* Temporarily disable netpoll until the interface is down */
 1577		netpoll_poll_disable(dev);
 1578
 1579		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
 1580
 1581		clear_bit(__LINK_STATE_START, &dev->state);
 1582
 1583		/* Synchronize to scheduled poll. We cannot touch poll list, it
 1584		 * can be even on different cpu. So just clear netif_running().
 1585		 *
 1586		 * dev->stop() will invoke napi_disable() on all of it's
 1587		 * napi_struct instances on this device.
 1588		 */
 1589		smp_mb__after_atomic(); /* Commit netif_running(). */
 1590	}
 1591
 1592	dev_deactivate_many(head);
 1593
 1594	list_for_each_entry(dev, head, close_list) {
 1595		const struct net_device_ops *ops = dev->netdev_ops;
 1596
 1597		/*
 1598		 *	Call the device specific close. This cannot fail.
 1599		 *	Only if device is UP
 1600		 *
 1601		 *	We allow it to be called even after a DETACH hot-plug
 1602		 *	event.
 1603		 */
 1604		if (ops->ndo_stop)
 1605			ops->ndo_stop(dev);
 1606
 1607		dev->flags &= ~IFF_UP;
 1608		netpoll_poll_enable(dev);
 1609	}
 
 
 1610}
 1611
 1612static void __dev_close(struct net_device *dev)
 1613{
 
 1614	LIST_HEAD(single);
 1615
 1616	list_add(&dev->close_list, &single);
 1617	__dev_close_many(&single);
 1618	list_del(&single);
 
 1619}
 1620
 1621void dev_close_many(struct list_head *head, bool unlink)
 1622{
 1623	struct net_device *dev, *tmp;
 
 1624
 1625	/* Remove the devices that don't need to be closed */
 1626	list_for_each_entry_safe(dev, tmp, head, close_list)
 1627		if (!(dev->flags & IFF_UP))
 1628			list_del_init(&dev->close_list);
 1629
 1630	__dev_close_many(head);
 1631
 1632	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1633		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1634		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1635		if (unlink)
 1636			list_del_init(&dev->close_list);
 1637	}
 
 
 
 
 1638}
 1639EXPORT_SYMBOL(dev_close_many);
 1640
 1641/**
 1642 *	dev_close - shutdown an interface.
 1643 *	@dev: device to shutdown
 1644 *
 1645 *	This function moves an active device into down state. A
 1646 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1647 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1648 *	chain.
 1649 */
 1650void dev_close(struct net_device *dev)
 1651{
 1652	if (dev->flags & IFF_UP) {
 1653		LIST_HEAD(single);
 1654
 1655		list_add(&dev->close_list, &single);
 1656		dev_close_many(&single, true);
 1657		list_del(&single);
 1658	}
 
 1659}
 1660EXPORT_SYMBOL(dev_close);
 1661
 1662
 1663/**
 1664 *	dev_disable_lro - disable Large Receive Offload on a device
 1665 *	@dev: device
 1666 *
 1667 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1668 *	called under RTNL.  This is needed if received packets may be
 1669 *	forwarded to another interface.
 1670 */
 1671void dev_disable_lro(struct net_device *dev)
 1672{
 1673	struct net_device *lower_dev;
 1674	struct list_head *iter;
 
 
 
 
 1675
 1676	dev->wanted_features &= ~NETIF_F_LRO;
 1677	netdev_update_features(dev);
 1678
 1679	if (unlikely(dev->features & NETIF_F_LRO))
 1680		netdev_WARN(dev, "failed to disable LRO!\n");
 1681
 1682	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1683		dev_disable_lro(lower_dev);
 1684}
 1685EXPORT_SYMBOL(dev_disable_lro);
 1686
 1687/**
 1688 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1689 *	@dev: device
 1690 *
 1691 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1692 *	called under RTNL.  This is needed if Generic XDP is installed on
 1693 *	the device.
 1694 */
 1695static void dev_disable_gro_hw(struct net_device *dev)
 1696{
 1697	dev->wanted_features &= ~NETIF_F_GRO_HW;
 1698	netdev_update_features(dev);
 1699
 1700	if (unlikely(dev->features & NETIF_F_GRO_HW))
 1701		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 1702}
 1703
 1704const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 1705{
 1706#define N(val) 						\
 1707	case NETDEV_##val:				\
 1708		return "NETDEV_" __stringify(val);
 1709	switch (cmd) {
 1710	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 1711	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 1712	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
 1713	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
 1714	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
 1715	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
 1716	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 1717	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 1718	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 1719	N(PRE_CHANGEADDR)
 1720	}
 1721#undef N
 1722	return "UNKNOWN_NETDEV_EVENT";
 1723}
 1724EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1725
 1726static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 1727				   struct net_device *dev)
 1728{
 1729	struct netdev_notifier_info info = {
 1730		.dev = dev,
 1731	};
 1732
 1733	return nb->notifier_call(nb, val, &info);
 1734}
 1735
 1736static int call_netdevice_register_notifiers(struct notifier_block *nb,
 1737					     struct net_device *dev)
 1738{
 1739	int err;
 1740
 1741	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1742	err = notifier_to_errno(err);
 1743	if (err)
 1744		return err;
 1745
 1746	if (!(dev->flags & IFF_UP))
 1747		return 0;
 1748
 1749	call_netdevice_notifier(nb, NETDEV_UP, dev);
 1750	return 0;
 1751}
 1752
 1753static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
 1754						struct net_device *dev)
 1755{
 1756	if (dev->flags & IFF_UP) {
 1757		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1758					dev);
 1759		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1760	}
 1761	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1762}
 1763
 1764static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
 1765						 struct net *net)
 1766{
 1767	struct net_device *dev;
 1768	int err;
 1769
 1770	for_each_netdev(net, dev) {
 1771		err = call_netdevice_register_notifiers(nb, dev);
 1772		if (err)
 1773			goto rollback;
 1774	}
 1775	return 0;
 1776
 1777rollback:
 1778	for_each_netdev_continue_reverse(net, dev)
 1779		call_netdevice_unregister_notifiers(nb, dev);
 1780	return err;
 1781}
 1782
 1783static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
 1784						    struct net *net)
 1785{
 1786	struct net_device *dev;
 1787
 1788	for_each_netdev(net, dev)
 1789		call_netdevice_unregister_notifiers(nb, dev);
 1790}
 1791
  /*
   * Boot-phase flag, initialised to 1.  While it is non-zero,
   * register_netdevice_notifier() and __register_netdevice_notifier_net()
   * link a notifier into its chain but do not replay events for devices
   * that already exist.
   */
  1792static int dev_boot_phase = 1;
 1793
 1794/**
 1795 * register_netdevice_notifier - register a network notifier block
 1796 * @nb: notifier
 1797 *
 1798 * Register a notifier to be called when network device events occur.
 1799 * The notifier passed is linked into the kernel structures and must
 1800 * not be reused until it has been unregistered. A negative errno code
 1801 * is returned on a failure.
 1802 *
 1803 * When registered all registration and up events are replayed
 1804 * to the new notifier to allow device to have a race free
 1805 * view of the network device list.
 1806 */
 1807
 1808int register_netdevice_notifier(struct notifier_block *nb)
 1809{
 
 
 1810	struct net *net;
 1811	int err;
 1812
 1813	/* Close race with setup_net() and cleanup_net() */
 1814	down_write(&pernet_ops_rwsem);
 1815	rtnl_lock();
 1816	err = raw_notifier_chain_register(&netdev_chain, nb);
 1817	if (err)
 1818		goto unlock;
 1819	if (dev_boot_phase)
 1820		goto unlock;
 1821	for_each_net(net) {
 1822		err = call_netdevice_register_net_notifiers(nb, net);
 1823		if (err)
 1824			goto rollback;
 
 
 
 
 
 
 
 
 1825	}
 1826
 1827unlock:
 1828	rtnl_unlock();
 1829	up_write(&pernet_ops_rwsem);
 1830	return err;
 1831
 1832rollback:
 1833	for_each_net_continue_reverse(net)
 1834		call_netdevice_unregister_net_notifiers(nb, net);
 
 
 
 
 
 
 
 
 
 
 
 
 1835
 
 1836	raw_notifier_chain_unregister(&netdev_chain, nb);
 1837	goto unlock;
 1838}
 1839EXPORT_SYMBOL(register_netdevice_notifier);
 1840
 1841/**
 1842 * unregister_netdevice_notifier - unregister a network notifier block
 1843 * @nb: notifier
 1844 *
 1845 * Unregister a notifier previously registered by
 1846 * register_netdevice_notifier(). The notifier is unlinked into the
 1847 * kernel structures and may then be reused. A negative errno code
 1848 * is returned on a failure.
 1849 *
 1850 * After unregistering unregister and down device events are synthesized
 1851 * for all devices on the device list to the removed notifier to remove
 1852 * the need for special case cleanup code.
 1853 */
 1854
 1855int unregister_netdevice_notifier(struct notifier_block *nb)
 1856{
 
 1857	struct net *net;
 1858	int err;
 1859
 1860	/* Close race with setup_net() and cleanup_net() */
 1861	down_write(&pernet_ops_rwsem);
 1862	rtnl_lock();
 1863	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1864	if (err)
 1865		goto unlock;
 1866
 1867	for_each_net(net)
 1868		call_netdevice_unregister_net_notifiers(nb, net);
 1869
 
 
 
 
 
 
 
 1870unlock:
 1871	rtnl_unlock();
 1872	up_write(&pernet_ops_rwsem);
 1873	return err;
 1874}
 1875EXPORT_SYMBOL(unregister_netdevice_notifier);
 1876
 1877static int __register_netdevice_notifier_net(struct net *net,
 1878					     struct notifier_block *nb,
 1879					     bool ignore_call_fail)
 1880{
 1881	int err;
 1882
 1883	err = raw_notifier_chain_register(&net->netdev_chain, nb);
 1884	if (err)
 1885		return err;
 1886	if (dev_boot_phase)
 1887		return 0;
 1888
 1889	err = call_netdevice_register_net_notifiers(nb, net);
 1890	if (err && !ignore_call_fail)
 1891		goto chain_unregister;
 1892
 1893	return 0;
 1894
 1895chain_unregister:
 1896	raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1897	return err;
 1898}
 1899
 1900static int __unregister_netdevice_notifier_net(struct net *net,
 1901					       struct notifier_block *nb)
 1902{
 1903	int err;
 1904
 1905	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1906	if (err)
 1907		return err;
 1908
 1909	call_netdevice_unregister_net_notifiers(nb, net);
 1910	return 0;
 1911}
 1912
 1913/**
 1914 * register_netdevice_notifier_net - register a per-netns network notifier block
 1915 * @net: network namespace
 1916 * @nb: notifier
 1917 *
 1918 * Register a notifier to be called when network device events occur.
 1919 * The notifier passed is linked into the kernel structures and must
 1920 * not be reused until it has been unregistered. A negative errno code
 1921 * is returned on a failure.
 1922 *
 1923 * When registered all registration and up events are replayed
 1924 * to the new notifier to allow device to have a race free
 1925 * view of the network device list.
 1926 */
 1927
 1928int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
 1929{
 1930	int err;
 1931
 1932	rtnl_lock();
 1933	err = __register_netdevice_notifier_net(net, nb, false);
 1934	rtnl_unlock();
 1935	return err;
 1936}
 1937EXPORT_SYMBOL(register_netdevice_notifier_net);
 1938
 1939/**
 1940 * unregister_netdevice_notifier_net - unregister a per-netns
 1941 *                                     network notifier block
 1942 * @net: network namespace
 1943 * @nb: notifier
 1944 *
 1945 * Unregister a notifier previously registered by
 1946 * register_netdevice_notifier(). The notifier is unlinked into the
 1947 * kernel structures and may then be reused. A negative errno code
 1948 * is returned on a failure.
 1949 *
 1950 * After unregistering unregister and down device events are synthesized
 1951 * for all devices on the device list to the removed notifier to remove
 1952 * the need for special case cleanup code.
 1953 */
 1954
 1955int unregister_netdevice_notifier_net(struct net *net,
 1956				      struct notifier_block *nb)
 1957{
 1958	int err;
 1959
 1960	rtnl_lock();
 1961	err = __unregister_netdevice_notifier_net(net, nb);
 1962	rtnl_unlock();
 1963	return err;
 1964}
 1965EXPORT_SYMBOL(unregister_netdevice_notifier_net);
 1966
 1967int register_netdevice_notifier_dev_net(struct net_device *dev,
 1968					struct notifier_block *nb,
 1969					struct netdev_net_notifier *nn)
 1970{
 1971	int err;
 1972
 1973	rtnl_lock();
 1974	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
 1975	if (!err) {
 1976		nn->nb = nb;
 1977		list_add(&nn->list, &dev->net_notifier_list);
 1978	}
 1979	rtnl_unlock();
 1980	return err;
 1981}
 1982EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
 1983
 1984int unregister_netdevice_notifier_dev_net(struct net_device *dev,
 1985					  struct notifier_block *nb,
 1986					  struct netdev_net_notifier *nn)
 1987{
 1988	int err;
 1989
 1990	rtnl_lock();
 1991	list_del(&nn->list);
 1992	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
 1993	rtnl_unlock();
 1994	return err;
 1995}
 1996EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
 1997
 1998static void move_netdevice_notifiers_dev_net(struct net_device *dev,
 1999					     struct net *net)
 2000{
 2001	struct netdev_net_notifier *nn;
 2002
 2003	list_for_each_entry(nn, &dev->net_notifier_list, list) {
 2004		__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
 2005		__register_netdevice_notifier_net(net, nn->nb, true);
 2006	}
 2007}
 2008
 2009/**
 2010 *	call_netdevice_notifiers_info - call all network notifier blocks
 2011 *	@val: value passed unmodified to notifier function
 2012 *	@info: notifier information data
 2013 *
 2014 *	Call all network notifier blocks.  Parameters and return value
 2015 *	are as for raw_notifier_call_chain().
 2016 */
 2017
 2018static int call_netdevice_notifiers_info(unsigned long val,
 2019					 struct netdev_notifier_info *info)
 2020{
 2021	struct net *net = dev_net(info->dev);
 2022	int ret;
 2023
 2024	ASSERT_RTNL();
 2025
 2026	/* Run per-netns notifier block chain first, then run the global one.
 2027	 * Hopefully, one day, the global one is going to be removed after
 2028	 * all notifier block registrators get converted to be per-netns.
 2029	 */
 2030	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
 2031	if (ret & NOTIFY_STOP_MASK)
 2032		return ret;
 2033	return raw_notifier_call_chain(&netdev_chain, val, info);
 2034}
 2035
 2036static int call_netdevice_notifiers_extack(unsigned long val,
 2037					   struct net_device *dev,
 2038					   struct netlink_ext_ack *extack)
 2039{
 2040	struct netdev_notifier_info info = {
 2041		.dev = dev,
 2042		.extack = extack,
 2043	};
 2044
 2045	return call_netdevice_notifiers_info(val, &info);
 2046}
 2047
 2048/**
 2049 *	call_netdevice_notifiers - call all network notifier blocks
 2050 *      @val: value passed unmodified to notifier function
 2051 *      @dev: net_device pointer passed unmodified to notifier function
 2052 *
 2053 *	Call all network notifier blocks.  Parameters and return value
 2054 *	are as for raw_notifier_call_chain().
 2055 */
 2056
 2057int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 2058{
 2059	return call_netdevice_notifiers_extack(val, dev, NULL);
 
 2060}
 2061EXPORT_SYMBOL(call_netdevice_notifiers);
 2062
 2063/**
 2064 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 2065 *	@val: value passed unmodified to notifier function
 2066 *	@dev: net_device pointer passed unmodified to notifier function
 2067 *	@arg: additional u32 argument passed to the notifier function
 2068 *
 2069 *	Call all network notifier blocks.  Parameters and return value
 2070 *	are as for raw_notifier_call_chain().
 2071 */
 2072static int call_netdevice_notifiers_mtu(unsigned long val,
 2073					struct net_device *dev, u32 arg)
 2074{
 2075	struct netdev_notifier_info_ext info = {
 2076		.info.dev = dev,
 2077		.ext.mtu = arg,
 2078	};
 2079
 2080	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 2081
 2082	return call_netdevice_notifiers_info(val, &info.info);
 2083}
 2084
 2085#ifdef CONFIG_NET_INGRESS
 2086static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
 2087
 2088void net_inc_ingress_queue(void)
 2089{
 2090	static_branch_inc(&ingress_needed_key);
 2091}
 2092EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
 2093
 2094void net_dec_ingress_queue(void)
 2095{
 2096	static_branch_dec(&ingress_needed_key);
 2097}
 2098EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
 2099#endif
 2100
 2101#ifdef CONFIG_NET_EGRESS
 2102static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
 2103
 2104void net_inc_egress_queue(void)
 2105{
 2106	static_branch_inc(&egress_needed_key);
 2107}
 2108EXPORT_SYMBOL_GPL(net_inc_egress_queue);
 2109
 2110void net_dec_egress_queue(void)
 2111{
 2112	static_branch_dec(&egress_needed_key);
 2113}
 2114EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 2115#endif
 2116
 2117static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 2118#ifdef CONFIG_JUMP_LABEL
 2119static atomic_t netstamp_needed_deferred;
 2120static atomic_t netstamp_wanted;
 2121static void netstamp_clear(struct work_struct *work)
 2122{
 2123	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 2124	int wanted;
 2125
 2126	wanted = atomic_add_return(deferred, &netstamp_wanted);
 2127	if (wanted > 0)
 2128		static_branch_enable(&netstamp_needed_key);
 2129	else
 2130		static_branch_disable(&netstamp_needed_key);
 2131}
 2132static DECLARE_WORK(netstamp_work, netstamp_clear);
 2133#endif
 2134
 2135void net_enable_timestamp(void)
 2136{
 2137#ifdef CONFIG_JUMP_LABEL
 2138	int wanted;
 2139
 2140	while (1) {
 2141		wanted = atomic_read(&netstamp_wanted);
 2142		if (wanted <= 0)
 2143			break;
 2144		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
 2145			return;
 2146	}
 2147	atomic_inc(&netstamp_needed_deferred);
 2148	schedule_work(&netstamp_work);
 2149#else
 2150	static_branch_inc(&netstamp_needed_key);
 2151#endif
 
 
 2152}
 2153EXPORT_SYMBOL(net_enable_timestamp);
 2154
 2155void net_disable_timestamp(void)
 2156{
 2157#ifdef CONFIG_JUMP_LABEL
 2158	int wanted;
 2159
 2160	while (1) {
 2161		wanted = atomic_read(&netstamp_wanted);
 2162		if (wanted <= 1)
 2163			break;
 2164		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
 2165			return;
 2166	}
 2167	atomic_dec(&netstamp_needed_deferred);
 2168	schedule_work(&netstamp_work);
 2169#else
 2170	static_branch_dec(&netstamp_needed_key);
 2171#endif
 
 2172}
 2173EXPORT_SYMBOL(net_disable_timestamp);
 2174
 2175static inline void net_timestamp_set(struct sk_buff *skb)
 2176{
 2177	skb->tstamp = 0;
 2178	if (static_branch_unlikely(&netstamp_needed_key))
 2179		__net_timestamp(skb);
 2180}
 2181
 2182#define net_timestamp_check(COND, SKB)				\
 2183	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 2184		if ((COND) && !(SKB)->tstamp)			\
 2185			__net_timestamp(SKB);			\
 2186	}							\
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2187
 2188bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 
 2189{
 2190	unsigned int len;
 2191
 2192	if (!(dev->flags & IFF_UP))
 2193		return false;
 2194
 2195	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
 2196	if (skb->len <= len)
 2197		return true;
 2198
 2199	/* if TSO is enabled, we don't care about the length as the packet
 2200	 * could be forwarded without being segmented before
 2201	 */
 2202	if (skb_is_gso(skb))
 2203		return true;
 2204
 2205	return false;
 2206}
 2207EXPORT_SYMBOL_GPL(is_skb_forwardable);
 2208
 2209int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2210{
 2211	int ret = ____dev_forward_skb(dev, skb);
 2212
 2213	if (likely(!ret)) {
 2214		skb->protocol = eth_type_trans(skb, dev);
 2215		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 2216	}
 2217
 2218	return ret;
 2219}
 2220EXPORT_SYMBOL_GPL(__dev_forward_skb);
 2221
 2222/**
 2223 * dev_forward_skb - loopback an skb to another netif
 2224 *
 2225 * @dev: destination network device
 2226 * @skb: buffer to forward
 2227 *
 2228 * return values:
 2229 *	NET_RX_SUCCESS	(no congestion)
 2230 *	NET_RX_DROP     (packet was dropped, but freed)
 2231 *
 2232 * dev_forward_skb can be used for injecting an skb from the
 2233 * start_xmit function of one device into the receive queue
 2234 * of another device.
 2235 *
 2236 * The receiving device may be in another namespace, so
 2237 * we have to clear all information in the skb that could
 2238 * impact namespace isolation.
 2239 */
 2240int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2241{
 2242	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2243}
 2244EXPORT_SYMBOL_GPL(dev_forward_skb);
 2245
 2246static inline int deliver_skb(struct sk_buff *skb,
 2247			      struct packet_type *pt_prev,
 2248			      struct net_device *orig_dev)
 2249{
 2250	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 2251		return -ENOMEM;
 2252	refcount_inc(&skb->users);
 2253	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 2254}
 2255
 2256static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 2257					  struct packet_type **pt,
 2258					  struct net_device *orig_dev,
 2259					  __be16 type,
 2260					  struct list_head *ptype_list)
 2261{
 2262	struct packet_type *ptype, *pt_prev = *pt;
 2263
 2264	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2265		if (ptype->type != type)
 2266			continue;
 2267		if (pt_prev)
 2268			deliver_skb(skb, pt_prev, orig_dev);
 2269		pt_prev = ptype;
 2270	}
 2271	*pt = pt_prev;
 2272}
 2273
 2274static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 2275{
 2276	if (!ptype->af_packet_priv || !skb->sk)
 2277		return false;
 2278
 2279	if (ptype->id_match)
 2280		return ptype->id_match(ptype, skb->sk);
 2281	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 2282		return true;
 2283
 2284	return false;
 2285}
 2286
 2287/**
 2288 * dev_nit_active - return true if any network interface taps are in use
 2289 *
 2290 * @dev: network device to check for the presence of taps
 2291 */
 2292bool dev_nit_active(struct net_device *dev)
 2293{
 2294	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
 2295}
 2296EXPORT_SYMBOL_GPL(dev_nit_active);
 2297
 2298/*
 2299 *	Support routine. Sends outgoing frames to any network
 2300 *	taps currently in use.
 2301 */
 2302
 2303void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 2304{
 2305	struct packet_type *ptype;
 2306	struct sk_buff *skb2 = NULL;
 2307	struct packet_type *pt_prev = NULL;
 2308	struct list_head *ptype_list = &ptype_all;
 2309
 2310	rcu_read_lock();
 2311again:
 2312	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2313		if (ptype->ignore_outgoing)
 2314			continue;
 2315
 2316		/* Never send packets back to the socket
 2317		 * they originated from - MvS (miquels@drinkel.ow.org)
 2318		 */
 2319		if (skb_loop_sk(ptype, skb))
 2320			continue;
 
 
 
 
 
 2321
 2322		if (pt_prev) {
 2323			deliver_skb(skb2, pt_prev, skb->dev);
 2324			pt_prev = ptype;
 2325			continue;
 2326		}
 2327
 2328		/* need to clone skb, done only once */
 2329		skb2 = skb_clone(skb, GFP_ATOMIC);
 2330		if (!skb2)
 2331			goto out_unlock;
 2332
 2333		net_timestamp_set(skb2);
 2334
 2335		/* skb->nh should be correctly
 2336		 * set by sender, so that the second statement is
 2337		 * just protection against buggy protocols.
 2338		 */
 2339		skb_reset_mac_header(skb2);
 2340
 2341		if (skb_network_header(skb2) < skb2->data ||
 2342		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
 2343			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
 2344					     ntohs(skb2->protocol),
 2345					     dev->name);
 2346			skb_reset_network_header(skb2);
 2347		}
 2348
 2349		skb2->transport_header = skb2->network_header;
 2350		skb2->pkt_type = PACKET_OUTGOING;
 2351		pt_prev = ptype;
 2352	}
 
 
 
 2353
 2354	if (ptype_list == &ptype_all) {
 2355		ptype_list = &dev->ptype_all;
 2356		goto again;
 2357	}
 2358out_unlock:
 2359	if (pt_prev) {
 2360		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
 2361			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 2362		else
 2363			kfree_skb(skb2);
 2364	}
 
 
 2365	rcu_read_unlock();
 2366}
 2367EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2368
 2369/**
 2370 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2371 * @dev: Network device
 2372 * @txq: number of queues available
 2373 *
 2374 * If real_num_tx_queues is changed the tc mappings may no longer be
 2375 * valid. To resolve this verify the tc mapping remains valid and if
 2376 * not NULL the mapping. With no priorities mapping to this
 2377 * offset/count pair it will no longer be used. In the worst case TC0
 2378 * is invalid nothing can be done so disable priority mappings. If is
 2379 * expected that drivers will fix this mapping if they can before
 2380 * calling netif_set_real_num_tx_queues.
 2381 */
 2382static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2383{
 2384	int i;
 2385	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2386
 2387	/* If TC0 is invalidated disable TC mapping */
 2388	if (tc->offset + tc->count > txq) {
 2389		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2390		dev->num_tc = 0;
 2391		return;
 2392	}
 2393
 2394	/* Invalidated prio to tc mappings set to TC0 */
 2395	for (i = 1; i < TC_BITMASK + 1; i++) {
 2396		int q = netdev_get_prio_tc_map(dev, i);
 2397
 2398		tc = &dev->tc_to_txq[q];
 2399		if (tc->offset + tc->count > txq) {
 2400			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2401				i, q);
 2402			netdev_set_prio_tc_map(dev, i, 0);
 2403		}
 2404	}
 2405}
 2406
 2407int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2408{
 2409	if (dev->num_tc) {
 2410		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2411		int i;
 2412
 2413		/* walk through the TCs and see if it falls into any of them */
 2414		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2415			if ((txq - tc->offset) < tc->count)
 2416				return i;
 2417		}
 2418
 2419		/* didn't find it, just return -1 to indicate no match */
 2420		return -1;
 2421	}
 2422
 2423	return 0;
 2424}
 2425EXPORT_SYMBOL(netdev_txq_to_tc);
 2426
 2427#ifdef CONFIG_XPS
  /*
   * Static key incremented by __netif_set_xps_queue() when a device gets
   * its first map of a kind and decremented by reset_xps_maps().
   */
  2428struct static_key xps_needed __read_mostly;
  2429EXPORT_SYMBOL(xps_needed);
  /* Same as xps_needed, but only counts rx-queue based maps. */
  2430struct static_key xps_rxqs_needed __read_mostly;
  2431EXPORT_SYMBOL(xps_rxqs_needed);
  /* Held while the XPS maps of a device are read for update or modified. */
  2432static DEFINE_MUTEX(xps_map_mutex);
  /* Dereference an RCU-managed XPS pointer; xps_map_mutex must be held. */
  2433#define xmap_dereference(P)		\
  2434	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2435
 2436static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 2437			     int tci, u16 index)
 2438{
 2439	struct xps_map *map = NULL;
 2440	int pos;
 2441
 2442	if (dev_maps)
 2443		map = xmap_dereference(dev_maps->attr_map[tci]);
 2444	if (!map)
 2445		return false;
 2446
 2447	for (pos = map->len; pos--;) {
 2448		if (map->queues[pos] != index)
 2449			continue;
 2450
 2451		if (map->len > 1) {
 2452			map->queues[pos] = map->queues[--map->len];
 2453			break;
 2454		}
 2455
 2456		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2457		kfree_rcu(map, rcu);
 2458		return false;
 2459	}
 2460
 2461	return true;
 2462}
 2463
 2464static bool remove_xps_queue_cpu(struct net_device *dev,
 2465				 struct xps_dev_maps *dev_maps,
 2466				 int cpu, u16 offset, u16 count)
 2467{
 2468	int num_tc = dev->num_tc ? : 1;
 2469	bool active = false;
 2470	int tci;
 2471
 2472	for (tci = cpu * num_tc; num_tc--; tci++) {
 2473		int i, j;
 2474
 2475		for (i = count, j = offset; i--; j++) {
 2476			if (!remove_xps_queue(dev_maps, tci, j))
 2477				break;
 2478		}
 2479
 2480		active |= i < 0;
 2481	}
 2482
 2483	return active;
 2484}
 2485
 2486static void reset_xps_maps(struct net_device *dev,
 2487			   struct xps_dev_maps *dev_maps,
 2488			   bool is_rxqs_map)
 2489{
 2490	if (is_rxqs_map) {
 2491		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 2492		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
 2493	} else {
 2494		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
 2495	}
 2496	static_key_slow_dec_cpuslocked(&xps_needed);
 2497	kfree_rcu(dev_maps, rcu);
 2498}
 2499
 2500static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
 2501			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
 2502			   u16 offset, u16 count, bool is_rxqs_map)
 2503{
 2504	bool active = false;
 2505	int i, j;
 2506
 2507	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
 2508	     j < nr_ids;)
 2509		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
 2510					       count);
 2511	if (!active)
 2512		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2513
 2514	if (!is_rxqs_map) {
 2515		for (i = offset + (count - 1); count--; i--) {
 2516			netdev_queue_numa_node_write(
 2517				netdev_get_tx_queue(dev, i),
 2518				NUMA_NO_NODE);
 2519		}
 2520	}
 2521}
 2522
 2523static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 2524				   u16 count)
 2525{
 2526	const unsigned long *possible_mask = NULL;
 2527	struct xps_dev_maps *dev_maps;
 2528	unsigned int nr_ids;
 2529
 2530	if (!static_key_false(&xps_needed))
 2531		return;
 2532
 2533	cpus_read_lock();
 2534	mutex_lock(&xps_map_mutex);
 2535
 2536	if (static_key_false(&xps_rxqs_needed)) {
 2537		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2538		if (dev_maps) {
 2539			nr_ids = dev->num_rx_queues;
 2540			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
 2541				       offset, count, true);
 2542		}
 2543	}
 2544
 2545	dev_maps = xmap_dereference(dev->xps_cpus_map);
 2546	if (!dev_maps)
 2547		goto out_no_maps;
 2548
 2549	if (num_possible_cpus() > 1)
 2550		possible_mask = cpumask_bits(cpu_possible_mask);
 2551	nr_ids = nr_cpu_ids;
 2552	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
 2553		       false);
 2554
 2555out_no_maps:
 2556	mutex_unlock(&xps_map_mutex);
 2557	cpus_read_unlock();
 2558}
 2559
 2560static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 2561{
 2562	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 2563}
 2564
 2565static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 2566				      u16 index, bool is_rxqs_map)
 2567{
 2568	struct xps_map *new_map;
 2569	int alloc_len = XPS_MIN_MAP_ALLOC;
 2570	int i, pos;
 2571
 2572	for (pos = 0; map && pos < map->len; pos++) {
 2573		if (map->queues[pos] != index)
 2574			continue;
 2575		return map;
 2576	}
 2577
 2578	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 2579	if (map) {
 2580		if (pos < map->alloc_len)
 2581			return map;
 2582
 2583		alloc_len = map->alloc_len * 2;
 2584	}
 2585
 2586	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
 2587	 *  map
 2588	 */
 2589	if (is_rxqs_map)
 2590		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
 2591	else
 2592		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
 2593				       cpu_to_node(attr_index));
 2594	if (!new_map)
 2595		return NULL;
 2596
 2597	for (i = 0; i < pos; i++)
 2598		new_map->queues[i] = map->queues[i];
 2599	new_map->alloc_len = alloc_len;
 2600	new_map->len = pos;
 2601
 2602	return new_map;
 2603}
 2604
  2605/* Must be called under cpus_read_lock */
  /*
   * __netif_set_xps_queue - make tx queue @index selectable from the ids in @mask
   * @dev: device owning the queue; when the device has traffic classes and
   *       the queue has an sb_dev, that sb_dev is used instead
   * @mask: bitmap of CPU ids, or of rx queue ids when @is_rxqs_map is true
   * @index: tx queue index
   * @is_rxqs_map: operate on dev->xps_rxqs_map instead of dev->xps_cpus_map
   *
   * Builds a replacement xps_dev_maps under xps_map_mutex, publishes it
   * with rcu_assign_pointer(), releases superseded maps with kfree_rcu()
   * and finally removes @index from the entries that were not selected.
   *
   * Returns 0 on success, -EINVAL when dev->num_tc is negative or @index
   * does not fall into any traffic class, and -ENOMEM when an allocation
   * fails.
   */
  2606int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
  2607			  u16 index, bool is_rxqs_map)
  2608{
  2609	const unsigned long *online_mask = NULL, *possible_mask = NULL;
  2610	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
  2611	int i, j, tci, numa_node_id = -2;
  2612	int maps_sz, num_tc = 1, tc = 0;
  2613	struct xps_map *map, *new_map;
  2614	bool active = false;
  2615	unsigned int nr_ids;
  2616
  2617	if (dev->num_tc) {
  2618		/* Do not allow XPS on subordinate device directly */
  2619		num_tc = dev->num_tc;
  2620		if (num_tc < 0)
  2621			return -EINVAL;
  2622
  2623		/* If queue belongs to subordinate dev use its map */
  2624		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
  2625
  2626		tc = netdev_txq_to_tc(dev, index);
  2627		if (tc < 0)
  2628			return -EINVAL;
  2629	}
  2630
  2631	mutex_lock(&xps_map_mutex);
  	/* Pick the map kind: ids are rx queues or CPUs. */
  2632	if (is_rxqs_map) {
  2633		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
  2634		dev_maps = xmap_dereference(dev->xps_rxqs_map);
  2635		nr_ids = dev->num_rx_queues;
  2636	} else {
  2637		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
  2638		if (num_possible_cpus() > 1) {
  2639			online_mask = cpumask_bits(cpu_online_mask);
  2640			possible_mask = cpumask_bits(cpu_possible_mask);
  2641		}
  2642		dev_maps = xmap_dereference(dev->xps_cpus_map);
  2643		nr_ids = nr_cpu_ids;
  2644	}
  2645
  	/* Never allocate less than one cache line for the top-level table. */
  2646	if (maps_sz < L1_CACHE_BYTES)
  2647		maps_sz = L1_CACHE_BYTES;
  2648
  2649	/* allocate memory for queue storage */
  2650	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
  2651	     j < nr_ids;) {
  2652		if (!new_dev_maps)
  2653			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
  2654		if (!new_dev_maps) {
  2655			mutex_unlock(&xps_map_mutex);
  2656			return -ENOMEM;
  2657		}
  2658
  2659		tci = j * num_tc + tc;
  2660		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
  2661				 NULL;
  2662
  		/* May return the old map itself if it has room or already lists index. */
  2663		map = expand_xps_map(map, j, index, is_rxqs_map);
  2664		if (!map)
  2665			goto error;
  2666
  2667		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
  2668	}
  2669
  	/* No id was selected above: nothing to add, only removals remain. */
  2670	if (!new_dev_maps)
  2671		goto out_no_new_maps;
  2672
  2673	if (!dev_maps) {
  2674		/* Increment static keys at most once per type */
  2675		static_key_slow_inc_cpuslocked(&xps_needed);
  2676		if (is_rxqs_map)
  2677			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
  2678	}
  2679
  2680	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
  2681	     j < nr_ids;) {
  2682		/* copy maps belonging to foreign traffic classes */
  2683		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
  2684			/* fill in the new device map from the old device map */
  2685			map = xmap_dereference(dev_maps->attr_map[tci]);
  2686			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
  2687		}
  2688
  2689		/* We need to explicitly update tci as previous loop
  2690		 * could break out early if dev_maps is NULL.
  2691		 */
  2692		tci = j * num_tc + tc;
  2693
  2694		if (netif_attr_test_mask(j, mask, nr_ids) &&
  2695		    netif_attr_test_online(j, online_mask, nr_ids)) {
  2696			/* add tx-queue to CPU/rx-queue maps */
  2697			int pos = 0;
  2698
  2699			map = xmap_dereference(new_dev_maps->attr_map[tci]);
  2700			while ((pos < map->len) && (map->queues[pos] != index))
  2701				pos++;
  2702
  2703			if (pos == map->len)
  2704				map->queues[map->len++] = index;
  2705#ifdef CONFIG_NUMA
  			/* -2: no node seen yet, -1: selected CPUs span several nodes */
  2706			if (!is_rxqs_map) {
  2707				if (numa_node_id == -2)
  2708					numa_node_id = cpu_to_node(j);
  2709				else if (numa_node_id != cpu_to_node(j))
  2710					numa_node_id = -1;
  2711			}
  2712#endif
  2713		} else if (dev_maps) {
  2714			/* fill in the new device map from the old device map */
  2715			map = xmap_dereference(dev_maps->attr_map[tci]);
  2716			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
  2717		}
  2718
  2719		/* copy maps belonging to foreign traffic classes */
  2720		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
  2721			/* fill in the new device map from the old device map */
  2722			map = xmap_dereference(dev_maps->attr_map[tci]);
  2723			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
  2724		}
  2725	}
  2726
  	/* Publish the new table. */
  2727	if (is_rxqs_map)
  2728		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
  2729	else
  2730		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
  2731
  2732	/* Cleanup old maps */
  2733	if (!dev_maps)
  2734		goto out_no_old_maps;
  2735
  2736	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
  2737	     j < nr_ids;) {
  2738		for (i = num_tc, tci = j * num_tc; i--; tci++) {
  2739			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
  2740			map = xmap_dereference(dev_maps->attr_map[tci]);
  			/* Only free per-id maps that were replaced, not carried over. */
  2741			if (map && map != new_map)
  2742				kfree_rcu(map, rcu);
  2743		}
  2744	}
  2745
  2746	kfree_rcu(dev_maps, rcu);
  2747
  2748out_no_old_maps:
  2749	dev_maps = new_dev_maps;
  2750	active = true;
  2751
  2752out_no_new_maps:
  2753	if (!is_rxqs_map) {
  2754		/* update Tx queue numa node */
  2755		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
  2756					     (numa_node_id >= 0) ?
  2757					     numa_node_id : NUMA_NO_NODE);
  2758	}
  2759
  2760	if (!dev_maps)
  2761		goto out_no_maps;
  2762
  2763	/* removes tx-queue from unused CPUs/rx-queues */
  2764	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
  2765	     j < nr_ids;) {
  2766		for (i = tc, tci = j * num_tc; i--; tci++)
  2767			active |= remove_xps_queue(dev_maps, tci, index);
  2768		if (!netif_attr_test_mask(j, mask, nr_ids) ||
  2769		    !netif_attr_test_online(j, online_mask, nr_ids))
  2770			active |= remove_xps_queue(dev_maps, tci, index);
  2771		for (i = num_tc - tc, tci++; --i; tci++)
  2772			active |= remove_xps_queue(dev_maps, tci, index);
  2773	}
  2774
  2775	/* free map if not active */
  2776	if (!active)
  2777		reset_xps_maps(dev, dev_maps, is_rxqs_map);
  2778
  2779out_no_maps:
  2780	mutex_unlock(&xps_map_mutex);
  2781
  2782	return 0;
  2783error:
  2784	/* remove any maps that we added */
  	/* Plain kfree(): this path is only reached before the new table is published. */
  2785	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
  2786	     j < nr_ids;) {
  2787		for (i = num_tc, tci = j * num_tc; i--; tci++) {
  2788			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
  2789			map = dev_maps ?
  2790			      xmap_dereference(dev_maps->attr_map[tci]) :
  2791			      NULL;
  2792			if (new_map && new_map != map)
  2793				kfree(new_map);
  2794		}
  2795	}
  2796
  2797	mutex_unlock(&xps_map_mutex);
  2798
  2799	kfree(new_dev_maps);
  2800	return -ENOMEM;
  2801}
  2802EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2803
 2804int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2805			u16 index)
 2806{
 2807	int ret;
 2808
 2809	cpus_read_lock();
 2810	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
 2811	cpus_read_unlock();
 2812
 2813	return ret;
 2814}
 2815EXPORT_SYMBOL(netif_set_xps_queue);
 2816
 2817#endif
 2818static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2819{
 2820	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2821
 2822	/* Unbind any subordinate channels */
 2823	while (txq-- != &dev->_tx[0]) {
 2824		if (txq->sb_dev)
 2825			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2826	}
 2827}
 2828
 2829void netdev_reset_tc(struct net_device *dev)
 2830{
 2831#ifdef CONFIG_XPS
 2832	netif_reset_xps_queues_gt(dev, 0);
 2833#endif
 2834	netdev_unbind_all_sb_channels(dev);
 2835
 2836	/* Reset TC configuration of device */
 2837	dev->num_tc = 0;
 2838	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 2839	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
 2840}
 2841EXPORT_SYMBOL(netdev_reset_tc);
 2842
 2843int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2844{
 2845	if (tc >= dev->num_tc)
 2846		return -EINVAL;
 2847
 2848#ifdef CONFIG_XPS
 2849	netif_reset_xps_queues(dev, offset, count);
 2850#endif
 2851	dev->tc_to_txq[tc].count = count;
 2852	dev->tc_to_txq[tc].offset = offset;
 2853	return 0;
 2854}
 2855EXPORT_SYMBOL(netdev_set_tc_queue);
 2856
 2857int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 2858{
 2859	if (num_tc > TC_MAX_QUEUE)
 2860		return -EINVAL;
 2861
 2862#ifdef CONFIG_XPS
 2863	netif_reset_xps_queues_gt(dev, 0);
 2864#endif
 2865	netdev_unbind_all_sb_channels(dev);
 2866
 2867	dev->num_tc = num_tc;
 2868	return 0;
 2869}
 2870EXPORT_SYMBOL(netdev_set_num_tc);
 2871
 2872void netdev_unbind_sb_channel(struct net_device *dev,
 2873			      struct net_device *sb_dev)
 2874{
 2875	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2876
 2877#ifdef CONFIG_XPS
 2878	netif_reset_xps_queues_gt(sb_dev, 0);
 2879#endif
 2880	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2881	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2882
 2883	while (txq-- != &dev->_tx[0]) {
 2884		if (txq->sb_dev == sb_dev)
 2885			txq->sb_dev = NULL;
 2886	}
 2887}
 2888EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2889
 2890int netdev_bind_sb_channel_queue(struct net_device *dev,
 2891				 struct net_device *sb_dev,
 2892				 u8 tc, u16 count, u16 offset)
 2893{
 2894	/* Make certain the sb_dev and dev are already configured */
 2895	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2896		return -EINVAL;
 2897
 2898	/* We cannot hand out queues we don't have */
 2899	if ((offset + count) > dev->real_num_tx_queues)
 2900		return -EINVAL;
 2901
 2902	/* Record the mapping */
 2903	sb_dev->tc_to_txq[tc].count = count;
 2904	sb_dev->tc_to_txq[tc].offset = offset;
 2905
 2906	/* Provide a way for Tx queue to find the tc_to_txq map or
 2907	 * XPS map for itself.
 2908	 */
 2909	while (count--)
 2910		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2911
 2912	return 0;
 2913}
 2914EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2915
 2916int netdev_set_sb_channel(struct net_device *dev, u16 channel)
 2917{
 2918	/* Do not use a multiqueue device to represent a subordinate channel */
 2919	if (netif_is_multiqueue(dev))
 2920		return -ENODEV;
 2921
 2922	/* We allow channels 1 - 32767 to be used for subordinate channels.
 2923	 * Channel 0 is meant to be "native" mode and used only to represent
 2924	 * the main root device. We allow writing 0 to reset the device back
 2925	 * to normal mode after being used as a subordinate channel.
 2926	 */
 2927	if (channel > S16_MAX)
 2928		return -EINVAL;
 2929
 2930	dev->num_tc = -channel;
 2931
 2932	return 0;
 2933}
 2934EXPORT_SYMBOL(netdev_set_sb_channel);
 2935
 2936/*
 2937 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2938 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 2939 */
 2940int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2941{
 2942	bool disabling;
 2943	int rc;
 2944
 2945	disabling = txq < dev->real_num_tx_queues;
 2946
 2947	if (txq < 1 || txq > dev->num_tx_queues)
 2948		return -EINVAL;
 2949
 2950	if (dev->reg_state == NETREG_REGISTERED ||
 2951	    dev->reg_state == NETREG_UNREGISTERING) {
 2952		ASSERT_RTNL();
 2953
 2954		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 2955						  txq);
 2956		if (rc)
 2957			return rc;
 2958
 2959		if (dev->num_tc)
 2960			netif_setup_tc(dev, txq);
 2961
 2962		dev->real_num_tx_queues = txq;
 2963
 2964		if (disabling) {
 2965			synchronize_net();
 2966			qdisc_reset_all_tx_gt(dev, txq);
 2967#ifdef CONFIG_XPS
 2968			netif_reset_xps_queues_gt(dev, txq);
 2969#endif
 2970		}
 2971	} else {
 2972		dev->real_num_tx_queues = txq;
 2973	}
 2974
 
 2975	return 0;
 2976}
 2977EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 2978
 2979#ifdef CONFIG_SYSFS
 2980/**
 2981 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 2982 *	@dev: Network device
 2983 *	@rxq: Actual number of RX queues
 2984 *
 2985 *	This must be called either with the rtnl_lock held or before
 2986 *	registration of the net device.  Returns 0 on success, or a
 2987 *	negative error code.  If called before registration, it always
 2988 *	succeeds.
 2989 */
 2990int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 2991{
 2992	int rc;
 2993
 2994	if (rxq < 1 || rxq > dev->num_rx_queues)
 2995		return -EINVAL;
 2996
 2997	if (dev->reg_state == NETREG_REGISTERED) {
 2998		ASSERT_RTNL();
 2999
 3000		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 3001						  rxq);
 3002		if (rc)
 3003			return rc;
 3004	}
 3005
 3006	dev->real_num_rx_queues = rxq;
 3007	return 0;
 3008}
 3009EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 3010#endif
 3011
 3012/**
 3013 * netif_get_num_default_rss_queues - default number of RSS queues
 3014 *
 3015 * This routine should set an upper limit on the number of RSS queues
 3016 * used by default by multiqueue devices.
 3017 */
 3018int netif_get_num_default_rss_queues(void)
 3019{
 3020	return is_kdump_kernel() ?
 3021		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
 3022}
 3023EXPORT_SYMBOL(netif_get_num_default_rss_queues);
 3024
 3025static void __netif_reschedule(struct Qdisc *q)
 3026{
 3027	struct softnet_data *sd;
 3028	unsigned long flags;
 3029
 3030	local_irq_save(flags);
 3031	sd = this_cpu_ptr(&softnet_data);
 3032	q->next_sched = NULL;
 3033	*sd->output_queue_tailp = q;
 3034	sd->output_queue_tailp = &q->next_sched;
 3035	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3036	local_irq_restore(flags);
 3037}
 3038
 3039void __netif_schedule(struct Qdisc *q)
 3040{
 3041	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 3042		__netif_reschedule(q);
 3043}
 3044EXPORT_SYMBOL(__netif_schedule);
 3045
 3046struct dev_kfree_skb_cb {
 3047	enum skb_free_reason reason;
 3048};
 3049
 3050static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 3051{
 3052	return (struct dev_kfree_skb_cb *)skb->cb;
 3053}
 3054
 3055void netif_schedule_queue(struct netdev_queue *txq)
 3056{
 3057	rcu_read_lock();
 3058	if (!netif_xmit_stopped(txq)) {
 3059		struct Qdisc *q = rcu_dereference(txq->qdisc);
 3060
 3061		__netif_schedule(q);
 3062	}
 3063	rcu_read_unlock();
 3064}
 3065EXPORT_SYMBOL(netif_schedule_queue);
 3066
 3067void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 3068{
 3069	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 3070		struct Qdisc *q;
 3071
 3072		rcu_read_lock();
 3073		q = rcu_dereference(dev_queue->qdisc);
 3074		__netif_schedule(q);
 3075		rcu_read_unlock();
 3076	}
 3077}
 3078EXPORT_SYMBOL(netif_tx_wake_queue);
 3079
 3080void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
 3081{
 3082	unsigned long flags;
 3083
 3084	if (unlikely(!skb))
 3085		return;
 3086
 3087	if (likely(refcount_read(&skb->users) == 1)) {
 3088		smp_rmb();
 3089		refcount_set(&skb->users, 0);
 3090	} else if (likely(!refcount_dec_and_test(&skb->users))) {
 3091		return;
 3092	}
 3093	get_kfree_skb_cb(skb)->reason = reason;
 3094	local_irq_save(flags);
 3095	skb->next = __this_cpu_read(softnet_data.completion_queue);
 3096	__this_cpu_write(softnet_data.completion_queue, skb);
 3097	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3098	local_irq_restore(flags);
 3099}
 3100EXPORT_SYMBOL(__dev_kfree_skb_irq);
 3101
 3102void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
 3103{
 3104	if (in_irq() || irqs_disabled())
 3105		__dev_kfree_skb_irq(skb, reason);
 3106	else
 3107		dev_kfree_skb(skb);
 3108}
 3109EXPORT_SYMBOL(__dev_kfree_skb_any);
 3110
 3111
 3112/**
 3113 * netif_device_detach - mark device as removed
 3114 * @dev: network device
 3115 *
 3116 * Mark device as removed from system and therefore no longer available.
 3117 */
 3118void netif_device_detach(struct net_device *dev)
 3119{
 3120	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3121	    netif_running(dev)) {
 3122		netif_tx_stop_all_queues(dev);
 3123	}
 3124}
 3125EXPORT_SYMBOL(netif_device_detach);
 3126
 3127/**
 3128 * netif_device_attach - mark device as attached
 3129 * @dev: network device
 3130 *
 3131 * Mark device as attached from system and restart if needed.
 3132 */
 3133void netif_device_attach(struct net_device *dev)
 3134{
 3135	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3136	    netif_running(dev)) {
 3137		netif_tx_wake_all_queues(dev);
 3138		__netdev_watchdog_up(dev);
 3139	}
 3140}
 3141EXPORT_SYMBOL(netif_device_attach);
 3142
 3143/*
 3144 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
 3145 * to be used as a distribution range.
 3146 */
 3147static u16 skb_tx_hash(const struct net_device *dev,
 3148		       const struct net_device *sb_dev,
 3149		       struct sk_buff *skb)
 3150{
 3151	u32 hash;
 3152	u16 qoffset = 0;
 3153	u16 qcount = dev->real_num_tx_queues;
 3154
 3155	if (dev->num_tc) {
 3156		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 3157
 3158		qoffset = sb_dev->tc_to_txq[tc].offset;
 3159		qcount = sb_dev->tc_to_txq[tc].count;
 3160	}
 3161
 3162	if (skb_rx_queue_recorded(skb)) {
 3163		hash = skb_get_rx_queue(skb);
 3164		if (hash >= qoffset)
 3165			hash -= qoffset;
 3166		while (unlikely(hash >= qcount))
 3167			hash -= qcount;
 3168		return hash + qoffset;
 3169	}
 3170
 3171	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 3172}
 3173
 3174static void skb_warn_bad_offload(const struct sk_buff *skb)
 3175{
 3176	static const netdev_features_t null_features;
 3177	struct net_device *dev = skb->dev;
 3178	const char *name = "";
 3179
 3180	if (!net_ratelimit())
 3181		return;
 3182
 3183	if (dev) {
 3184		if (dev->dev.parent)
 3185			name = dev_driver_string(dev->dev.parent);
 3186		else
 3187			name = netdev_name(dev);
 3188	}
 3189	skb_dump(KERN_WARNING, skb, false);
 3190	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 3191	     name, dev ? &dev->features : &null_features,
 3192	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 3193}
 3194
 3195/*
 3196 * Invalidate hardware checksum when packet is to be mangled, and
 3197 * complete checksum manually on outgoing path.
 3198 */
 3199int skb_checksum_help(struct sk_buff *skb)
 3200{
 3201	__wsum csum;
 3202	int ret = 0, offset;
 3203
 3204	if (skb->ip_summed == CHECKSUM_COMPLETE)
 3205		goto out_set_summed;
 3206
 3207	if (unlikely(skb_shinfo(skb)->gso_size)) {
 3208		skb_warn_bad_offload(skb);
 3209		return -EINVAL;
 3210	}
 3211
 3212	/* Before computing a checksum, we should make sure no frag could
 3213	 * be modified by an external entity : checksum could be wrong.
 3214	 */
 3215	if (skb_has_shared_frag(skb)) {
 3216		ret = __skb_linearize(skb);
 3217		if (ret)
 3218			goto out;
 3219	}
 3220
 3221	offset = skb_checksum_start_offset(skb);
 3222	BUG_ON(offset >= skb_headlen(skb));
 3223	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 3224
 3225	offset += skb->csum_offset;
 3226	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
 3227
 3228	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
 3229	if (ret)
 3230		goto out;
 3231
 3232	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 3233out_set_summed:
 3234	skb->ip_summed = CHECKSUM_NONE;
 3235out:
 3236	return ret;
 3237}
 3238EXPORT_SYMBOL(skb_checksum_help);
 3239
 3240int skb_crc32c_csum_help(struct sk_buff *skb)
 3241{
 3242	__le32 crc32c_csum;
 3243	int ret = 0, offset, start;
 3244
 3245	if (skb->ip_summed != CHECKSUM_PARTIAL)
 3246		goto out;
 3247
 3248	if (unlikely(skb_is_gso(skb)))
 3249		goto out;
 3250
 3251	/* Before computing a checksum, we should make sure no frag could
 3252	 * be modified by an external entity : checksum could be wrong.
 3253	 */
 3254	if (unlikely(skb_has_shared_frag(skb))) {
 3255		ret = __skb_linearize(skb);
 3256		if (ret)
 3257			goto out;
 3258	}
 3259	start = skb_checksum_start_offset(skb);
 3260	offset = start + offsetof(struct sctphdr, checksum);
 3261	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 3262		ret = -EINVAL;
 3263		goto out;
 3264	}
 3265
 3266	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
 3267	if (ret)
 3268		goto out;
 3269
 3270	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 3271						  skb->len - start, ~(__u32)0,
 3272						  crc32c_csum_stub));
 3273	*(__le32 *)(skb->data + offset) = crc32c_csum;
 3274	skb->ip_summed = CHECKSUM_NONE;
 3275	skb->csum_not_inet = 0;
 3276out:
 3277	return ret;
 3278}
 3279
 3280__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 3281{
 3282	__be16 type = skb->protocol;
 3283
 3284	/* Tunnel gso handlers can set protocol to ethernet. */
 3285	if (type == htons(ETH_P_TEB)) {
 3286		struct ethhdr *eth;
 3287
 3288		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 3289			return 0;
 3290
 3291		eth = (struct ethhdr *)skb->data;
 3292		type = eth->h_proto;
 3293	}
 3294
 3295	return __vlan_get_protocol(skb, type, depth);
 3296}
 3297
 3298/**
 3299 *	skb_mac_gso_segment - mac layer segmentation handler.
 3300 *	@skb: buffer to segment
 3301 *	@features: features for the output path (see dev->features)
 3302 */
 3303struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 3304				    netdev_features_t features)
 3305{
 3306	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
 3307	struct packet_offload *ptype;
 3308	int vlan_depth = skb->mac_len;
 3309	__be16 type = skb_network_protocol(skb, &vlan_depth);
 3310
 3311	if (unlikely(!type))
 3312		return ERR_PTR(-EINVAL);
 3313
 3314	__skb_pull(skb, vlan_depth);
 3315
 3316	rcu_read_lock();
 3317	list_for_each_entry_rcu(ptype, &offload_base, list) {
 3318		if (ptype->type == type && ptype->callbacks.gso_segment) {
 3319			segs = ptype->callbacks.gso_segment(skb, features);
 3320			break;
 3321		}
 3322	}
 3323	rcu_read_unlock();
 3324
 3325	__skb_push(skb, skb->data - skb_mac_header(skb));
 3326
 3327	return segs;
 3328}
 3329EXPORT_SYMBOL(skb_mac_gso_segment);
 3330
 3331
 3332/* openvswitch calls this on rx path, so we need a different check.
 3333 */
 3334static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 3335{
 3336	if (tx_path)
 3337		return skb->ip_summed != CHECKSUM_PARTIAL &&
 3338		       skb->ip_summed != CHECKSUM_UNNECESSARY;
 3339
 3340	return skb->ip_summed == CHECKSUM_NONE;
 3341}
 3342
 3343/**
 3344 *	__skb_gso_segment - Perform segmentation on skb.
 3345 *	@skb: buffer to segment
 3346 *	@features: features for the output path (see dev->features)
 3347 *	@tx_path: whether it is called in TX path
 3348 *
 3349 *	This function segments the given skb and returns a list of segments.
 3350 *
 3351 *	It may return NULL if the skb requires no segmentation.  This is
 3352 *	only possible when GSO is used for verifying header integrity.
 3353 *
 3354 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
 3355 */
 3356struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 3357				  netdev_features_t features, bool tx_path)
 3358{
 3359	struct sk_buff *segs;
 3360
 3361	if (unlikely(skb_needs_check(skb, tx_path))) {
 3362		int err;
 
 3363
 3364		/* We're going to init ->check field in TCP or UDP header */
 3365		err = skb_cow_head(skb, 0);
 3366		if (err < 0)
 3367			return ERR_PTR(err);
 3368	}
 3369
 3370	/* Only report GSO partial support if it will enable us to
 3371	 * support segmentation on this frame without needing additional
 3372	 * work.
 3373	 */
 3374	if (features & NETIF_F_GSO_PARTIAL) {
 3375		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
 3376		struct net_device *dev = skb->dev;
 3377
 3378		partial_features |= dev->features & dev->gso_partial_features;
 3379		if (!skb_gso_ok(skb, features | partial_features))
 3380			features &= ~NETIF_F_GSO_PARTIAL;
 3381	}
 3382
 3383	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
 3384		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 
 3385
 3386	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
 3387	SKB_GSO_CB(skb)->encap_level = 0;
 3388
 3389	skb_reset_mac_header(skb);
 3390	skb_reset_mac_len(skb);
 
 
 3391
 3392	segs = skb_mac_gso_segment(skb, features);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3393
 3394	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 3395		skb_warn_bad_offload(skb);
 3396
 3397	return segs;
 3398}
 3399EXPORT_SYMBOL(__skb_gso_segment);
 3400
 3401/* Take action when hardware reception checksum errors are detected. */
 3402#ifdef CONFIG_BUG
 3403void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 3404{
 3405	if (net_ratelimit()) {
 3406		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
 3407		skb_dump(KERN_ERR, skb, true);
 3408		dump_stack();
 3409	}
 3410}
 3411EXPORT_SYMBOL(netdev_rx_csum_fault);
 3412#endif
 3413
 3414/* XXX: check that highmem exists at all on the given machine. */
 
 
 
 
 3415static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 3416{
 3417#ifdef CONFIG_HIGHMEM
 3418	int i;
 3419
 3420	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3421		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3422			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
 
 
 
 3423
 3424			if (PageHighMem(skb_frag_page(frag)))
 
 
 
 
 
 
 
 
 3425				return 1;
 3426		}
 3427	}
 3428#endif
 3429	return 0;
 3430}
 3431
 3432/* If MPLS offload request, verify we are testing hardware MPLS features
 3433 * instead of standard features for the netdev.
 3434 */
 3435#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3436static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3437					   netdev_features_t features,
 3438					   __be16 type)
 3439{
 3440	if (eth_p_mpls(type))
 3441		features &= skb->dev->mpls_features;
 
 
 
 
 
 
 
 3442
 3443	return features;
 
 
 3444}
 3445#else
 3446static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3447					   netdev_features_t features,
 3448					   __be16 type)
 
 
 
 
 
 
 3449{
 3450	return features;
 3451}
 3452#endif
 3453
 3454static netdev_features_t harmonize_features(struct sk_buff *skb,
 3455	netdev_features_t features)
 3456{
 3457	__be16 type;
 3458
 3459	type = skb_network_protocol(skb, NULL);
 3460	features = net_mpls_features(skb, features, type);
 
 3461
 3462	if (skb->ip_summed != CHECKSUM_NONE &&
 3463	    !can_checksum_protocol(features, type)) {
 3464		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3465	}
 3466	if (illegal_highdma(skb->dev, skb))
 3467		features &= ~NETIF_F_SG;
 3468
 3469	return features;
 3470}
 
 3471
 3472netdev_features_t passthru_features_check(struct sk_buff *skb,
 3473					  struct net_device *dev,
 3474					  netdev_features_t features)
 3475{
 3476	return features;
 3477}
 3478EXPORT_SYMBOL(passthru_features_check);
 3479
 3480static netdev_features_t dflt_features_check(struct sk_buff *skb,
 3481					     struct net_device *dev,
 3482					     netdev_features_t features)
 3483{
 3484	return vlan_features_check(skb, features);
 
 
 
 
 
 
 3485}
 3486
 3487static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3488					    struct net_device *dev,
 3489					    netdev_features_t features)
 3490{
 3491	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3492
 3493	if (gso_segs > dev->gso_max_segs)
 3494		return features & ~NETIF_F_GSO_MASK;
 3495
 3496	/* Support for GSO partial features requires software
 3497	 * intervention before we can actually process the packets
 3498	 * so we need to strip support for any partial features now
 3499	 * and we can pull them back in after we have partially
 3500	 * segmented the frame.
 3501	 */
 3502	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3503		features &= ~dev->gso_partial_features;
 3504
 3505	/* Make sure to clear the IPv4 ID mangling feature if the
 3506	 * IPv4 header has the potential to be fragmented.
 3507	 */
 3508	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3509		struct iphdr *iph = skb->encapsulation ?
 3510				    inner_ip_hdr(skb) : ip_hdr(skb);
 3511
 3512		if (!(iph->frag_off & htons(IP_DF)))
 3513			features &= ~NETIF_F_TSO_MANGLEID;
 3514	}
 3515
 3516	return features;
 3517}
 3518
 3519netdev_features_t netif_skb_features(struct sk_buff *skb)
 3520{
 3521	struct net_device *dev = skb->dev;
 3522	netdev_features_t features = dev->features;
 
 
 
 3523
 3524	if (skb_is_gso(skb))
 3525		features = gso_features_check(skb, dev, features);
 
 
 
 
 3526
 3527	/* If encapsulation offload request, verify we are testing
 3528	 * hardware encapsulation features instead of standard
 3529	 * features for the netdev
 3530	 */
 3531	if (skb->encapsulation)
 3532		features &= dev->hw_enc_features;
 3533
 3534	if (skb_vlan_tagged(skb))
 3535		features = netdev_intersect_features(features,
 3536						     dev->vlan_features |
 3537						     NETIF_F_HW_VLAN_CTAG_TX |
 3538						     NETIF_F_HW_VLAN_STAG_TX);
 3539
 3540	if (dev->netdev_ops->ndo_features_check)
 3541		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3542								features);
 3543	else
 3544		features &= dflt_features_check(skb, dev, features);
 3545
 3546	return harmonize_features(skb, features);
 
 
 
 
 
 
 3547}
 3548EXPORT_SYMBOL(netif_skb_features);
 3549
 3550static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3551		    struct netdev_queue *txq, bool more)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3552{
 3553	unsigned int len;
 3554	int rc;
 
 3555
 3556	if (dev_nit_active(dev))
 3557		dev_queue_xmit_nit(skb, dev);
 3558
 3559	len = skb->len;
 3560	trace_net_dev_start_xmit(skb, dev);
 3561	rc = netdev_start_xmit(skb, dev, txq, more);
 3562	trace_net_dev_xmit(skb, rc, dev, len);
 
 
 3563
 3564	return rc;
 3565}
 3566
 3567struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3568				    struct netdev_queue *txq, int *ret)
 3569{
 3570	struct sk_buff *skb = first;
 3571	int rc = NETDEV_TX_OK;
 3572
 3573	while (skb) {
 3574		struct sk_buff *next = skb->next;
 
 
 
 3575
 3576		skb_mark_not_on_list(skb);
 3577		rc = xmit_one(skb, dev, txq, next != NULL);
 3578		if (unlikely(!dev_xmit_complete(rc))) {
 3579			skb->next = next;
 3580			goto out;
 3581		}
 3582
 3583		skb = next;
 3584		if (netif_tx_queue_stopped(txq) && skb) {
 3585			rc = NETDEV_TX_BUSY;
 3586			break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3587		}
 
 
 
 
 
 
 
 3588	}
 3589
 3590out:
 3591	*ret = rc;
 3592	return skb;
 3593}
 3594
 3595static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3596					  netdev_features_t features)
 3597{
 3598	if (skb_vlan_tag_present(skb) &&
 3599	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3600		skb = __vlan_hwaccel_push_inside(skb);
 3601	return skb;
 3602}
 3603
 3604int skb_csum_hwoffload_help(struct sk_buff *skb,
 3605			    const netdev_features_t features)
 3606{
 3607	if (unlikely(skb->csum_not_inet))
 3608		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3609			skb_crc32c_csum_help(skb);
 3610
 3611	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3612}
 3613EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3614
 3615static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 
 
 
 
 
 
 
 3616{
 3617	netdev_features_t features;
 
 
 3618
 3619	features = netif_skb_features(skb);
 3620	skb = validate_xmit_vlan(skb, features);
 3621	if (unlikely(!skb))
 3622		goto out_null;
 3623
 3624	skb = sk_validate_xmit_skb(skb, dev);
 3625	if (unlikely(!skb))
 3626		goto out_null;
 3627
 3628	if (netif_needs_gso(skb, features)) {
 3629		struct sk_buff *segs;
 3630
 3631		segs = skb_gso_segment(skb, features);
 3632		if (IS_ERR(segs)) {
 3633			goto out_kfree_skb;
 3634		} else if (segs) {
 3635			consume_skb(skb);
 3636			skb = segs;
 3637		}
 3638	} else {
 3639		if (skb_needs_linearize(skb, features) &&
 3640		    __skb_linearize(skb))
 3641			goto out_kfree_skb;
 3642
 3643		/* If packet is not checksummed and device does not
 3644		 * support checksumming for this protocol, complete
 3645		 * checksumming here.
 3646		 */
 3647		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3648			if (skb->encapsulation)
 3649				skb_set_inner_transport_header(skb,
 3650							       skb_checksum_start_offset(skb));
 3651			else
 3652				skb_set_transport_header(skb,
 3653							 skb_checksum_start_offset(skb));
 3654			if (skb_csum_hwoffload_help(skb, features))
 3655				goto out_kfree_skb;
 3656		}
 3657	}
 3658
 3659	skb = validate_xmit_xfrm(skb, features, again);
 
 
 
 
 3660
 3661	return skb;
 
 
 
 
 3662
 3663out_kfree_skb:
 3664	kfree_skb(skb);
 3665out_null:
 3666	atomic_long_inc(&dev->tx_dropped);
 3667	return NULL;
 3668}
 
 3669
 3670struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 3671{
 3672	struct sk_buff *next, *head = NULL, *tail;
 
 
 
 
 
 
 
 3673
 3674	for (; skb != NULL; skb = next) {
 3675		next = skb->next;
 3676		skb_mark_not_on_list(skb);
 
 
 
 3677
 3678		/* in case skb wont be segmented, point to itself */
 3679		skb->prev = skb;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3680
 3681		skb = validate_xmit_skb(skb, dev, again);
 3682		if (!skb)
 3683			continue;
 3684
 3685		if (!head)
 3686			head = skb;
 3687		else
 3688			tail->next = skb;
 3689		/* If skb was segmented, skb->prev points to
 3690		 * the last segment. If not, it still contains skb.
 3691		 */
 3692		tail = skb->prev;
 3693	}
 3694	return head;
 3695}
 3696EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3697
 3698static void qdisc_pkt_len_init(struct sk_buff *skb)
 
 3699{
 3700	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 
 3701
 3702	qdisc_skb_cb(skb)->pkt_len = skb->len;
 
 
 
 
 
 
 
 3703
 3704	/* To get more precise estimation of bytes sent on wire,
 3705	 * we add to pkt_len the headers size of all segments
 3706	 */
 3707	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3708		unsigned int hdr_len;
 3709		u16 gso_segs = shinfo->gso_segs;
 3710
 3711		/* mac layer + network layer */
 3712		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
 3713
 3714		/* + transport layer */
 3715		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3716			const struct tcphdr *th;
 3717			struct tcphdr _tcphdr;
 3718
 3719			th = skb_header_pointer(skb, skb_transport_offset(skb),
 3720						sizeof(_tcphdr), &_tcphdr);
 3721			if (likely(th))
 3722				hdr_len += __tcp_hdrlen(th);
 3723		} else {
 3724			struct udphdr _udphdr;
 3725
 3726			if (skb_header_pointer(skb, skb_transport_offset(skb),
 3727					       sizeof(_udphdr), &_udphdr))
 3728				hdr_len += sizeof(struct udphdr);
 3729		}
 
 3730
 3731		if (shinfo->gso_type & SKB_GSO_DODGY)
 3732			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
 3733						shinfo->gso_size);
 3734
 3735		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 3736	}
 3737}
 3738
 /*
  * __dev_xmit_skb - hand @skb to qdisc @q, transmitting directly when possible
  *
  * For a TCQ_F_NOLOCK qdisc the skb is enqueued and qdisc_run() is called
  * without taking the root lock.  Otherwise, with qdisc_lock(q) held, one of
  * three things happens:
  *   - the skb is dropped if the qdisc has __QDISC_STATE_DEACTIVATED set;
  *   - the skb is passed straight to sch_direct_xmit() if the qdisc has
  *     TCQ_F_CAN_BYPASS, is empty and qdisc_run_begin() succeeds;
  *   - the skb is enqueued, and the qdisc is run if qdisc_run_begin()
  *     succeeds.
  *
  * Skbs collected on the local to_free list are released with
  * kfree_skb_list() only after the root lock has been dropped.
  *
  * Returns the enqueue result masked with NET_XMIT_MASK, NET_XMIT_DROP for a
  * deactivated qdisc, or NET_XMIT_SUCCESS on the bypass path.
  */
 3739static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 3740				 struct net_device *dev,
 3741				 struct netdev_queue *txq)
 3742{
 3743	spinlock_t *root_lock = qdisc_lock(q);
 3744	struct sk_buff *to_free = NULL;
 3745	bool contended;
 3746	int rc;
 3747

 3748	qdisc_calculate_pkt_len(skb, q);
 3749
 3750	if (q->flags & TCQ_F_NOLOCK) {
 3751		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
 3752		qdisc_run(q);
 3753
 3754		if (unlikely(to_free))
 3755			kfree_skb_list(to_free);
 3756		return rc;
 3757	}
 3758
 3759	/*
 3760	 * Heuristic to force contended enqueues to serialize on a
 3761	 * separate lock before trying to get qdisc main lock.
 3762	 * This permits qdisc->running owner to get the lock more
 3763	 * often and dequeue packets faster.
 3764	 */
 3765	contended = qdisc_is_running(q);
 3766	if (unlikely(contended))
 3767		spin_lock(&q->busylock);
 3768
 3769	spin_lock(root_lock);
 3770	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 3771		__qdisc_drop(skb, &to_free);
 3772		rc = NET_XMIT_DROP;
 3773	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
 3774		   qdisc_run_begin(q)) {
 3775		/*
 3776		 * This is a work-conserving queue; there are no old skbs
 3777		 * waiting to be sent out; and the qdisc is not running -
 3778		 * xmit the skb directly.
 3779		 */


 3780
 3781		qdisc_bstats_update(q, skb);
 3782
 3783		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* Drop busylock before running the qdisc; clearing
			 * 'contended' skips the unlock at the end.
			 */
 3784			if (unlikely(contended)) {
 3785				spin_unlock(&q->busylock);
 3786				contended = false;
 3787			}
 3788			__qdisc_run(q);
 3789		}

 3790
 3791		qdisc_run_end(q);
 3792		rc = NET_XMIT_SUCCESS;
 3793	} else {
 3794		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;

 3795		if (qdisc_run_begin(q)) {
			/* Same busylock hand-off as on the bypass path. */
 3796			if (unlikely(contended)) {
 3797				spin_unlock(&q->busylock);
 3798				contended = false;
 3799			}
 3800			__qdisc_run(q);
 3801			qdisc_run_end(q);
 3802		}
 3803	}
 3804	spin_unlock(root_lock);
	/* Free dropped skbs outside the root lock. */
 3805	if (unlikely(to_free))
 3806		kfree_skb_list(to_free);
	/* Still set only if busylock was taken and not released above. */
 3807	if (unlikely(contended))
 3808		spin_unlock(&q->busylock);
 3809	return rc;
 3810}
 3811
 3812#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 /*
  * skb_update_prio - fill in skb->priority from the device's priomap
  *
  * Does nothing if skb->priority is already non-zero, if skb->dev has no
  * priomap, or if skb_to_full_sk() yields no socket.  Otherwise the index
  * returned by sock_cgroup_prioidx() selects the priority, provided it is
  * below map->priomap_len.  The map is read with rcu_dereference_bh().
  *
  * When CONFIG_CGROUP_NET_PRIO is not enabled this is an empty macro.
  */
 3813static void skb_update_prio(struct sk_buff *skb)
 3814{
 3815	const struct netprio_map *map;
 3816	const struct sock *sk;
 3817	unsigned int prioidx;
 3818
 3819	if (skb->priority)
 3820		return;
 3821	map = rcu_dereference_bh(skb->dev->priomap);
 3822	if (!map)
 3823		return;
 3824	sk = skb_to_full_sk(skb);
 3825	if (!sk)
 3826		return;
 3827
 3828	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);

 3829
	/* Indices outside the map leave skb->priority at 0. */
 3830	if (prioidx < map->priomap_len)
 3831		skb->priority = map->priomap[prioidx];

 3832}
 3833#else
 3834#define skb_update_prio(skb)
 3835#endif
 3836
 3837/**
 3838 *	dev_loopback_xmit - loop back @skb
 3839 *	@net: network namespace this loopback is happening in
 3840 *	@sk:  sk needed to be a netfilter okfn
 3841 *	@skb: buffer to transmit
 *
 *	Resets the MAC header, pulls skb_network_offset(@skb) bytes from the
 *	head, marks the buffer PACKET_LOOPBACK with CHECKSUM_UNNECESSARY and
 *	hands it to netif_rx_ni().  Warns if @skb has no dst.  @net and @sk
 *	are not used in the body.
 *
 *	Return: always 0.
 3842 */
 3843int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 3844{
 3845	skb_reset_mac_header(skb);
 3846	__skb_pull(skb, skb_network_offset(skb));
 3847	skb->pkt_type = PACKET_LOOPBACK;
 3848	skb->ip_summed = CHECKSUM_UNNECESSARY;
 3849	WARN_ON(!skb_dst(skb));
 3850	skb_dst_force(skb);
 3851	netif_rx_ni(skb);
 3852	return 0;
 3853}
 3854EXPORT_SYMBOL(dev_loopback_xmit);
 3855
 3856#ifdef CONFIG_NET_EGRESS
 /*
  * sch_handle_egress - run @dev's egress classifier on @skb
  *
  * Returns @skb when @dev has no miniq_egress or when the classifier verdict
  * lets the packet continue (TC_ACT_OK, TC_ACT_RECLASSIFY or any verdict not
  * listed in the switch).  Returns NULL when the skb has been freed,
  * consumed or redirected here; in those cases *@ret is set to the status
  * the caller should report (NET_XMIT_DROP for TC_ACT_SHOT, NET_XMIT_SUCCESS
  * otherwise).  *@ret is left untouched when @skb is returned.
  */
 3857static struct sk_buff *
 3858sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 3859{
 3860	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
 3861	struct tcf_result cl_res;
 3862
 3863	if (!miniq)
 3864		return skb;
 3865
 3866	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 3867	mini_qdisc_bstats_cpu_update(miniq, skb);
 3868
 3869	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
 3870	case TC_ACT_OK:
 3871	case TC_ACT_RECLASSIFY:
 3872		skb->tc_index = TC_H_MIN(cl_res.classid);
 3873		break;
 3874	case TC_ACT_SHOT:
 3875		mini_qdisc_qstats_cpu_drop(miniq);
 3876		*ret = NET_XMIT_DROP;
 3877		kfree_skb(skb);
 3878		return NULL;
 3879	case TC_ACT_STOLEN:
 3880	case TC_ACT_QUEUED:
 3881	case TC_ACT_TRAP:
 3882		*ret = NET_XMIT_SUCCESS;
 3883		consume_skb(skb);
 3884		return NULL;
 3885	case TC_ACT_REDIRECT:
 3886		/* No need to push/pop skb's mac_header here on egress! */
 3887		skb_do_redirect(skb);
 3888		*ret = NET_XMIT_SUCCESS;
 3889		return NULL;
 3890	default:
 3891		break;
 3892	}
 3893
 3894	return skb;
 3895}
 3896#endif /* CONFIG_NET_EGRESS */
 3897
 3898#ifdef CONFIG_XPS
 /*
  * __get_xps_queue_idx - look up a TX queue for @skb in @dev_maps
  *
  * @tci indexes dev_maps->attr_map.  When @dev has traffic classes
  * (dev->num_tc is non-zero) it is first multiplied by num_tc and offset by
  * the value netdev_get_prio_tc_map() returns for skb->priority.  A map with
  * a single queue yields that queue; otherwise skb_get_hash() is scaled onto
  * the map with reciprocal_scale().
  *
  * Returns the queue index, or -1 if there is no map for the index or the
  * chosen queue is not below dev->real_num_tx_queues.  The map pointer is
  * read with rcu_dereference().
  */
 3899static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 3900			       struct xps_dev_maps *dev_maps, unsigned int tci)
 3901{
 3902	struct xps_map *map;
 3903	int queue_index = -1;
 3904
 3905	if (dev->num_tc) {
 3906		tci *= dev->num_tc;
 3907		tci += netdev_get_prio_tc_map(dev, skb->priority);
 3908	}
 3909
 3910	map = rcu_dereference(dev_maps->attr_map[tci]);
 3911	if (map) {
 3912		if (map->len == 1)
 3913			queue_index = map->queues[0];
 3914		else
 3915			queue_index = map->queues[reciprocal_scale(
 3916						skb_get_hash(skb), map->len)];
		/* Reject a mapped queue the device does not currently have. */
 3917		if (unlikely(queue_index >= dev->real_num_tx_queues))
 3918			queue_index = -1;
 3919	}
 3920	return queue_index;
 3921}
 3922#endif
 3923
 /*
  * get_xps_queue - pick a TX queue for @skb from the XPS maps of @sb_dev
  *
  * Returns -1 when CONFIG_XPS is not set or the xps_needed static key is off.
  * Otherwise, if xps_rxqs_needed is on and @sb_dev has an xps_rxqs_map, the
  * rx queue recorded on the socket (sk_rx_queue_get()) is tried first,
  * provided it is in [0, dev->num_rx_queues).  If that gives no queue, the
  * xps_cpus_map is consulted with skb->sender_cpu - 1 as index.
  *
  * Map lookups are done under rcu_read_lock().  Returns the queue index or
  * -1 if none was found.
  */
 3924static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 3925			 struct sk_buff *skb)
 3926{
 3927#ifdef CONFIG_XPS
 3928	struct xps_dev_maps *dev_maps;
 3929	struct sock *sk = skb->sk;
 3930	int queue_index = -1;
 3931
 3932	if (!static_key_false(&xps_needed))
 3933		return -1;
 3934
 3935	rcu_read_lock();
 3936	if (!static_key_false(&xps_rxqs_needed))
 3937		goto get_cpus_map;
 3938
 3939	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 3940	if (dev_maps) {
 3941		int tci = sk_rx_queue_get(sk);
 3942
 3943		if (tci >= 0 && tci < dev->num_rx_queues)
 3944			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3945							  tci);
 3946	}
 3947
 3948get_cpus_map:
	/* Fall back to the per-CPU map if the rx-queue map gave nothing. */
 3949	if (queue_index < 0) {
 3950		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 3951		if (dev_maps) {
 3952			unsigned int tci = skb->sender_cpu - 1;
 3953
 3954			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3955							  tci);
 3956		}
 3957	}
 3958	rcu_read_unlock();
 3959
 3960	return queue_index;
 3961#else
 3962	return -1;
 3963#endif
 3964}
 3965
 /* Queue selector that always returns TX queue 0; all arguments are ignored. */
 3966u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 3967		     struct net_device *sb_dev)
 3968{
 3969	return 0;
 3970}
 3971EXPORT_SYMBOL(dev_pick_tx_zero);
 3972
 /*
  * Queue selector: the current CPU id (raw_smp_processor_id(), cast to u16)
  * modulo dev->real_num_tx_queues.  @skb and @sb_dev are not used.
  */
 3973u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 3974		       struct net_device *sb_dev)
 3975{
 3976	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 3977}
 3978EXPORT_SYMBOL(dev_pick_tx_cpu_id);
 3979
 /*
  * netdev_pick_tx - choose a TX queue index for @skb on @dev
  *
  * Starts from the queue cached on the socket (sk_tx_queue_get()).  A new
  * queue is computed when that value is negative, is not below
  * dev->real_num_tx_queues, or skb->ooo_okay is set: get_xps_queue() is tried
  * first, with skb_tx_hash() as fallback.  A changed index is stored back
  * with sk_tx_queue_set() only for a full socket that has an sk_dst_cache.
  *
  * @sb_dev defaults to @dev when NULL.  Returns the selected queue index.
  */
 3980u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 3981		     struct net_device *sb_dev)
 3982{
 3983	struct sock *sk = skb->sk;
 3984	int queue_index = sk_tx_queue_get(sk);
 3985
 3986	sb_dev = sb_dev ? : dev;
 3987
 3988	if (queue_index < 0 || skb->ooo_okay ||
 3989	    queue_index >= dev->real_num_tx_queues) {
 3990		int new_index = get_xps_queue(dev, sb_dev, skb);
 3991
 3992		if (new_index < 0)
 3993			new_index = skb_tx_hash(dev, sb_dev, skb);
 3994
 3995		if (queue_index != new_index && sk &&
 3996		    sk_fullsock(sk) &&
 3997		    rcu_access_pointer(sk->sk_dst_cache))
 3998			sk_tx_queue_set(sk, new_index);
 3999
 4000		queue_index = new_index;
 4001	}
 4002
 4003	return queue_index;
 4004}
 4005EXPORT_SYMBOL(netdev_pick_tx);
 4006
 /*
  * netdev_core_pick_tx - select the TX queue for @skb and record it
  *
  * With CONFIG_XPS, skb->sender_cpu is set to the current CPU + 1 whenever
  * sender_cpu - 1, taken as u32, is not below NR_CPUS; a sender_cpu of 0
  * wraps around and therefore also takes that branch.
  *
  * If the device has exactly one real TX queue, index 0 is used.  Otherwise
  * the driver's ndo_select_queue() is called if it exists, else
  * netdev_pick_tx(), and the result is passed through netdev_cap_txqueue().
  *
  * The index is stored with skb_set_queue_mapping() and the matching
  * netdev_queue is returned.
  */
 4007struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 4008					 struct sk_buff *skb,
 4009					 struct net_device *sb_dev)
 4010{
 4011	int queue_index = 0;
 4012
 4013#ifdef CONFIG_XPS
 4014	u32 sender_cpu = skb->sender_cpu - 1;
 4015
 4016	if (sender_cpu >= (u32)NR_CPUS)
 4017		skb->sender_cpu = raw_smp_processor_id() + 1;
 4018#endif
 4019
 4020	if (dev->real_num_tx_queues != 1) {
 4021		const struct net_device_ops *ops = dev->netdev_ops;
 4022
 4023		if (ops->ndo_select_queue)
 4024			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 4025		else
 4026			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 4027
 4028		queue_index = netdev_cap_txqueue(dev, queue_index);
 4029	}
 4030
 4031	skb_set_queue_mapping(skb, queue_index);
 4032	return netdev_get_tx_queue(dev, queue_index);
 4033}
 4034
 4035/**
 4036 *	__dev_queue_xmit - transmit a buffer
 4037 *	@skb: buffer to transmit
 4038 *	@sb_dev: subordinate device used for L2 forwarding offload
 4039 *
 4040 *	Queue a buffer for transmission to a network device. The caller must
 4041 *	have set the device and priority and built the buffer before calling
 4042 *	this function. The function can be called from an interrupt.
 4043 *
 4044 *	A negative errno code is returned on a failure. A success does not
 4045 *	guarantee the frame will be transmitted as it may be dropped due
 4046 *	to congestion or traffic shaping.
 4047 *
 4048 * -----------------------------------------------------------------------------------
 4049 *      I notice this method can also return errors from the queue disciplines,
 4050 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 4051 *      be positive.
 4052 *
 4053 *      Regardless of the return value, the skb is consumed, so it is currently
 4054 *      difficult to retry a send to this method.  (You can bump the ref count
 4055 *      before sending to hold a reference for retry if you are careful.)
 4056 *
 4057 *      When calling this method, interrupts MUST be enabled.  This is because
 4058 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 4059 *          --BLG
 *
 *	Return values visible in the body: the result of __dev_xmit_skb() when
 *	the selected queue's qdisc has an enqueue op; the status set by
 *	sch_handle_egress() when it takes the skb; the code stored by
 *	dev_hard_start_xmit() when the queue-less transmit completes; -ENOMEM
 *	when validate_xmit_skb() returns no skb; and -ENETDOWN when the skb is
 *	dropped at the end of the function.
 4060 */
 4061static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 4062{
 4063	struct net_device *dev = skb->dev;
 4064	struct netdev_queue *txq;
 4065	struct Qdisc *q;
 4066	int rc = -ENOMEM;
 4067	bool again = false;
 4068
 4069	skb_reset_mac_header(skb);
 4070
 4071	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
 4072		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
 4073
 4074	/* Disable soft irqs for various locks below. Also
 4075	 * stops preemption for RCU.
 4076	 */
 4077	rcu_read_lock_bh();
 4078
 4079	skb_update_prio(skb);
 4080
 4081	qdisc_pkt_len_init(skb);


 4082#ifdef CONFIG_NET_CLS_ACT
 4083	skb->tc_at_ingress = 0;
 4084# ifdef CONFIG_NET_EGRESS
 4085	if (static_branch_unlikely(&egress_needed_key)) {
 4086		skb = sch_handle_egress(skb, &rc, dev);
		/* NULL: the classifier took the skb and set rc. */
 4087		if (!skb)
 4088			goto out;
 4089	}
 4090# endif
 4091#endif
 4092	/* If device/qdisc don't need skb->dst, release it right now while
 4093	 * it's hot in this cpu cache.
 4094	 */
 4095	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
 4096		skb_dst_drop(skb);
 4097	else
 4098		skb_dst_force(skb);
 4099
 4100	txq = netdev_core_pick_tx(dev, skb, sb_dev);
 4101	q = rcu_dereference_bh(txq->qdisc);
 4102
 4103	trace_net_dev_queue(skb);
 4104	if (q->enqueue) {
 4105		rc = __dev_xmit_skb(skb, q, dev, txq);
 4106		goto out;
 4107	}
 4108
 4109	/* The device has no queue. Common case for software devices:
 4110	 * loopback, all the sorts of tunnels...
 4111
 4112	 * Really, it is unlikely that netif_tx_lock protection is necessary
 4113	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
 4114	 * counters.)
 4115	 * However, it is possible, that they rely on protection
 4116	 * made by us here.
 4117
 4118	 * Check this and shot the lock. It is not prone from deadlocks.
 4119	 *Either shot noqueue qdisc, it is even simpler 8)
 4120	 */
 4121	if (dev->flags & IFF_UP) {
 4122		int cpu = smp_processor_id(); /* ok because BHs are off */
 4123
		/* Owner == this CPU means we re-entered while already holding
		 * this queue's xmit lock; that is reported as a dead loop.
		 */
 4124		if (txq->xmit_lock_owner != cpu) {
 4125			if (dev_xmit_recursion())

 4126				goto recursion_alert;
 4127
 4128			skb = validate_xmit_skb(skb, dev, &again);
			/* rc still holds its initial -ENOMEM here. */
 4129			if (!skb)
 4130				goto out;
 4131
 4132			HARD_TX_LOCK(dev, txq, cpu);
 4133
 4134			if (!netif_xmit_stopped(txq)) {
 4135				dev_xmit_recursion_inc();
 4136				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
 4137				dev_xmit_recursion_dec();
 4138				if (dev_xmit_complete(rc)) {
 4139					HARD_TX_UNLOCK(dev, txq);
 4140					goto out;
 4141				}
 4142			}
 4143			HARD_TX_UNLOCK(dev, txq);
 4144			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
 4145					     dev->name);
 4146		} else {
 4147			/* Recursion is detected! It is possible,
 4148			 * unfortunately
 4149			 */
 4150recursion_alert:
 4151			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
 4152					     dev->name);
 4153		}
 4154	}
 4155
	/* Drop path: device not up, queue stopped, transmit not completed,
	 * or recursion detected.  Count the drop and free the skb (list).
	 */
 4156	rc = -ENETDOWN;
 4157	rcu_read_unlock_bh();
 4158
 4159	atomic_long_inc(&dev->tx_dropped);
 4160	kfree_skb_list(skb);
 4161	return rc;
 4162out:
 4163	rcu_read_unlock_bh();
 4164	return rc;
 4165}
 4166
 /* Transmit @skb via __dev_queue_xmit() with no subordinate device (NULL). */
 4167int dev_queue_xmit(struct sk_buff *skb)
 4168{
 4169	return __dev_queue_xmit(skb, NULL);
 4170}
 4171EXPORT_SYMBOL(dev_queue_xmit);
 4172
 /* Transmit @skb via __dev_queue_xmit(), passing @sb_dev through unchanged. */
 4173int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 4174{
 4175	return __dev_queue_xmit(skb, sb_dev);
 4176}
 4177EXPORT_SYMBOL(dev_queue_xmit_accel);
 4178
 /*
  * dev_direct_xmit - transmit @skb on TX queue @queue_id without a qdisc
  *
  * The skb is dropped (dev->tx_dropped incremented, NET_XMIT_DROP returned)
  * if the device is not running, has no carrier, or
  * validate_xmit_skb_list() hands back a different skb pointer than it was
  * given.  Otherwise netdev_start_xmit() is called under HARD_TX_LOCK with
  * bottom halves disabled, unless the queue is frozen or stopped by the
  * driver, in which case the initial NETDEV_TX_BUSY is returned.
  *
  * The skb is freed here whenever dev_xmit_complete() says the transmit did
  * not complete.
  */
 4179int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 4180{
 4181	struct net_device *dev = skb->dev;
 4182	struct sk_buff *orig_skb = skb;
 4183	struct netdev_queue *txq;
 4184	int ret = NETDEV_TX_BUSY;
 4185	bool again = false;
 4186
 4187	if (unlikely(!netif_running(dev) ||
 4188		     !netif_carrier_ok(dev)))
 4189		goto drop;
 4190
 4191	skb = validate_xmit_skb_list(skb, dev, &again);
 4192	if (skb != orig_skb)
 4193		goto drop;
 4194
 4195	skb_set_queue_mapping(skb, queue_id);
 4196	txq = skb_get_tx_queue(dev, skb);
 4197
 4198	local_bh_disable();
 4199
 4200	dev_xmit_recursion_inc();
 4201	HARD_TX_LOCK(dev, txq, smp_processor_id());
 4202	if (!netif_xmit_frozen_or_drv_stopped(txq))
 4203		ret = netdev_start_xmit(skb, dev, txq, false);
 4204	HARD_TX_UNLOCK(dev, txq);
 4205	dev_xmit_recursion_dec();
 4206
 4207	local_bh_enable();
 4208
 4209	if (!dev_xmit_complete(ret))
 4210		kfree_skb(skb);
 4211
 4212	return ret;
 4213drop:
 4214	atomic_long_inc(&dev->tx_dropped);
 4215	kfree_skb_list(skb);
 4216	return NET_XMIT_DROP;
 4217}
 4218EXPORT_SYMBOL(dev_direct_xmit);
 4219
 4220/*************************************************************************
 4221 *			Receiver routines
 4222 *************************************************************************/
 4223
 4224int netdev_max_backlog __read_mostly = 1000;
 4225EXPORT_SYMBOL(netdev_max_backlog);
 4226
 /* Passed to net_timestamp_check() in netif_rx_internal(). */
 4227int netdev_tstamp_prequeue __read_mostly = 1;
 4228int netdev_budget __read_mostly = 300;
 4229/* Must be at least 2 jiffies to guarantee 1 jiffy timeout */
 4230unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
 4231int weight_p __read_mostly = 64;           /* old backlog weight */
 4232int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 4233int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 4234int dev_rx_weight __read_mostly = 64;
 4235int dev_tx_weight __read_mostly = 64;
 4236/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
 4237int gro_normal_batch __read_mostly = 8;
 4238
 4239/*
 * Append @napi to the tail of @sd's poll_list and raise NET_RX_SOFTIRQ.
 * Called with irq disabled.
 */
 4240static inline void ____napi_schedule(struct softnet_data *sd,
 4241				     struct napi_struct *napi)
 4242{
 4243	list_add_tail(&napi->poll_list, &sd->poll_list);
 4244	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4245}
 4246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 4247#ifdef CONFIG_RPS
 4248
 4249/* One global table that all flow-based protocols share. */
 4250struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 4251EXPORT_SYMBOL(rps_sock_flow_table);
 /*
  * Mask for the bits of a sock flow table entry that hold the CPU number;
  * get_rps_cpu() compares the remaining bits against the flow hash.
  */
 4252u32 rps_cpu_mask __read_mostly;
 4253EXPORT_SYMBOL(rps_cpu_mask);
 4254
 /* Static key tested in netif_rx_internal() before choosing an RPS CPU. */
 4255struct static_key_false rps_needed __read_mostly;
 4256EXPORT_SYMBOL(rps_needed);
 4257struct static_key_false rfs_needed __read_mostly;
 4258EXPORT_SYMBOL(rfs_needed);
 4259
 /*
  * set_rps_cpu - point a flow table entry at @next_cpu
  *
  * When @next_cpu is below nr_cpu_ids, the entry's last_qtail is set to that
  * CPU's current input_queue_head.  With CONFIG_RFS_ACCEL, if the skb has a
  * recorded rx queue, the device has an rx_cpu_rmap and NETIF_F_NTUPLE, and
  * the queue mapped to @next_cpu differs from the skb's queue and has a flow
  * table, the driver's ndo_rx_flow_steer() is called.  On success the entry
  * in that queue's flow table replaces @rflow and stores the returned filter
  * id; the old entry's filter is reset to RPS_NO_FILTER if it held the same
  * id.
  *
  * Returns the entry that now tracks the flow (possibly not the one passed
  * in), with ->cpu set to @next_cpu.
  */
 4260static struct rps_dev_flow *
 4261set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4262	    struct rps_dev_flow *rflow, u16 next_cpu)
 4263{
 4264	if (next_cpu < nr_cpu_ids) {
 4265#ifdef CONFIG_RFS_ACCEL
 4266		struct netdev_rx_queue *rxqueue;
 4267		struct rps_dev_flow_table *flow_table;
 4268		struct rps_dev_flow *old_rflow;
 4269		u32 flow_id;
 4270		u16 rxq_index;
 4271		int rc;
 4272
 4273		/* Should we steer this flow to a different hardware queue? */
 4274		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
 4275		    !(dev->features & NETIF_F_NTUPLE))
 4276			goto out;
 4277		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
 4278		if (rxq_index == skb_get_rx_queue(skb))
 4279			goto out;
 4280
 4281		rxqueue = dev->_rx + rxq_index;
 4282		flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4283		if (!flow_table)
 4284			goto out;
 4285		flow_id = skb_get_hash(skb) & flow_table->mask;
 4286		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
 4287							rxq_index, flow_id);
		/* Negative: the driver did not install a filter. */
 4288		if (rc < 0)
 4289			goto out;
 4290		old_rflow = rflow;
 4291		rflow = &flow_table->flows[flow_id];
 4292		rflow->filter = rc;
 4293		if (old_rflow->filter == rflow->filter)
 4294			old_rflow->filter = RPS_NO_FILTER;
 4295	out:
 4296#endif
 4297		rflow->last_qtail =
 4298			per_cpu(softnet_data, next_cpu).input_queue_head;
 4299	}
 4300
 4301	rflow->cpu = next_cpu;
 4302	return rflow;
 4303}
 4304
 4305/*
 4306 * get_rps_cpu is called from netif_receive_skb and returns the target
 4307 * CPU from the RPS map of the receiving queue for a given skb.
 4308 * rcu_read_lock must be held on entry.
 *
 * Returns -1 when no CPU is selected: the recorded rx queue index is out
 * of range, the queue has neither a flow table nor an RPS map, the skb
 * hash is 0, or the candidate CPU is not online.  *@rflowp is written
 * only when the CPU comes from the flow table path.
 4309 */
 4310static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4311		       struct rps_dev_flow **rflowp)
 4312{
 4313	const struct rps_sock_flow_table *sock_flow_table;
 4314	struct netdev_rx_queue *rxqueue = dev->_rx;
 4315	struct rps_dev_flow_table *flow_table;
 4316	struct rps_map *map;
 4317	int cpu = -1;
 4318	u32 tcpu;
 4319	u32 hash;
 4320
 4321	if (skb_rx_queue_recorded(skb)) {
 4322		u16 index = skb_get_rx_queue(skb);
 4323
 4324		if (unlikely(index >= dev->real_num_rx_queues)) {
 4325			WARN_ONCE(dev->real_num_rx_queues > 1,
 4326				  "%s received packet on queue %u, but number "
 4327				  "of RX queues is %u\n",
 4328				  dev->name, index, dev->real_num_rx_queues);
 4329			goto done;
 4330		}
 4331		rxqueue += index;
 4332	}
 4333
 4334	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
 4335
 4336	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4337	map = rcu_dereference(rxqueue->rps_map);
 4338	if (!flow_table && !map)








 4339		goto done;

 4340
 4341	skb_reset_network_header(skb);
 4342	hash = skb_get_hash(skb);
 4343	if (!hash)
 4344		goto done;
 4345

 4346	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 4347	if (flow_table && sock_flow_table) {

 4348		struct rps_dev_flow *rflow;
 4349		u32 next_cpu;
 4350		u32 ident;
 4351
 4352		/* First check into global flow table if there is a match */
 4353		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* Bits outside rps_cpu_mask must equal those of the hash. */
 4354		if ((ident ^ hash) & ~rps_cpu_mask)
 4355			goto try_rps;
 4356
 4357		next_cpu = ident & rps_cpu_mask;
 4358
 4359		/* OK, now we know there is a match,
 4360		 * we can look at the local (per receive queue) flow table
 4361		 */
 4362		rflow = &flow_table->flows[hash & flow_table->mask];
 4363		tcpu = rflow->cpu;
 4364
 4365		/*
 4366		 * If the desired CPU (where last recvmsg was done) is
 4367		 * different from current CPU (one in the rx-queue flow
 4368		 * table entry), switch if one of the following holds:
 4369		 *   - Current CPU is unset (>= nr_cpu_ids).
 4370		 *   - Current CPU is offline.
 4371		 *   - The current CPU's queue tail has advanced beyond the
 4372		 *     last packet that was enqueued using this table entry.
 4373		 *     This guarantees that all previous packets for the flow
 4374		 *     have been dequeued, thus preserving in order delivery.
 4375		 */
 4376		if (unlikely(tcpu != next_cpu) &&
 4377		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 4378		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
 4379		      rflow->last_qtail)) >= 0)) {
 4380			tcpu = next_cpu;
 4381			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 4382		}
 4383
 4384		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
 4385			*rflowp = rflow;
 4386			cpu = tcpu;
 4387			goto done;
 4388		}
 4389	}
 4390
 4391try_rps:

 4392
	/* No usable flow table entry: spread by hash over the RPS map. */
 4393	if (map) {
 4394		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 4395		if (cpu_online(tcpu)) {
 4396			cpu = tcpu;
 4397			goto done;
 4398		}
 4399	}
 4400
 4401done:
 4402	return cpu;
 4403}
 4404
 4405#ifdef CONFIG_RFS_ACCEL
 4406
 4407/**
 4408 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4409 * @dev: Device on which the filter was set
 4410 * @rxq_index: RX queue index
 4411 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4412 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4413 *
 4414 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4415 * this function for each installed filter and remove the filters for
 4416 * which it returns %true.
 *
 * Return: %false while the flow entry still carries @filter_id, its CPU is
 * below nr_cpu_ids, and that CPU's input_queue_head is less than
 * 10 * flow_table->mask ahead of the entry's last_qtail; %true otherwise,
 * including when the queue has no flow table or @flow_id exceeds the
 * table mask.
 4417 */
 4418bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4419			 u32 flow_id, u16 filter_id)
 4420{
 4421	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4422	struct rps_dev_flow_table *flow_table;
 4423	struct rps_dev_flow *rflow;
 4424	bool expire = true;
 4425	unsigned int cpu;
 4426
 4427	rcu_read_lock();
 4428	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4429	if (flow_table && flow_id <= flow_table->mask) {
 4430		rflow = &flow_table->flows[flow_id];
 4431		cpu = READ_ONCE(rflow->cpu);
 4432		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 4433		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 4434			   rflow->last_qtail) <
 4435		     (int)(10 * flow_table->mask)))
 4436			expire = false;
 4437	}
 4438	rcu_read_unlock();
 4439	return expire;
 4440}
 4441EXPORT_SYMBOL(rps_may_expire_flow);
 4442
 4443#endif /* CONFIG_RFS_ACCEL */
 4444
 4445/*
 * Called from hardirq (IPI) context.
 * @data is a struct softnet_data; its backlog NAPI is scheduled on it and
 * its received_rps counter is incremented.
 */
 4446static void rps_trigger_softirq(void *data)
 4447{
 4448	struct softnet_data *sd = data;
 4449
 4450	____napi_schedule(sd, &sd->backlog);
 4451	sd->received_rps++;
 4452}
 4453
 4454#endif /* CONFIG_RPS */
 4455
 4456/*
 4457 * Check if this softnet_data structure is another cpu one
 4458 * If yes, queue it to our IPI list and return 1
 4459 * If no, return 0
 *
 * Queuing pushes @sd onto the head of this CPU's rps_ipi_list and raises
 * NET_RX_SOFTIRQ locally.  Without CONFIG_RPS this always returns 0.
 4460 */
 4461static int rps_ipi_queued(struct softnet_data *sd)
 4462{
 4463#ifdef CONFIG_RPS
 4464	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 4465
 4466	if (sd != mysd) {
 4467		sd->rps_ipi_next = mysd->rps_ipi_list;
 4468		mysd->rps_ipi_list = sd;
 4469
 4470		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4471		return 1;
 4472	}
 4473#endif /* CONFIG_RPS */
 4474	return 0;
 4475}
 4476
 4477#ifdef CONFIG_NET_FLOW_LIMIT
 /* Defaults to 4096 (1 << 12); only built with CONFIG_NET_FLOW_LIMIT. */
 4478int netdev_flow_limit_table_len __read_mostly = (1 << 12);
 4479#endif
 4480
 /*
  * skb_flow_limit - decide whether @skb should be dropped by flow limiting
  *
  * Only does anything with CONFIG_NET_FLOW_LIMIT, when @qlen is at least half
  * of netdev_max_backlog and this CPU's softnet_data has a flow_limit table.
  * The skb's hash bucket is written into a circular history; the bucket that
  * it overwrites is decremented (if non-zero) and the new bucket is
  * incremented.
  *
  * Returns true, and bumps fl->count, when the new bucket's count exceeds
  * half of FLOW_LIMIT_HISTORY; returns false otherwise.
  */
 4481static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4482{
 4483#ifdef CONFIG_NET_FLOW_LIMIT
 4484	struct sd_flow_limit *fl;
 4485	struct softnet_data *sd;
 4486	unsigned int old_flow, new_flow;
 4487
 4488	if (qlen < (netdev_max_backlog >> 1))
 4489		return false;
 4490
 4491	sd = this_cpu_ptr(&softnet_data);
 4492
 4493	rcu_read_lock();
 4494	fl = rcu_dereference(sd->flow_limit);
 4495	if (fl) {
		/* Masking assumes num_buckets is a power of two - TODO confirm. */
 4496		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4497		old_flow = fl->history[fl->history_head];
 4498		fl->history[fl->history_head] = new_flow;
 4499
		/* Advance and wrap; assumes FLOW_LIMIT_HISTORY is a power of
		 * two - TODO confirm.
		 */
 4500		fl->history_head++;
 4501		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4502
 4503		if (likely(fl->buckets[old_flow]))
 4504			fl->buckets[old_flow]--;
 4505
 4506		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4507			fl->count++;
 4508			rcu_read_unlock();
 4509			return true;
 4510		}
 4511	}
 4512	rcu_read_unlock();
 4513#endif
 4514	return false;
 4515}
 4516
 4517/*
 4518 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 4519 * queue (may be a remote CPU queue).
 *
 * The skb is dropped (sd->dropped and the device's rx_dropped incremented,
 * skb freed, NET_RX_DROP returned) if its device is not running, if the
 * input queue already holds more than netdev_max_backlog skbs, or if
 * skb_flow_limit() returns true.  Otherwise it is appended to
 * input_pkt_queue, the new tail is stored through @qtail and
 * NET_RX_SUCCESS is returned.
 *
 * Runs with local irqs saved/disabled and under rps_lock(sd).
 4520 */
 4521static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 4522			      unsigned int *qtail)
 4523{
 4524	struct softnet_data *sd;
 4525	unsigned long flags;
 4526	unsigned int qlen;
 4527
 4528	sd = &per_cpu(softnet_data, cpu);
 4529
 4530	local_irq_save(flags);
 4531
 4532	rps_lock(sd);
 4533	if (!netif_running(skb->dev))
 4534		goto drop;
 4535	qlen = skb_queue_len(&sd->input_pkt_queue);
 4536	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		/* Non-empty queue: just append.  The empty-queue case below
		 * schedules the backlog NAPI first and then jumps back here.
		 */
 4537		if (qlen) {
 4538enqueue:
 4539			__skb_queue_tail(&sd->input_pkt_queue, skb);
 4540			input_queue_tail_incr_save(sd, qtail);
 4541			rps_unlock(sd);
 4542			local_irq_restore(flags);
 4543			return NET_RX_SUCCESS;
 4544		}
 4545
 4546		/* Schedule NAPI for backlog device
 4547		 * We can use non atomic operation since we own the queue lock
 4548		 */
 4549		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			/* A remote sd is queued for an IPI instead of being
			 * scheduled directly.
			 */
 4550			if (!rps_ipi_queued(sd))
 4551				____napi_schedule(sd, &sd->backlog);
 4552		}
 4553		goto enqueue;
 4554	}
 4555
 4556drop:
 4557	sd->dropped++;
 4558	rps_unlock(sd);
 4559
 4560	local_irq_restore(flags);
 4561
 4562	atomic_long_inc(&skb->dev->rx_dropped);
 4563	kfree_skb(skb);
 4564	return NET_RX_DROP;
 4565}
 4566
 /*
  * netif_get_rxqueue - find the RX queue structure for @skb
  *
  * Returns the queue of skb->dev at the rx queue index recorded in the skb.
  * Falls back to the device's first RX queue when no index was recorded or
  * the recorded index is not below real_num_rx_queues; the latter case warns
  * once if the device has more than one RX queue.
  */
 4567static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4568{
 4569	struct net_device *dev = skb->dev;
 4570	struct netdev_rx_queue *rxqueue;
 4571
 4572	rxqueue = dev->_rx;
 4573
 4574	if (skb_rx_queue_recorded(skb)) {
 4575		u16 index = skb_get_rx_queue(skb);
 4576
 4577		if (unlikely(index >= dev->real_num_rx_queues)) {
 4578			WARN_ONCE(dev->real_num_rx_queues > 1,
 4579				  "%s received packet on queue %u, but number "
 4580				  "of RX queues is %u\n",
 4581				  dev->name, index, dev->real_num_rx_queues);
 4582
 4583			return rxqueue; /* Return first rxqueue */
 4584		}
 4585		rxqueue += index;
 4586	}
 4587	return rxqueue;
 4588}
 4589
 /*
  * netif_receive_generic_xdp - run @xdp_prog on @skb in generic (skb) mode
  *
  * If the skb is cloned, non-linear or has less than XDP_PACKET_HEADROOM of
  * headroom, it is first expanded with pskb_expand_head() and linearized.
  * @xdp is then filled in to describe the packet starting at the MAC header,
  * the program is run, and any head/tail adjustment or change of the
  * Ethernet type / destination class is applied back to the skb.
  *
  * Returns the program's verdict.  Returns XDP_PASS without running the
  * program for skbs that skb_is_redirected() reports, and XDP_DROP when
  * expanding or linearizing fails.  The skb is freed here for XDP_DROP,
  * XDP_ABORTED and unknown verdicts.  For XDP_TX and XDP_REDIRECT the MAC
  * header is pushed back and the skb is left for the caller; for XDP_PASS
  * any metadata length is recorded with skb_metadata_set().
  */
 4590static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 4591				     struct xdp_buff *xdp,
 4592				     struct bpf_prog *xdp_prog)
 4593{
 4594	struct netdev_rx_queue *rxqueue;
 4595	void *orig_data, *orig_data_end;
 4596	u32 metalen, act = XDP_DROP;
 4597	__be16 orig_eth_type;
 4598	struct ethhdr *eth;
 4599	bool orig_bcast;
 4600	int hlen, off;
 4601	u32 mac_len;
 4602
 4603	/* Reinjected packets coming from act_mirred or similar should
 4604	 * not get XDP generic processing.
 4605	 */
 4606	if (skb_is_redirected(skb))
 4607		return XDP_PASS;
 4608
 4609	/* XDP packets must be linear and must have sufficient headroom
 4610	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
 4611	 * native XDP provides, thus we need to do it here as well.
 4612	 */
 4613	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 4614	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 4615		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 4616		int troom = skb->tail + skb->data_len - skb->end;
 4617
 4618		/* In case we have to go down the path and also linearize,
 4619		 * then lets do the pskb_expand_head() work just once here.
 4620		 */
 4621		if (pskb_expand_head(skb,
 4622				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 4623				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
 4624			goto do_drop;
 4625		if (skb_linearize(skb))
 4626			goto do_drop;
 4627	}
 4628
 4629	/* The XDP program wants to see the packet starting at the MAC
 4630	 * header.
 4631	 */
 4632	mac_len = skb->data - skb_mac_header(skb);
 4633	hlen = skb_headlen(skb) + mac_len;
 4634	xdp->data = skb->data - mac_len;
 4635	xdp->data_meta = xdp->data;
 4636	xdp->data_end = xdp->data + hlen;
 4637	xdp->data_hard_start = skb->data - skb_headroom(skb);
 4638
 4639	/* SKB "head" area always have tailroom for skb_shared_info */
 4640	xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
 4641	xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 4642
	/* Remember the pre-run state so changes made by the program can be
	 * detected afterwards.
	 */
 4643	orig_data_end = xdp->data_end;
 4644	orig_data = xdp->data;
 4645	eth = (struct ethhdr *)xdp->data;
 4646	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
 4647	orig_eth_type = eth->h_proto;
 4648
 4649	rxqueue = netif_get_rxqueue(skb);
 4650	xdp->rxq = &rxqueue->xdp_rxq;
 4651
 4652	act = bpf_prog_run_xdp(xdp_prog, xdp);
 4653
 4654	/* check if bpf_xdp_adjust_head was used */
 4655	off = xdp->data - orig_data;
 4656	if (off) {
 4657		if (off > 0)
 4658			__skb_pull(skb, off);
 4659		else if (off < 0)
 4660			__skb_push(skb, -off);
 4661
 4662		skb->mac_header += off;
 4663		skb_reset_network_header(skb);
 4664	}
 4665
 4666	/* check if bpf_xdp_adjust_tail was used */
 4667	off = xdp->data_end - orig_data_end;
 4668	if (off != 0) {
 4669		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
 4670		skb->len += off; /* positive on grow, negative on shrink */
 4671	}
 4672
 4673	/* check if XDP changed eth hdr such SKB needs update */
 4674	eth = (struct ethhdr *)xdp->data;
 4675	if ((orig_eth_type != eth->h_proto) ||
 4676	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
 4677		__skb_push(skb, ETH_HLEN);
 4678		skb->protocol = eth_type_trans(skb, skb->dev);
 4679	}
 4680
 4681	switch (act) {
 4682	case XDP_REDIRECT:
 4683	case XDP_TX:
 4684		__skb_push(skb, mac_len);
 4685		break;
 4686	case XDP_PASS:
 4687		metalen = xdp->data - xdp->data_meta;
 4688		if (metalen)
 4689			skb_metadata_set(skb, metalen);
 4690		break;
 4691	default:
 4692		bpf_warn_invalid_xdp_action(act);
 4693		fallthrough;
 4694	case XDP_ABORTED:
 4695		trace_xdp_exception(skb->dev, xdp_prog, act);
 4696		fallthrough;
 4697	case XDP_DROP:
	/* Also the target of the early failure paths, where act is still
	 * its initial XDP_DROP.
	 */
 4698	do_drop:
 4699		kfree_skb(skb);
 4700		break;
 4701	}
 4702
 4703	return act;
 4704}
 4705
 4706/* When doing generic XDP we have to bypass the qdisc layer and the
 4707 * network taps in order to match in-driver-XDP behavior.
 *
 * Picks a TX queue with netdev_core_pick_tx() and calls
 * netdev_start_xmit() under HARD_TX_LOCK unless the queue is stopped.  If
 * the transmit was not attempted or did not complete, an XDP_TX exception
 * is traced and the skb is freed.
 4708 */
 4709void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 4710{
 4711	struct net_device *dev = skb->dev;
 4712	struct netdev_queue *txq;
 4713	bool free_skb = true;
 4714	int cpu, rc;
 4715
 4716	txq = netdev_core_pick_tx(dev, skb, NULL);
 4717	cpu = smp_processor_id();
 4718	HARD_TX_LOCK(dev, txq, cpu);
 4719	if (!netif_xmit_stopped(txq)) {
 4720		rc = netdev_start_xmit(skb, dev, txq, 0);
 4721		if (dev_xmit_complete(rc))
 4722			free_skb = false;
 4723	}
 4724	HARD_TX_UNLOCK(dev, txq);
 4725	if (free_skb) {
 4726		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 4727		kfree_skb(skb);
 4728	}
 4729}
 4730
 /*
  * Static key, initially false.  Its users lie outside this part of the
  * file; presumably it gates generic XDP processing - TODO confirm.
  */
 4731static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 4732
 /*
  * do_xdp_generic - run generic XDP on @skb and act on the verdict
  *
  * Returns XDP_PASS when @xdp_prog is NULL or the program passed the packet.
  * For every other verdict XDP_DROP is returned and the skb has been dealt
  * with here: handed to xdp_do_generic_redirect() (and freed if that
  * fails), sent through generic_xdp_tx(), or already freed inside
  * netif_receive_generic_xdp().
  */
 4733int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 4734{
 4735	if (xdp_prog) {
 4736		struct xdp_buff xdp;
 4737		u32 act;
 4738		int err;
 4739
 4740		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 4741		if (act != XDP_PASS) {
 4742			switch (act) {
 4743			case XDP_REDIRECT:
 4744				err = xdp_do_generic_redirect(skb->dev, skb,
 4745							      &xdp, xdp_prog);
 4746				if (err)
 4747					goto out_redir;
 4748				break;
 4749			case XDP_TX:
 4750				generic_xdp_tx(skb, xdp_prog);
 4751				break;
 4752			}
 4753			return XDP_DROP;
 4754		}
 4755	}
 4756	return XDP_PASS;
 4757out_redir:
 4758	kfree_skb(skb);
 4759	return XDP_DROP;
 4760}
 4761EXPORT_SYMBOL_GPL(do_xdp_generic);
 4762
 /*
  * netif_rx_internal - queue @skb on a CPU's backlog
  *
  * With CONFIG_RPS and the rps_needed key enabled, the target CPU comes from
  * get_rps_cpu(), falling back to the current CPU when that returns a
  * negative value.  Otherwise the skb goes to the current CPU's backlog.
  *
  * Returns the result of enqueue_to_backlog().
  */
 4763static int netif_rx_internal(struct sk_buff *skb)
 4764{
 4765	int ret;
 4766
 4767	net_timestamp_check(netdev_tstamp_prequeue, skb);
 4768
 4769	trace_netif_rx(skb);
 4770
 4771#ifdef CONFIG_RPS
 4772	if (static_branch_unlikely(&rps_needed)) {
		/* voidflow gives &rflow->last_qtail a valid target even when
		 * get_rps_cpu() does not set rflow.
		 */
 4773		struct rps_dev_flow voidflow, *rflow = &voidflow;
 4774		int cpu;
 4775
 4776		preempt_disable();
 4777		rcu_read_lock();
 4778
 4779		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 4780		if (cpu < 0)
 4781			cpu = smp_processor_id();
 4782
 4783		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 4784
 4785		rcu_read_unlock();
 4786		preempt_enable();
 4787	} else
 4788#endif
 4789	{
 4790		unsigned int qtail;
 4791
 4792		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
 4793		put_cpu();
 4794	}
 4795	return ret;
 4796}
 4797
 4798/**
 4799 *	netif_rx	-	post buffer to the network code
 4800 *	@skb: buffer to post
 4801 *
 4802 *	This function receives a packet from a device driver and queues it for
 4803 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 4804 *	may be dropped during processing for congestion control or by the
 4805 *	protocol layers.
 4806 *
 4807 *	return values:
 4808 *	NET_RX_SUCCESS	(no congestion)
 4809 *	NET_RX_DROP     (packet was dropped)
 4810 *
 *	The body is netif_rx_internal() bracketed by the netif_rx entry and
 *	exit tracepoints.
 4811 */
 4812
 4813int netif_rx(struct sk_buff *skb)
 4814{
 4815	int ret;
 4816
 4817	trace_netif_rx_entry(skb);
 4818
 4819	ret = netif_rx_internal(skb);
 4820	trace_netif_rx_exit(ret);
 4821
 4822	return ret;
 4823}
 4824EXPORT_SYMBOL(netif_rx);
 4825
 /*
  * netif_rx_ni - queue @skb like netif_rx(), then run pending softirqs
  *
  * Calls netif_rx_internal() with preemption disabled and, if
  * local_softirq_pending() is set afterwards, runs do_softirq() before
  * re-enabling preemption.  Returns the netif_rx_internal() result.
  */
 4826int netif_rx_ni(struct sk_buff *skb)
 4827{
 4828	int err;
 4829
 4830	trace_netif_rx_ni_entry(skb);
 4831
 4832	preempt_disable();
 4833	err = netif_rx_internal(skb);
 4834	if (local_softirq_pending())
 4835		do_softirq();
 4836	preempt_enable();
 4837	trace_netif_rx_ni_exit(err);
 4838
 4839	return err;
 4840}
 4841EXPORT_SYMBOL(netif_rx_ni);
 4842
 /*
  * net_tx_action - softirq action for the transmit side
  *
  * First frees every skb on this CPU's completion_queue, then runs every
  * qdisc on its output_queue.  Each list is detached with local irqs
  * disabled and processed after they are re-enabled.  Finally calls
  * xfrm_dev_backlog() for this CPU's softnet_data.
  */
 4843static __latent_entropy void net_tx_action(struct softirq_action *h)
 4844{
 4845	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 4846
 4847	if (sd->completion_queue) {
 4848		struct sk_buff *clist;
 4849
 4850		local_irq_disable();
 4851		clist = sd->completion_queue;
 4852		sd->completion_queue = NULL;
 4853		local_irq_enable();
 4854
 4855		while (clist) {
 4856			struct sk_buff *skb = clist;
 4857
 4858			clist = clist->next;
 4859
			/* skbs on this list are expected to have no users left. */
 4860			WARN_ON(refcount_read(&skb->users));
 4861			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
 4862				trace_consume_skb(skb);
 4863			else
 4864				trace_kfree_skb(skb, net_tx_action);
 4865
 4866			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 4867				__kfree_skb(skb);
 4868			else
 4869				__kfree_skb_defer(skb);
 4870		}
 4871
 4872		__kfree_skb_flush();
 4873	}
 4874
 4875	if (sd->output_queue) {
 4876		struct Qdisc *head;
 4877
 4878		local_irq_disable();
 4879		head = sd->output_queue;
 4880		sd->output_queue = NULL;
 4881		sd->output_queue_tailp = &sd->output_queue;
 4882		local_irq_enable();
 4883
 4884		while (head) {
 4885			struct Qdisc *q = head;
 4886			spinlock_t *root_lock = NULL;
 4887
 4888			head = head->next_sched;
 4889
			/* TCQ_F_NOLOCK qdiscs are run without the root lock. */
 4890			if (!(q->flags & TCQ_F_NOLOCK)) {
 4891				root_lock = qdisc_lock(q);
 4892				spin_lock(root_lock);













 4893			}
 4894			/* We need to make sure head->next_sched is read
 4895			 * before clearing __QDISC_STATE_SCHED
 4896			 */
 4897			smp_mb__before_atomic();
 4898			clear_bit(__QDISC_STATE_SCHED, &q->state);
 4899			qdisc_run(q);
 4900			if (root_lock)
 4901				spin_unlock(root_lock);
 4902		}
 4903	}
 4904
 4905	xfrm_dev_backlog(sd);
 4906}
 4907
/* Only built when both CONFIG_BRIDGE and CONFIG_ATM_LANE are enabled. */
 4908#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)

 4909/* This hook is defined here for ATM LANE */
/* Function-pointer hook taking a net_device and an address and returning
 * int.  It is only declared and exported (GPL-only) here; nothing in this
 * part of the file assigns or calls it.
 * NOTE(review): presumably installed by the bridge code and called from
 * LANE to test an address against the forwarding database - confirm.
 */
 4910int (*br_fdb_test_addr_hook)(struct net_device *dev,
 4911			     unsigned char *addr) __read_mostly;
 4912EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 4913#endif
 4914
 4915static inline struct sk_buff *
 4916sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 4917		   struct net_device *orig_dev)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 4918{
 4919#ifdef CONFIG_NET_CLS_ACT
 4920	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
 4921	struct tcf_result cl_res;
 4922
 4923	/* If there's at least one ingress present somewhere (so
 4924	 * we get here via enabled static key), remaining devices
 4925	 * that are not configured with an ingress qdisc will bail
 4926	 * out here.
 4927	 */
 4928	if (!miniq)
 4929		return skb;
 4930
 4931	if (*pt_prev) {
 4932		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 4933		*pt_prev = NULL;
 4934	}
 4935
 4936	qdisc_skb_cb(skb)->pkt_len = skb->len;
 4937	skb->tc_at_ingress = 1;
 4938	mini_qdisc_bstats_cpu_update(miniq, skb);
 4939
 4940	switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
 4941				     &cl_res, false)) {
 4942	case TC_ACT_OK:
 4943	case TC_ACT_RECLASSIFY:
 4944		skb->tc_index = TC_H_MIN(cl_res.classid);
 4945		break;
 4946	case TC_ACT_SHOT:
 4947		mini_qdisc_qstats_cpu_drop(miniq);
 4948		kfree_skb(skb);
 4949		return NULL;
 4950	case TC_ACT_STOLEN:
 4951	case TC_ACT_QUEUED:
 4952	case TC_ACT_TRAP:
 4953		consume_skb(skb);
 4954		return NULL;
 4955	case TC_ACT_REDIRECT:
 4956		/* skb_mac_header check was done by cls/act_bpf, so
 4957		 * we can safely push the L2 header back before
 4958		 * redirecting to another netdev
 4959		 */
 4960		__skb_push(skb, skb->mac_len);
 4961		skb_do_redirect(skb);
 4962		return NULL;
 4963	case TC_ACT_CONSUMED:
 4964		return NULL;
 4965	default:
 4966		break;
 4967	}
 4968#endif /* CONFIG_NET_CLS_ACT */
 
 
 4969	return skb;
 4970}
 4971
 4972/**
 4973 *	netdev_is_rx_handler_busy - check if receive handler is registered
 4974 *	@dev: device to check
 4975 *
 4976 *	Check if a receive handler is already registered for a given device.
 4977 *	Return true if there one.
 4978 *
 4979 *	The caller must hold the rtnl_mutex.
 4980 */
 4981bool netdev_is_rx_handler_busy(struct net_device *dev)
 4982{
 4983	ASSERT_RTNL();
 4984	return dev && rtnl_dereference(dev->rx_handler);
 4985}
 4986EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 4987
 4988/**
 4989 *	netdev_rx_handler_register - register receive handler
 4990 *	@dev: device to register a handler for
 4991 *	@rx_handler: receive handler to register
 4992 *	@rx_handler_data: data pointer that is used by rx handler
 4993 *
 4994 *	Register a receive handler for a device. This handler will then be
 4995 *	called from __netif_receive_skb. A negative errno code is returned
 4996 *	on a failure.
 4997 *
 4998 *	The caller must hold the rtnl_mutex.
 4999 *
 5000 *	For a general description of rx_handler, see enum rx_handler_result.
 5001 */
 5002int netdev_rx_handler_register(struct net_device *dev,
 5003			       rx_handler_func_t *rx_handler,
 5004			       void *rx_handler_data)
 5005{
 5006	if (netdev_is_rx_handler_busy(dev))
 
 
 5007		return -EBUSY;
 5008
 5009	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 5010		return -EINVAL;
 5011
 5012	/* Note: rx_handler_data must be set before rx_handler */
 5013	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 5014	rcu_assign_pointer(dev->rx_handler, rx_handler);
 5015
 5016	return 0;
 5017}
 5018EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
 5019
 5020/**
 5021 *	netdev_rx_handler_unregister - unregister receive handler
 5022 *	@dev: device to unregister a handler from
 5023 *
 5024 *	Unregister a receive handler from a device.
 5025 *
 5026 *	The caller must hold the rtnl_mutex.
 5027 */
 5028void netdev_rx_handler_unregister(struct net_device *dev)
 5029{
 5030
 5031	ASSERT_RTNL();
 5032	RCU_INIT_POINTER(dev->rx_handler, NULL);
 5033	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
 5034	 * section has a guarantee to see a non NULL rx_handler_data
 5035	 * as well.
 5036	 */
 5037	synchronize_net();
 5038	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 5039}
 5040EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 5041
 5042/*
 5043 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 5044 * the special handling of PFMEMALLOC skbs.
 5045 */
 5046static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 5047{
 5048	switch (skb->protocol) {
 5049	case htons(ETH_P_ARP):
 5050	case htons(ETH_P_IP):
 5051	case htons(ETH_P_IPV6):
 5052	case htons(ETH_P_8021Q):
 5053	case htons(ETH_P_8021AD):
 5054		return true;
 5055	default:
 5056		return false;
 5057	}
 5058}
 5059
 5060static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 5061			     int *ret, struct net_device *orig_dev)
 5062{
 5063	if (nf_hook_ingress_active(skb)) {
 5064		int ingress_retval;
 5065
 5066		if (*pt_prev) {
 5067			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 5068			*pt_prev = NULL;
 5069		}
 5070
 5071		rcu_read_lock();
 5072		ingress_retval = nf_hook_ingress(skb);
 5073		rcu_read_unlock();
 5074		return ingress_retval;
 5075	}
 5076	return 0;
 5077}
 5078
 5079static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 5080				    struct packet_type **ppt_prev)
 5081{
 5082	struct packet_type *ptype, *pt_prev;
 5083	rx_handler_func_t *rx_handler;
 5084	struct sk_buff *skb = *pskb;
 5085	struct net_device *orig_dev;
 
 5086	bool deliver_exact = false;
 5087	int ret = NET_RX_DROP;
 5088	__be16 type;
 5089
 5090	net_timestamp_check(!netdev_tstamp_prequeue, skb);
 5091
 5092	trace_netif_receive_skb(skb);
 5093
 
 
 
 
 
 
 5094	orig_dev = skb->dev;
 5095
 5096	skb_reset_network_header(skb);
 5097	if (!skb_transport_header_was_set(skb))
 5098		skb_reset_transport_header(skb);
 5099	skb_reset_mac_len(skb);
 5100
 5101	pt_prev = NULL;
 5102
 
 
 5103another_round:
 5104	skb->skb_iif = skb->dev->ifindex;
 5105
 5106	__this_cpu_inc(softnet_data.processed);
 5107
 5108	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 5109		int ret2;
 5110
 5111		preempt_disable();
 5112		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 5113		preempt_enable();
 5114
 5115		if (ret2 != XDP_PASS) {
 5116			ret = NET_RX_DROP;
 5117			goto out;
 5118		}
 5119		skb_reset_mac_len(skb);
 5120	}
 5121
 5122	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5123	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5124		skb = skb_vlan_untag(skb);
 5125		if (unlikely(!skb))
 5126			goto out;
 5127	}
 5128
 5129	if (skb_skip_tc_classify(skb))
 5130		goto skip_classify;
 5131
 5132	if (pfmemalloc)
 5133		goto skip_taps;
 5134
 5135	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 5136		if (pt_prev)
 5137			ret = deliver_skb(skb, pt_prev, orig_dev);
 5138		pt_prev = ptype;
 
 
 5139	}
 5140
 5141	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
 5142		if (pt_prev)
 5143			ret = deliver_skb(skb, pt_prev, orig_dev);
 5144		pt_prev = ptype;
 5145	}
 5146
 5147skip_taps:
 5148#ifdef CONFIG_NET_INGRESS
 5149	if (static_branch_unlikely(&ingress_needed_key)) {
 5150		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
 5151		if (!skb)
 5152			goto out;
 5153
 5154		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
 5155			goto out;
 5156	}
 5157#endif
 5158	skb_reset_redirect(skb);
 5159skip_classify:
 5160	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 5161		goto drop;
 5162
 5163	if (skb_vlan_tag_present(skb)) {
 
 5164		if (pt_prev) {
 5165			ret = deliver_skb(skb, pt_prev, orig_dev);
 5166			pt_prev = NULL;
 5167		}
 5168		if (vlan_do_receive(&skb))
 5169			goto another_round;
 5170		else if (unlikely(!skb))
 5171			goto out;
 5172	}
 5173
 5174	rx_handler = rcu_dereference(skb->dev->rx_handler);
 5175	if (rx_handler) {
 5176		if (pt_prev) {
 5177			ret = deliver_skb(skb, pt_prev, orig_dev);
 5178			pt_prev = NULL;
 5179		}
 5180		switch (rx_handler(&skb)) {
 5181		case RX_HANDLER_CONSUMED:
 5182			ret = NET_RX_SUCCESS;
 5183			goto out;
 5184		case RX_HANDLER_ANOTHER:
 5185			goto another_round;
 5186		case RX_HANDLER_EXACT:
 5187			deliver_exact = true;
 5188		case RX_HANDLER_PASS:
 5189			break;
 5190		default:
 5191			BUG();
 5192		}
 5193	}
 5194
 5195	if (unlikely(skb_vlan_tag_present(skb))) {
 5196check_vlan_id:
 5197		if (skb_vlan_tag_get_id(skb)) {
 5198			/* Vlan id is non 0 and vlan_do_receive() above couldn't
 5199			 * find vlan device.
 5200			 */
 5201			skb->pkt_type = PACKET_OTHERHOST;
 5202		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5203			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5204			/* Outer header is 802.1P with vlan 0, inner header is
 5205			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 5206			 * not find vlan dev for vlan id 0.
 5207			 */
 5208			__vlan_hwaccel_clear_tag(skb);
 5209			skb = skb_vlan_untag(skb);
 5210			if (unlikely(!skb))
 5211				goto out;
 5212			if (vlan_do_receive(&skb))
 5213				/* After stripping off 802.1P header with vlan 0
 5214				 * vlan dev is found for inner header.
 5215				 */
 5216				goto another_round;
 5217			else if (unlikely(!skb))
 5218				goto out;
 5219			else
 5220				/* We have stripped outer 802.1P vlan 0 header.
 5221				 * But could not find vlan dev.
 5222				 * check again for vlan id to set OTHERHOST.
 5223				 */
 5224				goto check_vlan_id;
 5225		}
 5226		/* Note: we might in the future use prio bits
 5227		 * and set skb->priority like in vlan_do_receive()
 5228		 * For the time being, just ignore Priority Code Point
 5229		 */
 5230		__vlan_hwaccel_clear_tag(skb);
 5231	}
 5232
 5233	type = skb->protocol;
 5234
 5235	/* deliver only exact match when indicated */
 5236	if (likely(!deliver_exact)) {
 5237		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5238				       &ptype_base[ntohs(type) &
 5239						   PTYPE_HASH_MASK]);
 5240	}
 5241
 5242	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5243			       &orig_dev->ptype_specific);
 5244
 5245	if (unlikely(skb->dev != orig_dev)) {
 5246		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5247				       &skb->dev->ptype_specific);
 5248	}
 5249
 5250	if (pt_prev) {
 5251		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 5252			goto drop;
 5253		*ppt_prev = pt_prev;
 5254	} else {
 5255drop:
 5256		if (!deliver_exact)
 5257			atomic_long_inc(&skb->dev->rx_dropped);
 5258		else
 5259			atomic_long_inc(&skb->dev->rx_nohandler);
 5260		kfree_skb(skb);
 5261		/* Jamal, now you will not able to escape explaining
 5262		 * me how you were going to use this. :-)
 5263		 */
 5264		ret = NET_RX_DROP;
 5265	}
 5266
 5267out:
 5268	/* The invariant here is that if *ppt_prev is not NULL
 5269	 * then skb should also be non-NULL.
 5270	 *
 5271	 * Apparently *ppt_prev assignment above holds this invariant due to
 5272	 * skb dereferencing near it.
 5273	 */
 5274	*pskb = skb;
 5275	return ret;
 5276}
 5277
 5278static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 5279{
 5280	struct net_device *orig_dev = skb->dev;
 5281	struct packet_type *pt_prev = NULL;
 5282	int ret;
 5283
 5284	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5285	if (pt_prev)
 5286		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 5287					 skb->dev, pt_prev, orig_dev);
 5288	return ret;
 5289}
 5290
 5291/**
 5292 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 5293 *	@skb: buffer to process
 5294 *
 5295 *	More direct receive version of netif_receive_skb().  It should
 5296 *	only be used by callers that have a need to skip RPS and Generic XDP.
 5297 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 5298 *
 5299 *	This function may only be called from softirq context and interrupts
 5300 *	should be enabled.
 5301 *
 5302 *	Return values (usually ignored):
 5303 *	NET_RX_SUCCESS: no congestion
 5304 *	NET_RX_DROP: packet was dropped
 5305 */
 5306int netif_receive_skb_core(struct sk_buff *skb)
 5307{
 5308	int ret;
 5309
 5310	rcu_read_lock();
 5311	ret = __netif_receive_skb_one_core(skb, false);
 5312	rcu_read_unlock();
 5313
 5314	return ret;
 5315}
 5316EXPORT_SYMBOL(netif_receive_skb_core);
 5317
 5318static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 5319						  struct packet_type *pt_prev,
 5320						  struct net_device *orig_dev)
 5321{
 5322	struct sk_buff *skb, *next;
 5323
 5324	if (!pt_prev)
 5325		return;
 5326	if (list_empty(head))
 5327		return;
 5328	if (pt_prev->list_func != NULL)
 5329		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 5330				   ip_list_rcv, head, pt_prev, orig_dev);
 5331	else
 5332		list_for_each_entry_safe(skb, next, head, list) {
 5333			skb_list_del_init(skb);
 5334			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 5335		}
 5336}
 5337
 5338static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 5339{
 5340	/* Fast-path assumptions:
 5341	 * - There is no RX handler.
 5342	 * - Only one packet_type matches.
 5343	 * If either of these fails, we will end up doing some per-packet
 5344	 * processing in-line, then handling the 'last ptype' for the whole
 5345	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 5346	 * because the 'last ptype' must be constant across the sublist, and all
 5347	 * other ptypes are handled per-packet.
 5348	 */
 5349	/* Current (common) ptype of sublist */
 5350	struct packet_type *pt_curr = NULL;
 5351	/* Current (common) orig_dev of sublist */
 5352	struct net_device *od_curr = NULL;
 5353	struct list_head sublist;
 5354	struct sk_buff *skb, *next;
 5355
 5356	INIT_LIST_HEAD(&sublist);
 5357	list_for_each_entry_safe(skb, next, head, list) {
 5358		struct net_device *orig_dev = skb->dev;
 5359		struct packet_type *pt_prev = NULL;
 5360
 5361		skb_list_del_init(skb);
 5362		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5363		if (!pt_prev)
 5364			continue;
 5365		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5366			/* dispatch old sublist */
 5367			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5368			/* start new sublist */
 5369			INIT_LIST_HEAD(&sublist);
 5370			pt_curr = pt_prev;
 5371			od_curr = orig_dev;
 5372		}
 5373		list_add_tail(&skb->list, &sublist);
 5374	}
 5375
 5376	/* dispatch final sublist */
 5377	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5378}
 5379
 5380static int __netif_receive_skb(struct sk_buff *skb)
 5381{
 5382	int ret;
 5383
 5384	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5385		unsigned int noreclaim_flag;
 5386
 5387		/*
 5388		 * PFMEMALLOC skbs are special, they should
 5389		 * - be delivered to SOCK_MEMALLOC sockets only
 5390		 * - stay away from userspace
 5391		 * - have bounded memory usage
 5392		 *
 5393		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5394		 * context down to all allocation sites.
 5395		 */
 5396		noreclaim_flag = memalloc_noreclaim_save();
 5397		ret = __netif_receive_skb_one_core(skb, true);
 5398		memalloc_noreclaim_restore(noreclaim_flag);
 5399	} else
 5400		ret = __netif_receive_skb_one_core(skb, false);
 5401
 5402	return ret;
 5403}
 5404
 5405static void __netif_receive_skb_list(struct list_head *head)
 5406{
 5407	unsigned long noreclaim_flag = 0;
 5408	struct sk_buff *skb, *next;
 5409	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
 5410
 5411	list_for_each_entry_safe(skb, next, head, list) {
 5412		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
 5413			struct list_head sublist;
 5414
 5415			/* Handle the previous sublist */
 5416			list_cut_before(&sublist, head, &skb->list);
 5417			if (!list_empty(&sublist))
 5418				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 5419			pfmemalloc = !pfmemalloc;
 5420			/* See comments in __netif_receive_skb */
 5421			if (pfmemalloc)
 5422				noreclaim_flag = memalloc_noreclaim_save();
 5423			else
 5424				memalloc_noreclaim_restore(noreclaim_flag);
 5425		}
 5426	}
 5427	/* Handle the remaining sublist */
 5428	if (!list_empty(head))
 5429		__netif_receive_skb_list_core(head, pfmemalloc);
 5430	/* Restore pflags */
 5431	if (pfmemalloc)
 5432		memalloc_noreclaim_restore(noreclaim_flag);
 5433}
 5434
 5435static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5436{
 5437	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5438	struct bpf_prog *new = xdp->prog;
 5439	int ret = 0;
 5440
 5441	if (new) {
 5442		u32 i;
 5443
 5444		/* generic XDP does not work with DEVMAPs that can
 5445		 * have a bpf_prog installed on an entry
 5446		 */
 5447		for (i = 0; i < new->aux->used_map_cnt; i++) {
 5448			if (dev_map_can_have_prog(new->aux->used_maps[i]))
 5449				return -EINVAL;
 5450			if (cpu_map_prog_allowed(new->aux->used_maps[i]))
 5451				return -EINVAL;
 5452		}
 5453	}
 5454
 5455	switch (xdp->command) {
 5456	case XDP_SETUP_PROG:
 5457		rcu_assign_pointer(dev->xdp_prog, new);
 5458		if (old)
 5459			bpf_prog_put(old);
 5460
 5461		if (old && !new) {
 5462			static_branch_dec(&generic_xdp_needed_key);
 5463		} else if (new && !old) {
 5464			static_branch_inc(&generic_xdp_needed_key);
 5465			dev_disable_lro(dev);
 5466			dev_disable_gro_hw(dev);
 5467		}
 5468		break;
 5469
 5470	default:
 5471		ret = -EINVAL;
 5472		break;
 5473	}
 5474
 5475	return ret;
 5476}
 5477
 5478static int netif_receive_skb_internal(struct sk_buff *skb)
 5479{
 5480	int ret;
 5481
 5482	net_timestamp_check(netdev_tstamp_prequeue, skb);
 5483
 5484	if (skb_defer_rx_timestamp(skb))
 5485		return NET_RX_SUCCESS;
 5486
 5487	rcu_read_lock();
 5488#ifdef CONFIG_RPS
 5489	if (static_branch_unlikely(&rps_needed)) {
 5490		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5491		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 
 
 
 
 5492
 5493		if (cpu >= 0) {
 5494			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5495			rcu_read_unlock();
 5496			return ret;
 5497		}
 
 5498	}
 5499#endif
 5500	ret = __netif_receive_skb(skb);
 5501	rcu_read_unlock();
 5502	return ret;
 5503}
 5504
 5505static void netif_receive_skb_list_internal(struct list_head *head)
 5506{
 5507	struct sk_buff *skb, *next;
 5508	struct list_head sublist;
 5509
 5510	INIT_LIST_HEAD(&sublist);
 5511	list_for_each_entry_safe(skb, next, head, list) {
 5512		net_timestamp_check(netdev_tstamp_prequeue, skb);
 5513		skb_list_del_init(skb);
 5514		if (!skb_defer_rx_timestamp(skb))
 5515			list_add_tail(&skb->list, &sublist);
 5516	}
 5517	list_splice_init(&sublist, head);
 5518
 5519	rcu_read_lock();
 5520#ifdef CONFIG_RPS
 5521	if (static_branch_unlikely(&rps_needed)) {
 5522		list_for_each_entry_safe(skb, next, head, list) {
 5523			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5524			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5525
 5526			if (cpu >= 0) {
 5527				/* Will be handled, remove from list */
 5528				skb_list_del_init(skb);
 5529				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5530			}
 5531		}
 5532	}
 5533#endif
 5534	__netif_receive_skb_list(head);
 5535	rcu_read_unlock();
 5536}
 5537
 5538/**
 5539 *	netif_receive_skb - process receive buffer from network
 5540 *	@skb: buffer to process
 5541 *
 5542 *	netif_receive_skb() is the main receive data processing function.
 5543 *	It always succeeds. The buffer may be dropped during processing
 5544 *	for congestion control or by the protocol layers.
 5545 *
 5546 *	This function may only be called from softirq context and interrupts
 5547 *	should be enabled.
 5548 *
 5549 *	Return values (usually ignored):
 5550 *	NET_RX_SUCCESS: no congestion
 5551 *	NET_RX_DROP: packet was dropped
 5552 */
 5553int netif_receive_skb(struct sk_buff *skb)
 5554{
 5555	int ret;
 5556
 5557	trace_netif_receive_skb_entry(skb);
 5558
 5559	ret = netif_receive_skb_internal(skb);
 5560	trace_netif_receive_skb_exit(ret);
 5561
 5562	return ret;
 5563}
 5564EXPORT_SYMBOL(netif_receive_skb);
 5565
 5566/**
 5567 *	netif_receive_skb_list - process many receive buffers from network
 5568 *	@head: list of skbs to process.
 5569 *
 5570 *	Since return value of netif_receive_skb() is normally ignored, and
 5571 *	wouldn't be meaningful for a list, this function returns void.
 5572 *
 5573 *	This function may only be called from softirq context and interrupts
 5574 *	should be enabled.
 5575 */
 5576void netif_receive_skb_list(struct list_head *head)
 5577{
 5578	struct sk_buff *skb;
 5579
 5580	if (list_empty(head))
 5581		return;
 5582	if (trace_netif_receive_skb_list_entry_enabled()) {
 5583		list_for_each_entry(skb, head, list)
 5584			trace_netif_receive_skb_list_entry(skb);
 5585	}
 5586	netif_receive_skb_list_internal(head);
 5587	trace_netif_receive_skb_list_exit(0);
 5588}
 5589EXPORT_SYMBOL(netif_receive_skb_list);
 5590
 5591static DEFINE_PER_CPU(struct work_struct, flush_works);
 5592
 5593/* Network device is going away, flush any packets still pending */
 5594static void flush_backlog(struct work_struct *work)
 5595{
 
 
 5596	struct sk_buff *skb, *tmp;
 5597	struct softnet_data *sd;
 5598
 5599	local_bh_disable();
 5600	sd = this_cpu_ptr(&softnet_data);
 5601
 5602	local_irq_disable();
 5603	rps_lock(sd);
 5604	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 5605		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5606			__skb_unlink(skb, &sd->input_pkt_queue);
 5607			dev_kfree_skb_irq(skb);
 5608			input_queue_head_incr(sd);
 5609		}
 5610	}
 5611	rps_unlock(sd);
 5612	local_irq_enable();
 5613
 5614	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 5615		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5616			__skb_unlink(skb, &sd->process_queue);
 5617			kfree_skb(skb);
 5618			input_queue_head_incr(sd);
 5619		}
 5620	}
 5621	local_bh_enable();
 5622}
 5623
 5624static void flush_all_backlogs(void)
 5625{
 5626	unsigned int cpu;
 5627
 5628	get_online_cpus();
 5629
 5630	for_each_online_cpu(cpu)
 5631		queue_work_on(cpu, system_highpri_wq,
 5632			      per_cpu_ptr(&flush_works, cpu));
 5633
 5634	for_each_online_cpu(cpu)
 5635		flush_work(per_cpu_ptr(&flush_works, cpu));
 5636
 5637	put_online_cpus();
 5638}
 5639
 5640/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
 5641static void gro_normal_list(struct napi_struct *napi)
 5642{
 5643	if (!napi->rx_count)
 5644		return;
 5645	netif_receive_skb_list_internal(&napi->rx_list);
 5646	INIT_LIST_HEAD(&napi->rx_list);
 5647	napi->rx_count = 0;
 5648}
 5649
 5650/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
 5651 * pass the whole batch up to the stack.
 5652 */
 5653static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
 5654{
 5655	list_add_tail(&skb->list, &napi->rx_list);
 5656	if (++napi->rx_count >= gro_normal_batch)
 5657		gro_normal_list(napi);
 5658}
 5659
 5660INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
 5661INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
 5662static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
 5663{
 5664	struct packet_offload *ptype;
 5665	__be16 type = skb->protocol;
 5666	struct list_head *head = &offload_base;
 5667	int err = -ENOENT;
 5668
 5669	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
 5670
 5671	if (NAPI_GRO_CB(skb)->count == 1) {
 5672		skb_shinfo(skb)->gso_size = 0;
 5673		goto out;
 5674	}
 5675
 5676	rcu_read_lock();
 5677	list_for_each_entry_rcu(ptype, head, list) {
 5678		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5679			continue;
 5680
 5681		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
 5682					 ipv6_gro_complete, inet_gro_complete,
 5683					 skb, 0);
 5684		break;
 5685	}
 5686	rcu_read_unlock();
 5687
 5688	if (err) {
 5689		WARN_ON(&ptype->list == head);
 5690		kfree_skb(skb);
 5691		return NET_RX_SUCCESS;
 5692	}
 5693
 5694out:
 5695	gro_normal_one(napi, skb);
 5696	return NET_RX_SUCCESS;
 5697}
 5698
 5699static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 5700				   bool flush_old)
 5701{
 5702	struct list_head *head = &napi->gro_hash[index].list;
 5703	struct sk_buff *skb, *p;
 5704
 5705	list_for_each_entry_safe_reverse(skb, p, head, list) {
 5706		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 5707			return;
 5708		skb_list_del_init(skb);
 5709		napi_gro_complete(napi, skb);
 5710		napi->gro_hash[index].count--;
 5711	}
 5712
 5713	if (!napi->gro_hash[index].count)
 5714		__clear_bit(index, &napi->gro_bitmask);
 5715}
 5716
 5717/* napi->gro_hash[].list contains packets ordered by age.
 5718 * youngest packets at the head of it.
 5719 * Complete skbs in reverse order to reduce latencies.
 5720 */
 5721void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 5722{
 5723	unsigned long bitmask = napi->gro_bitmask;
 5724	unsigned int i, base = ~0U;
 5725
 5726	while ((i = ffs(bitmask)) != 0) {
 5727		bitmask >>= i;
 5728		base += i;
 5729		__napi_gro_flush_chain(napi, base, flush_old);
 5730	}
 5731}
 5732EXPORT_SYMBOL(napi_gro_flush);
 5733
 5734static struct list_head *gro_list_prepare(struct napi_struct *napi,
 5735					  struct sk_buff *skb)
 5736{
 5737	unsigned int maclen = skb->dev->hard_header_len;
 5738	u32 hash = skb_get_hash_raw(skb);
 5739	struct list_head *head;
 5740	struct sk_buff *p;
 5741
 5742	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
 5743	list_for_each_entry(p, head, list) {
 5744		unsigned long diffs;
 5745
 5746		NAPI_GRO_CB(p)->flush = 0;
 5747
 5748		if (hash != skb_get_hash_raw(p)) {
 5749			NAPI_GRO_CB(p)->same_flow = 0;
 5750			continue;
 5751		}
 5752
 5753		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 5754		diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
 5755		if (skb_vlan_tag_present(p))
 5756			diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
 5757		diffs |= skb_metadata_dst_cmp(p, skb);
 5758		diffs |= skb_metadata_differs(p, skb);
 5759		if (maclen == ETH_HLEN)
 5760			diffs |= compare_ether_header(skb_mac_header(p),
 5761						      skb_mac_header(skb));
 5762		else if (!diffs)
 5763			diffs = memcmp(skb_mac_header(p),
 5764				       skb_mac_header(skb),
 5765				       maclen);
 5766		NAPI_GRO_CB(p)->same_flow = !diffs;
 5767	}
 5768
 5769	return head;
 5770}
 5771
 5772static void skb_gro_reset_offset(struct sk_buff *skb)
 5773{
 5774	const struct skb_shared_info *pinfo = skb_shinfo(skb);
 5775	const skb_frag_t *frag0 = &pinfo->frags[0];
 5776
 5777	NAPI_GRO_CB(skb)->data_offset = 0;
 5778	NAPI_GRO_CB(skb)->frag0 = NULL;
 5779	NAPI_GRO_CB(skb)->frag0_len = 0;
 5780
 5781	if (!skb_headlen(skb) && pinfo->nr_frags &&
 5782	    !PageHighMem(skb_frag_page(frag0))) {
 5783		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
 5784		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
 5785						    skb_frag_size(frag0),
 5786						    skb->end - skb->tail);
 5787	}
 5788}
 5789
 5790static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 5791{
 5792	struct skb_shared_info *pinfo = skb_shinfo(skb);
 5793
 5794	BUG_ON(skb->end - skb->tail < grow);
 5795
 5796	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
 5797
 5798	skb->data_len -= grow;
 5799	skb->tail += grow;
 5800
 5801	skb_frag_off_add(&pinfo->frags[0], grow);
 5802	skb_frag_size_sub(&pinfo->frags[0], grow);
 5803
 5804	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
 5805		skb_frag_unref(skb, 0);
 5806		memmove(pinfo->frags, pinfo->frags + 1,
 5807			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
 5808	}
 5809}
 5810
 5811static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
 5812{
 5813	struct sk_buff *oldest;
 5814
 5815	oldest = list_last_entry(head, struct sk_buff, list);
 5816
 5817	/* We are called with head length >= MAX_GRO_SKBS, so this is
 5818	 * impossible.
 5819	 */
 5820	if (WARN_ON_ONCE(!oldest))
 5821		return;
 5822
 5823	/* Do not adjust napi->gro_hash[].count, caller is adding a new
 5824	 * SKB to the chain.
 5825	 */
 5826	skb_list_del_init(oldest);
 5827	napi_gro_complete(napi, oldest);
 5828}
 5829
 5830INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
 5831							   struct sk_buff *));
 5832INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
 5833							   struct sk_buff *));
/* Offer @skb to GRO on @napi.
 *
 * Looks up the packet_offload matching skb->protocol, lets its
 * gro_receive callback try to merge @skb with the packets held in the
 * skb's hash bucket, and returns what happened:
 *
 *   GRO_NORMAL      - GRO is elided for the device, no offload matched,
 *                     or the callback asked for a flush
 *   GRO_CONSUMED    - the callback returned ERR_PTR(-EINPROGRESS)
 *   GRO_MERGED /
 *   GRO_MERGED_FREE - @skb matched a held flow (the _FREE variant when
 *                     the control block's 'free' field is set)
 *   GRO_HELD        - @skb was added to the bucket as a new flow
 *
 * Note the two different lists: 'head' is the global offload list,
 * 'gro_head' is the hash bucket holding packets for this flow hash.
 */
 5834static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 5835{
 5836	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 5837	struct list_head *head = &offload_base;
 5838	struct packet_offload *ptype;
 5839	__be16 type = skb->protocol;
 5840	struct list_head *gro_head;
 5841	struct sk_buff *pp = NULL;

 5842	enum gro_result ret;
 5843	int same_flow;
 5844	int grow;
 5845
 5846	if (netif_elide_gro(skb->dev))
 5847		goto normal;
 5848
	/* Marks same_flow on every packet already held in the bucket */
 5849	gro_head = gro_list_prepare(napi, skb);

 5850
 5851	rcu_read_lock();
 5852	list_for_each_entry_rcu(ptype, head, list) {
 5853		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5854			continue;
 5855
 5856		skb_set_network_header(skb, skb_gro_offset(skb));
 5857		skb_reset_mac_len(skb);

		/* Start the callback with a clean GRO control block */
 5858		NAPI_GRO_CB(skb)->same_flow = 0;
 5859		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
 5860		NAPI_GRO_CB(skb)->free = 0;
 5861		NAPI_GRO_CB(skb)->encap_mark = 0;
 5862		NAPI_GRO_CB(skb)->recursion_counter = 0;
 5863		NAPI_GRO_CB(skb)->is_fou = 0;
 5864		NAPI_GRO_CB(skb)->is_atomic = 1;
 5865		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 5866
 5867		/* Setup for GRO checksum validation */
 5868		switch (skb->ip_summed) {
 5869		case CHECKSUM_COMPLETE:
 5870			NAPI_GRO_CB(skb)->csum = skb->csum;
 5871			NAPI_GRO_CB(skb)->csum_valid = 1;
 5872			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5873			break;
 5874		case CHECKSUM_UNNECESSARY:
 5875			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
 5876			NAPI_GRO_CB(skb)->csum_valid = 0;
 5877			break;
 5878		default:
 5879			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5880			NAPI_GRO_CB(skb)->csum_valid = 0;
 5881		}
 5882
 5883		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
 5884					ipv6_gro_receive, inet_gro_receive,
 5885					gro_head, skb);
		/* Only the first matching offload is tried */
 5886		break;
 5887	}
 5888	rcu_read_unlock();
 5889
	/* The loop ran off the end of the list: no offload matched */
 5890	if (&ptype->list == head)
 5891		goto normal;
 5892
 5893	if (PTR_ERR(pp) == -EINPROGRESS) {
 5894		ret = GRO_CONSUMED;
 5895		goto ok;
 5896	}
 5897
 5898	same_flow = NAPI_GRO_CB(skb)->same_flow;
 5899	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 5900
	/* The callback handed back a held packet: take it off the bucket
	 * and complete it now.
	 */
 5901	if (pp) {
 5902		skb_list_del_init(pp);
 5903		napi_gro_complete(napi, pp);
 5904		napi->gro_hash[hash].count--;



 5905	}
 5906
 5907	if (same_flow)
 5908		goto ok;
 5909
 5910	if (NAPI_GRO_CB(skb)->flush)
 5911		goto normal;
 5912
	/* New flow.  A full bucket evicts its oldest packet, which keeps
	 * the count unchanged; otherwise the count grows by one.
	 */
 5913	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 5914		gro_flush_oldest(napi, gro_head);
 5915	} else {
 5916		napi->gro_hash[hash].count++;
 5917	}
 5918	NAPI_GRO_CB(skb)->count = 1;
 5919	NAPI_GRO_CB(skb)->age = jiffies;
 5920	NAPI_GRO_CB(skb)->last = skb;
 5921	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 5922	list_add(&skb->list, gro_head);

 5923	ret = GRO_HELD;
 5924
 5925pull:
	/* If the GRO offset lies beyond the linear data, copy the missing
	 * bytes from frag0 into the linear area.
	 */
 5926	grow = skb_gro_offset(skb) - skb_headlen(skb);
 5927	if (grow > 0)
 5928		gro_pull_from_frag0(skb, grow);
 5929ok:
	/* Keep gro_bitmask in step with whether this bucket holds packets */
 5930	if (napi->gro_hash[hash].count) {
 5931		if (!test_bit(hash, &napi->gro_bitmask))
 5932			__set_bit(hash, &napi->gro_bitmask);
 5933	} else if (test_bit(hash, &napi->gro_bitmask)) {
 5934		__clear_bit(hash, &napi->gro_bitmask);










 5935	}
 5936

 5937	return ret;
 5938
 5939normal:
	/* Not held by GRO: still goes through the header pull above */
 5940	ret = GRO_NORMAL;
 5941	goto pull;
 5942}
 
 5943
 5944struct packet_offload *gro_find_receive_by_type(__be16 type)
 
 5945{
 5946	struct list_head *offload_head = &offload_base;
 5947	struct packet_offload *ptype;
 5948
 5949	list_for_each_entry_rcu(ptype, offload_head, list) {
 5950		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5951			continue;
 5952		return ptype;
 5953	}
 5954	return NULL;
 5955}
 5956EXPORT_SYMBOL(gro_find_receive_by_type);
 5957
 5958struct packet_offload *gro_find_complete_by_type(__be16 type)
 5959{
 5960	struct list_head *offload_head = &offload_base;
 5961	struct packet_offload *ptype;
 5962
 5963	list_for_each_entry_rcu(ptype, offload_head, list) {
 5964		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5965			continue;
 5966		return ptype;
 
 
 5967	}
 5968	return NULL;
 5969}
 5970EXPORT_SYMBOL(gro_find_complete_by_type);
 5971
 5972static void napi_skb_free_stolen_head(struct sk_buff *skb)
 5973{
 5974	skb_dst_drop(skb);
 5975	skb_ext_put(skb);
 5976	kmem_cache_free(skbuff_head_cache, skb);
 5977}
 5978
 5979static gro_result_t napi_skb_finish(struct napi_struct *napi,
 5980				    struct sk_buff *skb,
 5981				    gro_result_t ret)
 5982{
 5983	switch (ret) {
 5984	case GRO_NORMAL:
 5985		gro_normal_one(napi, skb);
 
 5986		break;
 5987
 5988	case GRO_DROP:
 5989		kfree_skb(skb);
 5990		break;
 5991
 5992	case GRO_MERGED_FREE:
 5993		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 5994			napi_skb_free_stolen_head(skb);
 5995		else
 5996			__kfree_skb(skb);
 5997		break;
 5998
 5999	case GRO_HELD:
 6000	case GRO_MERGED:
 6001	case GRO_CONSUMED:
 6002		break;
 6003	}
 6004
 6005	return ret;
 6006}
 
 6007
 6008gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 6009{
 6010	gro_result_t ret;
 
 
 6011
 6012	skb_mark_napi_id(skb, napi);
 6013	trace_napi_gro_receive_entry(skb);
 
 
 
 
 
 
 6014
 
 
 6015	skb_gro_reset_offset(skb);
 6016
 6017	ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
 6018	trace_napi_gro_receive_exit(ret);
 6019
 6020	return ret;
 6021}
 6022EXPORT_SYMBOL(napi_gro_receive);
 6023
 6024static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 6025{
 6026	if (unlikely(skb->pfmemalloc)) {
 6027		consume_skb(skb);
 6028		return;
 6029	}
 6030	__skb_pull(skb, skb_headlen(skb));
 6031	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
 6032	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
 6033	__vlan_hwaccel_clear_tag(skb);
 6034	skb->dev = napi->dev;
 6035	skb->skb_iif = 0;
 6036
 6037	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
 6038	skb->pkt_type = PACKET_HOST;
 6039
 6040	skb->encapsulation = 0;
 6041	skb_shinfo(skb)->gso_type = 0;
 6042	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 6043	skb_ext_reset(skb);
 6044
 6045	napi->skb = skb;
 6046}
 6047
 6048struct sk_buff *napi_get_frags(struct napi_struct *napi)
 6049{
 6050	struct sk_buff *skb = napi->skb;
 6051
 6052	if (!skb) {
 6053		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
 6054		if (skb) {
 6055			napi->skb = skb;
 6056			skb_mark_napi_id(skb, napi);
 6057		}
 6058	}
 6059	return skb;
 6060}
 6061EXPORT_SYMBOL(napi_get_frags);
 6062
 6063static gro_result_t napi_frags_finish(struct napi_struct *napi,
 6064				      struct sk_buff *skb,
 6065				      gro_result_t ret)
 6066{
 6067	switch (ret) {
 6068	case GRO_NORMAL:
 6069	case GRO_HELD:
 6070		__skb_push(skb, ETH_HLEN);
 6071		skb->protocol = eth_type_trans(skb, skb->dev);
 6072		if (ret == GRO_NORMAL)
 6073			gro_normal_one(napi, skb);
 
 
 
 6074		break;
 6075
 6076	case GRO_DROP:
 
 6077		napi_reuse_skb(napi, skb);
 6078		break;
 6079
 6080	case GRO_MERGED_FREE:
 6081		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 6082			napi_skb_free_stolen_head(skb);
 6083		else
 6084			napi_reuse_skb(napi, skb);
 6085		break;
 6086
 6087	case GRO_MERGED:
 6088	case GRO_CONSUMED:
 6089		break;
 6090	}
 6091
 6092	return ret;
 6093}
 
 6094
 6095/* Upper GRO stack assumes network header starts at gro_offset=0
 6096 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 6097 * We copy ethernet header into skb->data to have a common layout.
 6098 */
 6099static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 6100{
 6101	struct sk_buff *skb = napi->skb;
 6102	const struct ethhdr *eth;
 6103	unsigned int hlen = sizeof(*eth);
 
 6104
 6105	napi->skb = NULL;
 6106
 6107	skb_reset_mac_header(skb);
 6108	skb_gro_reset_offset(skb);
 6109
 6110	if (unlikely(skb_gro_header_hard(skb, hlen))) {
 6111		eth = skb_gro_header_slow(skb, hlen, 0);
 
 
 
 6112		if (unlikely(!eth)) {
 6113			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
 6114					     __func__, napi->dev->name);
 6115			napi_reuse_skb(napi, skb);
 6116			return NULL;
 
 6117		}
 6118	} else {
 6119		eth = (const struct ethhdr *)skb->data;
 6120		gro_pull_from_frag0(skb, hlen);
 6121		NAPI_GRO_CB(skb)->frag0 += hlen;
 6122		NAPI_GRO_CB(skb)->frag0_len -= hlen;
 6123	}
 6124	__skb_pull(skb, hlen);
 
 6125
 6126	/*
 6127	 * This works because the only protocols we care about don't require
 6128	 * special handling.
 6129	 * We'll fix it up properly in napi_frags_finish()
 6130	 */
 6131	skb->protocol = eth->h_proto;
 6132
 
 6133	return skb;
 6134}
 6135
 6136gro_result_t napi_gro_frags(struct napi_struct *napi)
 6137{
 6138	gro_result_t ret;
 6139	struct sk_buff *skb = napi_frags_skb(napi);
 6140
 6141	if (!skb)
 6142		return GRO_DROP;
 6143
 6144	trace_napi_gro_frags_entry(skb);
 6145
 6146	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
 6147	trace_napi_gro_frags_exit(ret);
 6148
 6149	return ret;
 6150}
 6151EXPORT_SYMBOL(napi_gro_frags);
 6152
 6153/* Compute the checksum from gro_offset and return the folded value
 6154 * after adding in any pseudo checksum.
 6155 */
 6156__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
 6157{
 6158	__wsum wsum;
 6159	__sum16 sum;
 6160
 6161	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
 6162
 6163	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
 6164	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
 6165	/* See comments in __skb_checksum_complete(). */
 6166	if (likely(!sum)) {
 6167		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
 6168		    !skb->csum_complete_sw)
 6169			netdev_rx_csum_fault(skb->dev, skb);
 6170	}
 6171
 6172	NAPI_GRO_CB(skb)->csum = wsum;
 6173	NAPI_GRO_CB(skb)->csum_valid = 1;
 6174
 6175	return sum;
 6176}
 6177EXPORT_SYMBOL(__skb_gro_checksum_complete);
 6178
 6179static void net_rps_send_ipi(struct softnet_data *remsd)
 6180{
 6181#ifdef CONFIG_RPS
 6182	while (remsd) {
 6183		struct softnet_data *next = remsd->rps_ipi_next;
 6184
 6185		if (cpu_online(remsd->cpu))
 6186			smp_call_function_single_async(remsd->cpu, &remsd->csd);
 6187		remsd = next;
 6188	}
 6189#endif
 6190}
 6191
 6192/*
 6193 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 6194 * Note: called with local irq disabled, but exits with local irq enabled.
 6195 */
 6196static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 6197{
 6198#ifdef CONFIG_RPS
 6199	struct softnet_data *remsd = sd->rps_ipi_list;
 6200
 6201	if (remsd) {
 6202		sd->rps_ipi_list = NULL;
 6203
 6204		local_irq_enable();
 6205
 6206		/* Send pending IPI's to kick RPS processing on remote cpus. */
 6207		net_rps_send_ipi(remsd);
 
 
 
 
 
 
 
 6208	} else
 6209#endif
 6210		local_irq_enable();
 6211}
 6212
 6213static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 6214{
 6215#ifdef CONFIG_RPS
 6216	return sd->rps_ipi_list != NULL;
 6217#else
 6218	return false;
 6219#endif
 6220}
 6221
 6222static int process_backlog(struct napi_struct *napi, int quota)
 6223{
 
 6224	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 6225	bool again = true;
 6226	int work = 0;
 6227
 
 6228	/* Check if we have pending ipi, its better to send them now,
 6229	 * not waiting net_rx_action() end.
 6230	 */
 6231	if (sd_has_rps_ipi_waiting(sd)) {
 6232		local_irq_disable();
 6233		net_rps_action_and_irq_enable(sd);
 6234	}
 6235
 6236	napi->weight = dev_rx_weight;
 6237	while (again) {
 
 6238		struct sk_buff *skb;
 
 6239
 6240		while ((skb = __skb_dequeue(&sd->process_queue))) {
 6241			rcu_read_lock();
 6242			__netif_receive_skb(skb);
 6243			rcu_read_unlock();
 6244			input_queue_head_incr(sd);
 6245			if (++work >= quota)
 
 6246				return work;
 6247
 6248		}
 6249
 6250		local_irq_disable();
 6251		rps_lock(sd);
 6252		if (skb_queue_empty(&sd->input_pkt_queue)) {
 
 
 
 
 
 6253			/*
 6254			 * Inline a custom version of __napi_complete().
 6255			 * only current cpu owns and manipulates this napi,
 6256			 * and NAPI_STATE_SCHED is the only possible flag set
 6257			 * on backlog.
 6258			 * We can use a plain write instead of clear_bit(),
 6259			 * and we dont need an smp_mb() memory barrier.
 6260			 */
 
 6261			napi->state = 0;
 6262			again = false;
 6263		} else {
 6264			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 6265						   &sd->process_queue);
 6266		}
 6267		rps_unlock(sd);
 6268		local_irq_enable();
 6269	}
 
 6270
 6271	return work;
 6272}
 6273
 6274/**
 6275 * __napi_schedule - schedule for receive
 6276 * @n: entry to schedule
 6277 *
 6278 * The entry's receive function will be scheduled to run.
 6279 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 6280 */
 6281void __napi_schedule(struct napi_struct *n)
 6282{
 6283	unsigned long flags;
 6284
 6285	local_irq_save(flags);
 6286	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6287	local_irq_restore(flags);
 6288}
 6289EXPORT_SYMBOL(__napi_schedule);
 6290
 6291/**
 6292 *	napi_schedule_prep - check if napi can be scheduled
 6293 *	@n: napi context
 6294 *
 6295 * Test if NAPI routine is already running, and if not mark
 6296 * it as running.  This is used as a condition variable
 6297 * insure only one NAPI poll instance runs.  We also make
 6298 * sure there is no pending NAPI disable.
 6299 */
 6300bool napi_schedule_prep(struct napi_struct *n)
 6301{
 6302	unsigned long val, new;
 6303
 6304	do {
 6305		val = READ_ONCE(n->state);
 6306		if (unlikely(val & NAPIF_STATE_DISABLE))
 6307			return false;
 6308		new = val | NAPIF_STATE_SCHED;
 6309
 6310		/* Sets STATE_MISSED bit if STATE_SCHED was already set
 6311		 * This was suggested by Alexander Duyck, as compiler
 6312		 * emits better code than :
 6313		 * if (val & NAPIF_STATE_SCHED)
 6314		 *     new |= NAPIF_STATE_MISSED;
 6315		 */
 6316		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
 6317						   NAPIF_STATE_MISSED;
 6318	} while (cmpxchg(&n->state, val, new) != val);
 6319
 6320	return !(val & NAPIF_STATE_SCHED);
 6321}
 6322EXPORT_SYMBOL(napi_schedule_prep);
 6323
 6324/**
 6325 * __napi_schedule_irqoff - schedule for receive
 6326 * @n: entry to schedule
 6327 *
 6328 * Variant of __napi_schedule() assuming hard irqs are masked
 6329 */
 6330void __napi_schedule_irqoff(struct napi_struct *n)
 6331{
 6332	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6333}
 6334EXPORT_SYMBOL(__napi_schedule_irqoff);
 6335
 6336bool napi_complete_done(struct napi_struct *n, int work_done)
 6337{
 6338	unsigned long flags, val, new, timeout = 0;
 6339	bool ret = true;
 6340
 6341	/*
 6342	 * 1) Don't let napi dequeue from the cpu poll list
 6343	 *    just in case its running on a different cpu.
 6344	 * 2) If we are busy polling, do nothing here, we have
 6345	 *    the guarantee we will be called later.
 6346	 */
 6347	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
 6348				 NAPIF_STATE_IN_BUSY_POLL)))
 6349		return false;
 6350
 6351	if (work_done) {
 6352		if (n->gro_bitmask)
 6353			timeout = READ_ONCE(n->dev->gro_flush_timeout);
 6354		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
 6355	}
 6356	if (n->defer_hard_irqs_count > 0) {
 6357		n->defer_hard_irqs_count--;
 6358		timeout = READ_ONCE(n->dev->gro_flush_timeout);
 6359		if (timeout)
 6360			ret = false;
 6361	}
 6362	if (n->gro_bitmask) {
 6363		/* When the NAPI instance uses a timeout and keeps postponing
 6364		 * it, we need to bound somehow the time packets are kept in
 6365		 * the GRO layer
 6366		 */
 6367		napi_gro_flush(n, !!timeout);
 6368	}
 6369
 6370	gro_normal_list(n);
 6371
 6372	if (unlikely(!list_empty(&n->poll_list))) {
 6373		/* If n->poll_list is not empty, we need to mask irqs */
 6374		local_irq_save(flags);
 6375		list_del_init(&n->poll_list);
 6376		local_irq_restore(flags);
 6377	}
 6378
 6379	do {
 6380		val = READ_ONCE(n->state);
 6381
 6382		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
 6383
 6384		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
 6385
 6386		/* If STATE_MISSED was set, leave STATE_SCHED set,
 6387		 * because we will call napi->poll() one more time.
 6388		 * This C code was suggested by Alexander Duyck to help gcc.
 6389		 */
 6390		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
 6391						    NAPIF_STATE_SCHED;
 6392	} while (cmpxchg(&n->state, val, new) != val);
 6393
 6394	if (unlikely(val & NAPIF_STATE_MISSED)) {
 6395		__napi_schedule(n);
 6396		return false;
 6397	}
 6398
 6399	if (timeout)
 6400		hrtimer_start(&n->timer, ns_to_ktime(timeout),
 6401			      HRTIMER_MODE_REL_PINNED);
 6402	return ret;
 6403}
 6404EXPORT_SYMBOL(napi_complete_done);
 6405
 6406/* must be called under rcu_read_lock(), as we dont take a reference */
 6407static struct napi_struct *napi_by_id(unsigned int napi_id)
 6408{
 6409	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 6410	struct napi_struct *napi;
 6411
 6412	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
 6413		if (napi->napi_id == napi_id)
 6414			return napi;
 6415
 6416	return NULL;
 6417}
 6418
 6419#if defined(CONFIG_NET_RX_BUSY_POLL)
 6420
 6421#define BUSY_POLL_BUDGET 8
 6422
 6423static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 6424{
 6425	int rc;
 6426
 6427	/* Busy polling means there is a high chance device driver hard irq
 6428	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
 6429	 * set in napi_schedule_prep().
 6430	 * Since we are about to call napi->poll() once more, we can safely
 6431	 * clear NAPI_STATE_MISSED.
 6432	 *
 6433	 * Note: x86 could use a single "lock and ..." instruction
 6434	 * to perform these two clear_bit()
 6435	 */
 6436	clear_bit(NAPI_STATE_MISSED, &napi->state);
 6437	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 6438
 6439	local_bh_disable();
 6440
 6441	/* All we really want here is to re-enable device interrupts.
 6442	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
 6443	 */
 6444	rc = napi->poll(napi, BUSY_POLL_BUDGET);
 6445	/* We can't gro_normal_list() here, because napi->poll() might have
 6446	 * rearmed the napi (napi_complete_done()) in which case it could
 6447	 * already be running on another CPU.
 6448	 */
 6449	trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
 6450	netpoll_poll_unlock(have_poll_lock);
 6451	if (rc == BUSY_POLL_BUDGET) {
 6452		/* As the whole budget was spent, we still own the napi so can
 6453		 * safely handle the rx_list.
 6454		 */
 6455		gro_normal_list(napi);
 6456		__napi_schedule(napi);
 6457	}
 6458	local_bh_enable();
 6459}
 6460
 6461void napi_busy_loop(unsigned int napi_id,
 6462		    bool (*loop_end)(void *, unsigned long),
 6463		    void *loop_end_arg)
 6464{
 6465	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 6466	int (*napi_poll)(struct napi_struct *napi, int budget);
 6467	void *have_poll_lock = NULL;
 6468	struct napi_struct *napi;
 6469
 6470restart:
 6471	napi_poll = NULL;
 6472
 6473	rcu_read_lock();
 6474
 6475	napi = napi_by_id(napi_id);
 6476	if (!napi)
 6477		goto out;
 6478
 6479	preempt_disable();
 6480	for (;;) {
 6481		int work = 0;
 6482
 6483		local_bh_disable();
 6484		if (!napi_poll) {
 6485			unsigned long val = READ_ONCE(napi->state);
 6486
 6487			/* If multiple threads are competing for this napi,
 6488			 * we avoid dirtying napi->state as much as we can.
 6489			 */
 6490			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
 6491				   NAPIF_STATE_IN_BUSY_POLL))
 6492				goto count;
 6493			if (cmpxchg(&napi->state, val,
 6494				    val | NAPIF_STATE_IN_BUSY_POLL |
 6495					  NAPIF_STATE_SCHED) != val)
 6496				goto count;
 6497			have_poll_lock = netpoll_poll_lock(napi);
 6498			napi_poll = napi->poll;
 6499		}
 6500		work = napi_poll(napi, BUSY_POLL_BUDGET);
 6501		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
 6502		gro_normal_list(napi);
 6503count:
 6504		if (work > 0)
 6505			__NET_ADD_STATS(dev_net(napi->dev),
 6506					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 6507		local_bh_enable();
 6508
 6509		if (!loop_end || loop_end(loop_end_arg, start_time))
 6510			break;
 6511
 6512		if (unlikely(need_resched())) {
 6513			if (napi_poll)
 6514				busy_poll_stop(napi, have_poll_lock);
 6515			preempt_enable();
 6516			rcu_read_unlock();
 6517			cond_resched();
 6518			if (loop_end(loop_end_arg, start_time))
 6519				return;
 6520			goto restart;
 6521		}
 6522		cpu_relax();
 6523	}
 6524	if (napi_poll)
 6525		busy_poll_stop(napi, have_poll_lock);
 6526	preempt_enable();
 6527out:
 6528	rcu_read_unlock();
 6529}
 6530EXPORT_SYMBOL(napi_busy_loop);
 6531
 6532#endif /* CONFIG_NET_RX_BUSY_POLL */
 6533
 6534static void napi_hash_add(struct napi_struct *napi)
 6535{
 6536	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
 6537	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
 6538		return;
 6539
 6540	spin_lock(&napi_hash_lock);
 6541
 6542	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6543	do {
 6544		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6545			napi_gen_id = MIN_NAPI_ID;
 6546	} while (napi_by_id(napi_gen_id));
 6547	napi->napi_id = napi_gen_id;
 6548
 6549	hlist_add_head_rcu(&napi->napi_hash_node,
 6550			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6551
 6552	spin_unlock(&napi_hash_lock);
 6553}
 6554
 6555/* Warning : caller is responsible to make sure rcu grace period
 6556 * is respected before freeing memory containing @napi
 6557 */
 6558bool napi_hash_del(struct napi_struct *napi)
 6559{
 6560	bool rcu_sync_needed = false;
 6561
 6562	spin_lock(&napi_hash_lock);
 6563
 6564	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
 6565		rcu_sync_needed = true;
 6566		hlist_del_rcu(&napi->napi_hash_node);
 6567	}
 6568	spin_unlock(&napi_hash_lock);
 6569	return rcu_sync_needed;
 6570}
 6571EXPORT_SYMBOL_GPL(napi_hash_del);
 6572
 6573static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6574{
 6575	struct napi_struct *napi;
 6576
 6577	napi = container_of(timer, struct napi_struct, timer);
 6578
 6579	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6580	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6581	 */
 6582	if (!napi_disable_pending(napi) &&
 6583	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 6584		__napi_schedule_irqoff(napi);
 6585
 6586	return HRTIMER_NORESTART;
 6587}
 6588
 6589static void init_gro_hash(struct napi_struct *napi)
 6590{
 6591	int i;
 6592
 6593	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6594		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6595		napi->gro_hash[i].count = 0;
 6596	}
 6597	napi->gro_bitmask = 0;
 6598}
 
 6599
 6600void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 6601		    int (*poll)(struct napi_struct *, int), int weight)
 6602{
 6603	INIT_LIST_HEAD(&napi->poll_list);
 6604	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 6605	napi->timer.function = napi_watchdog;
 6606	init_gro_hash(napi);
 6607	napi->skb = NULL;
 6608	INIT_LIST_HEAD(&napi->rx_list);
 6609	napi->rx_count = 0;
 6610	napi->poll = poll;
 6611	if (weight > NAPI_POLL_WEIGHT)
 6612		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 6613				weight);
 6614	napi->weight = weight;
 
 6615	napi->dev = dev;
 6616#ifdef CONFIG_NETPOLL
 
 6617	napi->poll_owner = -1;
 6618#endif
 6619	set_bit(NAPI_STATE_SCHED, &napi->state);
 6620	set_bit(NAPI_STATE_NPSVC, &napi->state);
 6621	list_add_rcu(&napi->dev_list, &dev->napi_list);
 6622	napi_hash_add(napi);
 6623}
 6624EXPORT_SYMBOL(netif_napi_add);
 6625
 6626void napi_disable(struct napi_struct *n)
 6627{
 6628	might_sleep();
 6629	set_bit(NAPI_STATE_DISABLE, &n->state);
 6630
 6631	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
 6632		msleep(1);
 6633	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
 6634		msleep(1);
 6635
 6636	hrtimer_cancel(&n->timer);
 6637
 6638	clear_bit(NAPI_STATE_DISABLE, &n->state);
 6639}
 6640EXPORT_SYMBOL(napi_disable);
 6641
 6642static void flush_gro_hash(struct napi_struct *napi)
 6643{
 6644	int i;
 6645
 6646	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6647		struct sk_buff *skb, *n;
 6648
 6649		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6650			kfree_skb(skb);
 6651		napi->gro_hash[i].count = 0;
 6652	}
 6653}
 6654
 6655/* Must be called in process context */
 6656void netif_napi_del(struct napi_struct *napi)
 6657{
 6658	might_sleep();
 6659	if (napi_hash_del(napi))
 6660		synchronize_net();
 6661	list_del_init(&napi->dev_list);
 6662	napi_free_frags(napi);
 6663
 6664	flush_gro_hash(napi);
 6665	napi->gro_bitmask = 0;
 6666}
 6667EXPORT_SYMBOL(netif_napi_del);
 6668
 6669static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 6670{
 6671	void *have;
 6672	int work, weight;
 6673
 6674	list_del_init(&n->poll_list);
 6675
 6676	have = netpoll_poll_lock(n);
 6677
 6678	weight = n->weight;
 6679
 6680	/* This NAPI_STATE_SCHED test is for avoiding a race
 6681	 * with netpoll's poll_napi().  Only the entity which
 6682	 * obtains the lock and sees NAPI_STATE_SCHED set will
 6683	 * actually make the ->poll() call.  Therefore we avoid
 6684	 * accidentally calling ->poll() when NAPI is not scheduled.
 6685	 */
 6686	work = 0;
 6687	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 6688		work = n->poll(n, weight);
 6689		trace_napi_poll(n, work, weight);
 6690	}
 6691
 6692	if (unlikely(work > weight))
 6693		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
 6694			    n->poll, work, weight);
 6695
 6696	if (likely(work < weight))
 6697		goto out_unlock;
 6698
 6699	/* Drivers must not modify the NAPI state if they
 6700	 * consume the entire weight.  In such cases this code
 6701	 * still "owns" the NAPI instance and therefore can
 6702	 * move the instance around on the list at-will.
 6703	 */
 6704	if (unlikely(napi_disable_pending(n))) {
 6705		napi_complete(n);
 6706		goto out_unlock;
 6707	}
 6708
 6709	if (n->gro_bitmask) {
 6710		/* flush too old packets
 6711		 * If HZ < 1000, flush all packets.
 6712		 */
 6713		napi_gro_flush(n, HZ >= 1000);
 6714	}
 6715
 6716	gro_normal_list(n);
 6717
 6718	/* Some drivers may have called napi_schedule
 6719	 * prior to exhausting their budget.
 6720	 */
 6721	if (unlikely(!list_empty(&n->poll_list))) {
 6722		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
 6723			     n->dev ? n->dev->name : "backlog");
 6724		goto out_unlock;
 6725	}
 6726
 6727	list_add_tail(&n->poll_list, repoll);
 6728
 6729out_unlock:
 6730	netpoll_poll_unlock(have);
 6731
 6732	return work;
 6733}
 
 6734
 6735static __latent_entropy void net_rx_action(struct softirq_action *h)
 6736{
 6737	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 6738	unsigned long time_limit = jiffies +
 6739		usecs_to_jiffies(netdev_budget_usecs);
 6740	int budget = netdev_budget;
 6741	LIST_HEAD(list);
 6742	LIST_HEAD(repoll);
 6743
 6744	local_irq_disable();
 6745	list_splice_init(&sd->poll_list, &list);
 6746	local_irq_enable();
 6747
 6748	for (;;) {
 6749		struct napi_struct *n;
 
 6750
 6751		if (list_empty(&list)) {
 6752			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 6753				goto out;
 6754			break;
 6755		}
 6756
 6757		n = list_first_entry(&list, struct napi_struct, poll_list);
 6758		budget -= napi_poll(n, &repoll);
 6759
 6760		/* If softirq window is exhausted then punt.
 6761		 * Allow this to run for 2 jiffies since which will allow
 6762		 * an average latency of 1.5/HZ.
 6763		 */
 6764		if (unlikely(budget <= 0 ||
 6765			     time_after_eq(jiffies, time_limit))) {
 6766			sd->time_squeeze++;
 6767			break;
 6768		}
 6769	}
 6770
 6771	local_irq_disable();
 6772
 6773	list_splice_tail_init(&sd->poll_list, &list);
 6774	list_splice_tail(&repoll, &list);
 6775	list_splice(&list, &sd->poll_list);
 6776	if (!list_empty(&sd->poll_list))
 6777		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 6778
 6779	net_rps_action_and_irq_enable(sd);
 6780out:
 6781	__kfree_skb_flush();
 6782}
 6783
 6784struct netdev_adjacent {
 6785	struct net_device *dev;
 6786
 6787	/* upper master flag, there can only be one master device per list */
 6788	bool master;
 
 
 
 
 
 
 
 
 
 6789
 6790	/* lookup ignore flag */
 6791	bool ignore;
 6792
 6793	/* counter for the number of times this device was added to us */
 6794	u16 ref_nr;
 6795
 6796	/* private field for the users */
 6797	void *private;
 6798
 6799	struct list_head list;
 6800	struct rcu_head rcu;
 6801};
 
 
 
 
 
 
 
 
 
 
 6802
 6803static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 6804						 struct list_head *adj_list)
 6805{
 6806	struct netdev_adjacent *adj;
 6807
 6808	list_for_each_entry(adj, adj_list, list) {
 6809		if (adj->dev == adj_dev)
 6810			return adj;
 6811	}
 6812	return NULL;
 6813}
 
 6814
 6815static int ____netdev_has_upper_dev(struct net_device *upper_dev,
 6816				    struct netdev_nested_priv *priv)
 6817{
 6818	struct net_device *dev = (struct net_device *)priv->data;
 6819
 6820	return upper_dev == dev;
 
 
 
 6821}
 6822
 
 
 6823/**
 6824 * netdev_has_upper_dev - Check if device is linked to an upper device
 6825 * @dev: device
 6826 * @upper_dev: upper device to check
 6827 *
 6828 * Find out if a device is linked to specified upper device and return true
 6829 * in case it is. Note that this checks only immediate upper device,
 6830 * not through a complete stack of devices. The caller must hold the RTNL lock.
 6831 */
 6832bool netdev_has_upper_dev(struct net_device *dev,
 6833			  struct net_device *upper_dev)
 6834{
 6835	struct netdev_nested_priv priv = {
 6836		.data = (void *)upper_dev,
 6837	};
 6838
 6839	ASSERT_RTNL();
 6840
 6841	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6842					     &priv);
 6843}
 6844EXPORT_SYMBOL(netdev_has_upper_dev);
 6845
 6846/**
 6847 * netdev_has_upper_dev_all - Check if device is linked to an upper device
 6848 * @dev: device
 6849 * @upper_dev: upper device to check
 6850 *
 6851 * Find out if a device is linked to specified upper device and return true
 6852 * in case it is. Note that this checks the entire upper device chain.
 6853 * The caller must hold rcu lock.
 6854 */
 6855
 6856bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 6857				  struct net_device *upper_dev)
 6858{
 6859	struct netdev_nested_priv priv = {
 6860		.data = (void *)upper_dev,
 6861	};
 6862
 6863	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6864					       &priv);
 6865}
 6866EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 6867
 6868/**
 6869 * netdev_has_any_upper_dev - Check if device is linked to some device
 6870 * @dev: device
 6871 *
 6872 * Find out if a device is linked to an upper device and return true in case
 6873 * it is. The caller must hold the RTNL lock.
 6874 */
 6875bool netdev_has_any_upper_dev(struct net_device *dev)
 6876{
 6877	ASSERT_RTNL();
 6878
 6879	return !list_empty(&dev->adj_list.upper);
 6880}
 6881EXPORT_SYMBOL(netdev_has_any_upper_dev);
 6882
 6883/**
 6884 * netdev_master_upper_dev_get - Get master upper device
 6885 * @dev: device
 6886 *
 6887 * Find a master upper device and return pointer to it or NULL in case
 6888 * it's not there. The caller must hold the RTNL lock.
 6889 */
 6890struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 6891{
 6892	struct netdev_adjacent *upper;
 6893
 6894	ASSERT_RTNL();
 6895
 6896	if (list_empty(&dev->adj_list.upper))
 6897		return NULL;
 6898
 6899	upper = list_first_entry(&dev->adj_list.upper,
 6900				 struct netdev_adjacent, list);
 6901	if (likely(upper->master))
 6902		return upper->dev;
 6903	return NULL;
 6904}
 6905EXPORT_SYMBOL(netdev_master_upper_dev_get);
 6906
 6907static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 6908{
 6909	struct netdev_adjacent *upper;
 
 6910
 6911	ASSERT_RTNL();
 
 
 6912
 6913	if (list_empty(&dev->adj_list.upper))
 6914		return NULL;
 6915
 6916	upper = list_first_entry(&dev->adj_list.upper,
 6917				 struct netdev_adjacent, list);
 6918	if (likely(upper->master) && !upper->ignore)
 6919		return upper->dev;
 6920	return NULL;
 6921}
 6922
 6923/**
 6924 * netdev_has_any_lower_dev - Check if device is linked to some device
 6925 * @dev: device
 6926 *
 6927 * Find out if a device is linked to a lower device and return true in case
 6928 * it is. The caller must hold the RTNL lock.
 6929 */
 6930static bool netdev_has_any_lower_dev(struct net_device *dev)
 6931{
 6932	ASSERT_RTNL();
 6933
 6934	return !list_empty(&dev->adj_list.lower);
 
 
 6935}
 6936
 6937void *netdev_adjacent_get_private(struct list_head *adj_list)
 6938{
 6939	struct netdev_adjacent *adj;
 6940
 6941	adj = list_entry(adj_list, struct netdev_adjacent, list);
 6942
 6943	return adj->private;
 6944}
 6945EXPORT_SYMBOL(netdev_adjacent_get_private);
 6946
 6947/**
 6948 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 6949 * @dev: device
 6950 * @iter: list_head ** of the current position
 6951 *
 6952 * Gets the next device from the dev's upper list, starting from iter
 6953 * position. The caller must hold RCU read lock.
 6954 */
 6955struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 6956						 struct list_head **iter)
 6957{
 6958	struct netdev_adjacent *upper;
 6959
 6960	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6961
 6962	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6963
 6964	if (&upper->list == &dev->adj_list.upper)
 6965		return NULL;
 6966
 6967	*iter = &upper->list;
 6968
 6969	return upper->dev;
 6970}
 6971EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 6972
 6973static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 6974						  struct list_head **iter,
 6975						  bool *ignore)
 6976{
 6977	struct netdev_adjacent *upper;
 
 
 
 
 
 6978
 6979	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 
 
 6980
 6981	if (&upper->list == &dev->adj_list.upper)
 6982		return NULL;
 6983
 6984	*iter = &upper->list;
 6985	*ignore = upper->ignore;
 6986
 6987	return upper->dev;
 6988}
 
 6989
 6990static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 6991						    struct list_head **iter)
 6992{
 6993	struct netdev_adjacent *upper;
 6994
 6995	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6996
 6997	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6998
 6999	if (&upper->list == &dev->adj_list.upper)
 7000		return NULL;
 7001
 7002	*iter = &upper->list;
 7003
 7004	return upper->dev;
 7005}
 7006
 7007static int __netdev_walk_all_upper_dev(struct net_device *dev,
 7008				       int (*fn)(struct net_device *dev,
 7009					 struct netdev_nested_priv *priv),
 7010				       struct netdev_nested_priv *priv)
 7011{
 7012	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7013	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7014	int ret, cur = 0;
 7015	bool ignore;
 7016
 7017	now = dev;
 7018	iter = &dev->adj_list.upper;
 7019
 7020	while (1) {
 7021		if (now != dev) {
 7022			ret = fn(now, priv);
 7023			if (ret)
 7024				return ret;
 7025		}
 
 7026
 7027		next = NULL;
 7028		while (1) {
 7029			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 7030			if (!udev)
 7031				break;
 7032			if (ignore)
 7033				continue;
 7034
 7035			next = udev;
 7036			niter = &udev->adj_list.upper;
 7037			dev_stack[cur] = now;
 7038			iter_stack[cur++] = iter;
 7039			break;
 7040		}
 7041
 7042		if (!next) {
 7043			if (!cur)
 7044				return 0;
 7045			next = dev_stack[--cur];
 7046			niter = iter_stack[cur];
 7047		}
 7048
 7049		now = next;
 7050		iter = niter;
 7051	}
 7052
 7053	return 0;
 7054}
 7055
 7056int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 7057				  int (*fn)(struct net_device *dev,
 7058					    struct netdev_nested_priv *priv),
 7059				  struct netdev_nested_priv *priv)
 7060{
 7061	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7062	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7063	int ret, cur = 0;
 7064
 7065	now = dev;
 7066	iter = &dev->adj_list.upper;
 7067
 7068	while (1) {
 7069		if (now != dev) {
 7070			ret = fn(now, priv);
 7071			if (ret)
 7072				return ret;
 7073		}
 7074
 7075		next = NULL;
 7076		while (1) {
 7077			udev = netdev_next_upper_dev_rcu(now, &iter);
 7078			if (!udev)
 7079				break;
 7080
 7081			next = udev;
 7082			niter = &udev->adj_list.upper;
 7083			dev_stack[cur] = now;
 7084			iter_stack[cur++] = iter;
 7085			break;
 7086		}
 7087
 7088		if (!next) {
 7089			if (!cur)
 7090				return 0;
 7091			next = dev_stack[--cur];
 7092			niter = iter_stack[cur];
 7093		}
 7094
 7095		now = next;
 7096		iter = niter;
 
 
 
 
 
 
 
 
 
 
 7097	}
 7098
 7099	return 0;
 7100}
 7101EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 7102
 7103static bool __netdev_has_upper_dev(struct net_device *dev,
 7104				   struct net_device *upper_dev)
 7105{
 7106	struct netdev_nested_priv priv = {
 7107		.flags = 0,
 7108		.data = (void *)upper_dev,
 7109	};
 7110
 7111	ASSERT_RTNL();
 
 
 
 7112
 7113	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 7114					   &priv);
 7115}
 7116
 7117/**
 7118 * netdev_lower_get_next_private - Get the next ->private from the
 7119 *				   lower neighbour list
 7120 * @dev: device
 7121 * @iter: list_head ** of the current position
 7122 *
 7123 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7124 * list, starting from iter position. The caller must hold either hold the
 7125 * RTNL lock or its own locking that guarantees that the neighbour lower
 7126 * list will remain unchanged.
 7127 */
 7128void *netdev_lower_get_next_private(struct net_device *dev,
 7129				    struct list_head **iter)
 7130{
 7131	struct netdev_adjacent *lower;
 7132
 7133	lower = list_entry(*iter, struct netdev_adjacent, list);
 7134
 7135	if (&lower->list == &dev->adj_list.lower)
 7136		return NULL;
 7137
 7138	*iter = lower->list.next;
 7139
 7140	return lower->private;
 7141}
 7142EXPORT_SYMBOL(netdev_lower_get_next_private);
 7143
 7144/**
 7145 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 7146 *				       lower neighbour list, RCU
 7147 *				       variant
 7148 * @dev: device
 7149 * @iter: list_head ** of the current position
 7150 *
 7151 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7152 * list, starting from iter position. The caller must hold RCU read lock.
 7153 */
 7154void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 7155					struct list_head **iter)
 7156{
 7157	struct netdev_adjacent *lower;
 7158
 7159	WARN_ON_ONCE(!rcu_read_lock_held());
 7160
 7161	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7162
 7163	if (&lower->list == &dev->adj_list.lower)
 7164		return NULL;
 7165
 7166	*iter = &lower->list;
 7167
 7168	return lower->private;
 7169}
 7170EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 7171
 7172/**
 7173 * netdev_lower_get_next - Get the next device from the lower neighbour
 7174 *                         list
 7175 * @dev: device
 7176 * @iter: list_head ** of the current position
 7177 *
 7178 * Gets the next netdev_adjacent from the dev's lower neighbour
 7179 * list, starting from iter position. The caller must hold RTNL lock or
 7180 * its own locking that guarantees that the neighbour lower
 7181 * list will remain unchanged.
 7182 */
 7183void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 7184{
 7185	struct netdev_adjacent *lower;
 7186
 7187	lower = list_entry(*iter, struct netdev_adjacent, list);
 7188
 7189	if (&lower->list == &dev->adj_list.lower)
 7190		return NULL;
 7191
 7192	*iter = lower->list.next;
 7193
 7194	return lower->dev;
 7195}
 7196EXPORT_SYMBOL(netdev_lower_get_next);
 7197
 7198static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 7199						struct list_head **iter)
 7200{
 7201	struct netdev_adjacent *lower;
 7202
 7203	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7204
 7205	if (&lower->list == &dev->adj_list.lower)
 7206		return NULL;
 7207
 7208	*iter = &lower->list;
 7209
 7210	return lower->dev;
 7211}
 7212
 7213static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 7214						  struct list_head **iter,
 7215						  bool *ignore)
 7216{
 7217	struct netdev_adjacent *lower;
 7218
 7219	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7220
 7221	if (&lower->list == &dev->adj_list.lower)
 7222		return NULL;
 7223
 7224	*iter = &lower->list;
 7225	*ignore = lower->ignore;
 7226
 7227	return lower->dev;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 7228}
 7229
 7230int netdev_walk_all_lower_dev(struct net_device *dev,
 7231			      int (*fn)(struct net_device *dev,
 7232					struct netdev_nested_priv *priv),
 7233			      struct netdev_nested_priv *priv)
 
 7234{
 7235	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7236	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7237	int ret, cur = 0;
 7238
 7239	now = dev;
 7240	iter = &dev->adj_list.lower;
 7241
 7242	while (1) {
 7243		if (now != dev) {
 7244			ret = fn(now, priv);
 7245			if (ret)
 7246				return ret;
 7247		}
 7248
 7249		next = NULL;
 7250		while (1) {
 7251			ldev = netdev_next_lower_dev(now, &iter);
 7252			if (!ldev)
 7253				break;
 7254
 7255			next = ldev;
 7256			niter = &ldev->adj_list.lower;
 7257			dev_stack[cur] = now;
 7258			iter_stack[cur++] = iter;
 7259			break;
 7260		}
 7261
 7262		if (!next) {
 7263			if (!cur)
 7264				return 0;
 7265			next = dev_stack[--cur];
 7266			niter = iter_stack[cur];
 7267		}
 7268
 7269		now = next;
 7270		iter = niter;
 7271	}
 7272
 7273	return 0;
 7274}
 7275EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 7276
 7277static int __netdev_walk_all_lower_dev(struct net_device *dev,
 7278				       int (*fn)(struct net_device *dev,
 7279					 struct netdev_nested_priv *priv),
 7280				       struct netdev_nested_priv *priv)
 7281{
 7282	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7283	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7284	int ret, cur = 0;
 7285	bool ignore;
 7286
 7287	now = dev;
 7288	iter = &dev->adj_list.lower;
 7289
 7290	while (1) {
 7291		if (now != dev) {
 7292			ret = fn(now, priv);
 7293			if (ret)
 7294				return ret;
 7295		}
 7296
 7297		next = NULL;
 7298		while (1) {
 7299			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 7300			if (!ldev)
 7301				break;
 7302			if (ignore)
 7303				continue;
 7304
 7305			next = ldev;
 7306			niter = &ldev->adj_list.lower;
 7307			dev_stack[cur] = now;
 7308			iter_stack[cur++] = iter;
 7309			break;
 7310		}
 7311
 7312		if (!next) {
 7313			if (!cur)
 7314				return 0;
 7315			next = dev_stack[--cur];
 7316			niter = iter_stack[cur];
 7317		}
 7318
 7319		now = next;
 7320		iter = niter;
 7321	}
 7322
 7323	return 0;
 7324}
 7325
 7326struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 7327					     struct list_head **iter)
 7328{
 7329	struct netdev_adjacent *lower;
 7330
 7331	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7332	if (&lower->list == &dev->adj_list.lower)
 7333		return NULL;
 7334
 7335	*iter = &lower->list;
 7336
 7337	return lower->dev;
 7338}
 7339EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
 7340
 7341static u8 __netdev_upper_depth(struct net_device *dev)
 7342{
 7343	struct net_device *udev;
 7344	struct list_head *iter;
 7345	u8 max_depth = 0;
 7346	bool ignore;
 7347
 7348	for (iter = &dev->adj_list.upper,
 7349	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 7350	     udev;
 7351	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 7352		if (ignore)
 7353			continue;
 7354		if (max_depth < udev->upper_level)
 7355			max_depth = udev->upper_level;
 7356	}
 7357
 7358	return max_depth;
 7359}
 7360
 7361static u8 __netdev_lower_depth(struct net_device *dev)
 7362{
 7363	struct net_device *ldev;
 7364	struct list_head *iter;
 7365	u8 max_depth = 0;
 7366	bool ignore;
 7367
 7368	for (iter = &dev->adj_list.lower,
 7369	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 7370	     ldev;
 7371	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 7372		if (ignore)
 7373			continue;
 7374		if (max_depth < ldev->lower_level)
 7375			max_depth = ldev->lower_level;
 7376	}
 7377
 7378	return max_depth;
 7379}
 7380
 7381static int __netdev_update_upper_level(struct net_device *dev,
 7382				       struct netdev_nested_priv *__unused)
 7383{
 7384	dev->upper_level = __netdev_upper_depth(dev) + 1;
 7385	return 0;
 7386}
 7387
 7388static int __netdev_update_lower_level(struct net_device *dev,
 7389				       struct netdev_nested_priv *priv)
 7390{
 7391	dev->lower_level = __netdev_lower_depth(dev) + 1;
 7392
 7393#ifdef CONFIG_LOCKDEP
 7394	if (!priv)
 7395		return 0;
 7396
 7397	if (priv->flags & NESTED_SYNC_IMM)
 7398		dev->nested_level = dev->lower_level - 1;
 7399	if (priv->flags & NESTED_SYNC_TODO)
 7400		net_unlink_todo(dev);
 7401#endif
 7402	return 0;
 7403}
 7404
 7405int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7406				  int (*fn)(struct net_device *dev,
 7407					    struct netdev_nested_priv *priv),
 7408				  struct netdev_nested_priv *priv)
 7409{
 7410	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7411	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7412	int ret, cur = 0;
 7413
 7414	now = dev;
 7415	iter = &dev->adj_list.lower;
 7416
 7417	while (1) {
 7418		if (now != dev) {
 7419			ret = fn(now, priv);
 7420			if (ret)
 7421				return ret;
 7422		}
 7423
 7424		next = NULL;
 7425		while (1) {
 7426			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7427			if (!ldev)
 7428				break;
 7429
 7430			next = ldev;
 7431			niter = &ldev->adj_list.lower;
 7432			dev_stack[cur] = now;
 7433			iter_stack[cur++] = iter;
 7434			break;
 7435		}
 7436
 7437		if (!next) {
 7438			if (!cur)
 7439				return 0;
 7440			next = dev_stack[--cur];
 7441			niter = iter_stack[cur];
 7442		}
 7443
 7444		now = next;
 7445		iter = niter;
 7446	}
 7447
 7448	return 0;
 7449}
 7450EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7451
 7452/**
 7453 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7454 *				       lower neighbour list, RCU
 7455 *				       variant
 7456 * @dev: device
 7457 *
 7458 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7459 * list. The caller must hold RCU read lock.
 7460 */
 7461void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7462{
 7463	struct netdev_adjacent *lower;
 7464
 7465	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7466			struct netdev_adjacent, list);
 7467	if (lower)
 7468		return lower->private;
 7469	return NULL;
 7470}
 7471EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7472
 7473/**
 7474 * netdev_master_upper_dev_get_rcu - Get master upper device
 7475 * @dev: device
 7476 *
 7477 * Find a master upper device and return pointer to it or NULL in case
 7478 * it's not there. The caller must hold the RCU read lock.
 7479 */
 7480struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7481{
 7482	struct netdev_adjacent *upper;
 7483
 7484	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7485				       struct netdev_adjacent, list);
 7486	if (upper && likely(upper->master))
 7487		return upper->dev;
 7488	return NULL;
 7489}
 7490EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7491
 7492static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7493			      struct net_device *adj_dev,
 7494			      struct list_head *dev_list)
 7495{
 7496	char linkname[IFNAMSIZ+7];
 
 
 7497
 7498	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7499		"upper_%s" : "lower_%s", adj_dev->name);
 7500	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7501				 linkname);
 7502}
 7503static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7504			       char *name,
 7505			       struct list_head *dev_list)
 7506{
 7507	char linkname[IFNAMSIZ+7];
 7508
 7509	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7510		"upper_%s" : "lower_%s", name);
 7511	sysfs_remove_link(&(dev->dev.kobj), linkname);
 
 
 
 
 
 
 
 
 
 
 
 
 
 7512}
 7513
 7514static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7515						 struct net_device *adj_dev,
 7516						 struct list_head *dev_list)
 7517{
 7518	return (dev_list == &dev->adj_list.upper ||
 7519		dev_list == &dev->adj_list.lower) &&
 7520		net_eq(dev_net(dev), dev_net(adj_dev));
 7521}
 7522
 7523static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7524					struct net_device *adj_dev,
 7525					struct list_head *dev_list,
 7526					void *private, bool master)
 7527{
 7528	struct netdev_adjacent *adj;
 7529	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 7530
 7531	adj = __netdev_find_adj(adj_dev, dev_list);
 7532
 7533	if (adj) {
 7534		adj->ref_nr += 1;
 7535		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7536			 dev->name, adj_dev->name, adj->ref_nr);
 7537
 7538		return 0;
 7539	}
 7540
 7541	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7542	if (!adj)
 7543		return -ENOMEM;
 7544
 7545	adj->dev = adj_dev;
 7546	adj->master = master;
 7547	adj->ref_nr = 1;
 7548	adj->private = private;
 7549	adj->ignore = false;
 7550	dev_hold(adj_dev);
 7551
 7552	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7553		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7554
 7555	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7556		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7557		if (ret)
 7558			goto free_adj;
 7559	}
 7560
 7561	/* Ensure that master link is always the first item in list. */
 7562	if (master) {
 7563		ret = sysfs_create_link(&(dev->dev.kobj),
 7564					&(adj_dev->dev.kobj), "master");
 7565		if (ret)
 7566			goto remove_symlinks;
 7567
 7568		list_add_rcu(&adj->list, dev_list);
 7569	} else {
 7570		list_add_tail_rcu(&adj->list, dev_list);
 7571	}
 7572
 7573	return 0;
 7574
 7575remove_symlinks:
 7576	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7577		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7578free_adj:
 7579	kfree(adj);
 7580	dev_put(adj_dev);
 7581
 7582	return ret;
 7583}
 7584
 7585static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7586					 struct net_device *adj_dev,
 7587					 u16 ref_nr,
 7588					 struct list_head *dev_list)
 7589{
 7590	struct netdev_adjacent *adj;
 7591
 7592	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7593		 dev->name, adj_dev->name, ref_nr);
 7594
 7595	adj = __netdev_find_adj(adj_dev, dev_list);
 7596
 7597	if (!adj) {
 7598		pr_err("Adjacency does not exist for device %s from %s\n",
 7599		       dev->name, adj_dev->name);
 7600		WARN_ON(1);
 7601		return;
 7602	}
 7603
 7604	if (adj->ref_nr > ref_nr) {
 7605		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7606			 dev->name, adj_dev->name, ref_nr,
 7607			 adj->ref_nr - ref_nr);
 7608		adj->ref_nr -= ref_nr;
 7609		return;
 7610	}
 7611
 7612	if (adj->master)
 7613		sysfs_remove_link(&(dev->dev.kobj), "master");
 7614
 7615	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7616		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7617
 7618	list_del_rcu(&adj->list);
 7619	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 7620		 adj_dev->name, dev->name, adj_dev->name);
 7621	dev_put(adj_dev);
 7622	kfree_rcu(adj, rcu);
 7623}
 7624
 7625static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 7626					    struct net_device *upper_dev,
 7627					    struct list_head *up_list,
 7628					    struct list_head *down_list,
 7629					    void *private, bool master)
 7630{
 7631	int ret;
 7632
 7633	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 7634					   private, master);
 7635	if (ret)
 7636		return ret;
 
 
 
 7637
 7638	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 7639					   private, false);
 7640	if (ret) {
 7641		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 7642		return ret;
 7643	}
 7644
 7645	return 0;
 7646}
 7647
 7648static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 7649					       struct net_device *upper_dev,
 7650					       u16 ref_nr,
 7651					       struct list_head *up_list,
 7652					       struct list_head *down_list)
 7653{
 7654	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
 7655	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 7656}
 7657
 7658static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 7659						struct net_device *upper_dev,
 7660						void *private, bool master)
 7661{
 7662	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 7663						&dev->adj_list.upper,
 7664						&upper_dev->adj_list.lower,
 7665						private, master);
 7666}
 7667
 7668static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 7669						   struct net_device *upper_dev)
 7670{
 7671	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 7672					   &dev->adj_list.upper,
 7673					   &upper_dev->adj_list.lower);
 7674}
 7675
 7676static int __netdev_upper_dev_link(struct net_device *dev,
 7677				   struct net_device *upper_dev, bool master,
 7678				   void *upper_priv, void *upper_info,
 7679				   struct netdev_nested_priv *priv,
 7680				   struct netlink_ext_ack *extack)
 7681{
 7682	struct netdev_notifier_changeupper_info changeupper_info = {
 7683		.info = {
 7684			.dev = dev,
 7685			.extack = extack,
 7686		},
 7687		.upper_dev = upper_dev,
 7688		.master = master,
 7689		.linking = true,
 7690		.upper_info = upper_info,
 7691	};
 7692	struct net_device *master_dev;
 7693	int ret = 0;
 7694
 7695	ASSERT_RTNL();
 7696
 7697	if (dev == upper_dev)
 7698		return -EBUSY;
 7699
 7700	/* To prevent loops, check if dev is not upper device to upper_dev. */
 7701	if (__netdev_has_upper_dev(upper_dev, dev))
 7702		return -EBUSY;
 7703
 7704	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 7705		return -EMLINK;
 7706
 7707	if (!master) {
 7708		if (__netdev_has_upper_dev(dev, upper_dev))
 7709			return -EEXIST;
 7710	} else {
 7711		master_dev = __netdev_master_upper_dev_get(dev);
 7712		if (master_dev)
 7713			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 7714	}
 7715
 7716	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7717					    &changeupper_info.info);
 7718	ret = notifier_to_errno(ret);
 7719	if (ret)
 7720		return ret;
 7721
 7722	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 7723						   master);
 7724	if (ret)
 7725		return ret;
 7726
 7727	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7728					    &changeupper_info.info);
 7729	ret = notifier_to_errno(ret);
 7730	if (ret)
 7731		goto rollback;
 7732
 7733	__netdev_update_upper_level(dev, NULL);
 7734	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7735
 7736	__netdev_update_lower_level(upper_dev, priv);
 7737	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7738				    priv);
 7739
 7740	return 0;
 7741
 7742rollback:
 7743	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7744
 7745	return ret;
 7746}
 7747
 7748/**
 7749 * netdev_upper_dev_link - Add a link to the upper device
 7750 * @dev: device
 7751 * @upper_dev: new upper device
 7752 * @extack: netlink extended ack
 7753 *
 7754 * Adds a link to device which is upper to this one. The caller must hold
 7755 * the RTNL lock. On a failure a negative errno code is returned.
 7756 * On success the reference counts are adjusted and the function
 7757 * returns zero.
 7758 */
 7759int netdev_upper_dev_link(struct net_device *dev,
 7760			  struct net_device *upper_dev,
 7761			  struct netlink_ext_ack *extack)
 7762{
 7763	struct netdev_nested_priv priv = {
 7764		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7765		.data = NULL,
 7766	};
 7767
 7768	return __netdev_upper_dev_link(dev, upper_dev, false,
 7769				       NULL, NULL, &priv, extack);
 7770}
 7771EXPORT_SYMBOL(netdev_upper_dev_link);
 7772
 7773/**
 7774 * netdev_master_upper_dev_link - Add a master link to the upper device
 7775 * @dev: device
 7776 * @upper_dev: new upper device
 7777 * @upper_priv: upper device private
 7778 * @upper_info: upper info to be passed down via notifier
 7779 * @extack: netlink extended ack
 7780 *
 7781 * Adds a link to device which is upper to this one. In this case, only
 7782 * one master upper device can be linked, although other non-master devices
 7783 * might be linked as well. The caller must hold the RTNL lock.
 7784 * On a failure a negative errno code is returned. On success the reference
 7785 * counts are adjusted and the function returns zero.
 7786 */
 7787int netdev_master_upper_dev_link(struct net_device *dev,
 7788				 struct net_device *upper_dev,
 7789				 void *upper_priv, void *upper_info,
 7790				 struct netlink_ext_ack *extack)
 7791{
 7792	struct netdev_nested_priv priv = {
 7793		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7794		.data = NULL,
 7795	};
 7796
 7797	return __netdev_upper_dev_link(dev, upper_dev, true,
 7798				       upper_priv, upper_info, &priv, extack);
 7799}
 7800EXPORT_SYMBOL(netdev_master_upper_dev_link);
 7801
 7802static void __netdev_upper_dev_unlink(struct net_device *dev,
 7803				      struct net_device *upper_dev,
 7804				      struct netdev_nested_priv *priv)
 7805{
 7806	struct netdev_notifier_changeupper_info changeupper_info = {
 7807		.info = {
 7808			.dev = dev,
 7809		},
 7810		.upper_dev = upper_dev,
 7811		.linking = false,
 7812	};
 7813
 7814	ASSERT_RTNL();
 7815
 7816	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 7817
 7818	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7819				      &changeupper_info.info);
 7820
 7821	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7822
 7823	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7824				      &changeupper_info.info);
 7825
 7826	__netdev_update_upper_level(dev, NULL);
 7827	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7828
 7829	__netdev_update_lower_level(upper_dev, priv);
 7830	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7831				    priv);
 7832}
 7833
 7834/**
 7835 * netdev_upper_dev_unlink - Removes a link to upper device
 7836 * @dev: device
 7837 * @upper_dev: new upper device
 7838 *
 7839 * Removes a link to device which is upper to this one. The caller must hold
 7840 * the RTNL lock.
 7841 */
 7842void netdev_upper_dev_unlink(struct net_device *dev,
 7843			     struct net_device *upper_dev)
 7844{
 7845	struct netdev_nested_priv priv = {
 7846		.flags = NESTED_SYNC_TODO,
 7847		.data = NULL,
 7848	};
 7849
 7850	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
 7851}
 7852EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
 
 
 7853
 7854static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 7855				      struct net_device *lower_dev,
 7856				      bool val)
 7857{
 7858	struct netdev_adjacent *adj;
 7859
 7860	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 7861	if (adj)
 7862		adj->ignore = val;
 7863
 7864	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 7865	if (adj)
 7866		adj->ignore = val;
 7867}
 7868
 7869static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
 7870					struct net_device *lower_dev)
 7871{
 7872	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
 7873}
 7874
 7875static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
 7876				       struct net_device *lower_dev)
 7877{
 7878	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
 7879}
 7880
 7881int netdev_adjacent_change_prepare(struct net_device *old_dev,
 7882				   struct net_device *new_dev,
 7883				   struct net_device *dev,
 7884				   struct netlink_ext_ack *extack)
 7885{
 7886	struct netdev_nested_priv priv = {
 7887		.flags = 0,
 7888		.data = NULL,
 7889	};
 7890	int err;
 7891
 7892	if (!new_dev)
 7893		return 0;
 7894
 7895	if (old_dev && new_dev != old_dev)
 7896		netdev_adjacent_dev_disable(dev, old_dev);
 7897	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
 7898				      extack);
 7899	if (err) {
 7900		if (old_dev && new_dev != old_dev)
 7901			netdev_adjacent_dev_enable(dev, old_dev);
 7902		return err;
 7903	}
 7904
 7905	return 0;
 7906}
 7907EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 7908
 7909void netdev_adjacent_change_commit(struct net_device *old_dev,
 7910				   struct net_device *new_dev,
 7911				   struct net_device *dev)
 7912{
 7913	struct netdev_nested_priv priv = {
 7914		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7915		.data = NULL,
 7916	};
 7917
 7918	if (!new_dev || !old_dev)
 7919		return;
 7920
 7921	if (new_dev == old_dev)
 7922		return;
 7923
 7924	netdev_adjacent_dev_enable(dev, old_dev);
 7925	__netdev_upper_dev_unlink(old_dev, dev, &priv);
 7926}
 7927EXPORT_SYMBOL(netdev_adjacent_change_commit);
 7928
 7929void netdev_adjacent_change_abort(struct net_device *old_dev,
 7930				  struct net_device *new_dev,
 7931				  struct net_device *dev)
 7932{
 7933	struct netdev_nested_priv priv = {
 7934		.flags = 0,
 7935		.data = NULL,
 7936	};
 7937
 7938	if (!new_dev)
 7939		return;
 7940
 7941	if (old_dev && new_dev != old_dev)
 7942		netdev_adjacent_dev_enable(dev, old_dev);
 7943
 7944	__netdev_upper_dev_unlink(new_dev, dev, &priv);
 7945}
 7946EXPORT_SYMBOL(netdev_adjacent_change_abort);
 7947
 7948/**
 7949 * netdev_bonding_info_change - Dispatch event about slave change
 7950 * @dev: device
 7951 * @bonding_info: info to dispatch
 7952 *
 7953 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 7954 * The caller must hold the RTNL lock.
 7955 */
 7956void netdev_bonding_info_change(struct net_device *dev,
 7957				struct netdev_bonding_info *bonding_info)
 7958{
 7959	struct netdev_notifier_bonding_info info = {
 7960		.info.dev = dev,
 7961	};
 7962
 7963	memcpy(&info.bonding_info, bonding_info,
 7964	       sizeof(struct netdev_bonding_info));
 7965	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 7966				      &info.info);
 7967}
 7968EXPORT_SYMBOL(netdev_bonding_info_change);
 7969
 7970/**
 7971 * netdev_get_xmit_slave - Get the xmit slave of master device
 7972 * @dev: device
 7973 * @skb: The packet
 7974 * @all_slaves: assume all the slaves are active
 7975 *
 7976 * The reference counters are not incremented so the caller must be
 7977 * careful with locks. The caller must hold RCU lock.
 7978 * %NULL is returned if no slave is found.
 
 7979 */
 7980
 7981struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 7982					 struct sk_buff *skb,
 7983					 bool all_slaves)
 7984{
 7985	const struct net_device_ops *ops = dev->netdev_ops;
 7986
 7987	if (!ops->ndo_get_xmit_slave)
 7988		return NULL;
 7989	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
 7990}
 7991EXPORT_SYMBOL(netdev_get_xmit_slave);
 7992
 7993static void netdev_adjacent_add_links(struct net_device *dev)
 7994{
 7995	struct netdev_adjacent *iter;
 7996
 7997	struct net *net = dev_net(dev);
 7998
 7999	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8000		if (!net_eq(net, dev_net(iter->dev)))
 8001			continue;
 8002		netdev_adjacent_sysfs_add(iter->dev, dev,
 8003					  &iter->dev->adj_list.lower);
 8004		netdev_adjacent_sysfs_add(dev, iter->dev,
 8005					  &dev->adj_list.upper);
 8006	}
 8007
 8008	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8009		if (!net_eq(net, dev_net(iter->dev)))
 8010			continue;
 8011		netdev_adjacent_sysfs_add(iter->dev, dev,
 8012					  &iter->dev->adj_list.upper);
 8013		netdev_adjacent_sysfs_add(dev, iter->dev,
 8014					  &dev->adj_list.lower);
 8015	}
 8016}
 8017
 8018static void netdev_adjacent_del_links(struct net_device *dev)
 8019{
 8020	struct netdev_adjacent *iter;
 8021
 8022	struct net *net = dev_net(dev);
 8023
 8024	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8025		if (!net_eq(net, dev_net(iter->dev)))
 8026			continue;
 8027		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8028					  &iter->dev->adj_list.lower);
 8029		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8030					  &dev->adj_list.upper);
 8031	}
 8032
 8033	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8034		if (!net_eq(net, dev_net(iter->dev)))
 8035			continue;
 8036		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8037					  &iter->dev->adj_list.upper);
 8038		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8039					  &dev->adj_list.lower);
 8040	}
 8041}
 8042
 8043void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 8044{
 8045	struct netdev_adjacent *iter;
 8046
 8047	struct net *net = dev_net(dev);
 8048
 8049	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8050		if (!net_eq(net, dev_net(iter->dev)))
 8051			continue;
 8052		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8053					  &iter->dev->adj_list.lower);
 8054		netdev_adjacent_sysfs_add(iter->dev, dev,
 8055					  &iter->dev->adj_list.lower);
 8056	}
 8057
 8058	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8059		if (!net_eq(net, dev_net(iter->dev)))
 8060			continue;
 8061		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8062					  &iter->dev->adj_list.upper);
 8063		netdev_adjacent_sysfs_add(iter->dev, dev,
 8064					  &iter->dev->adj_list.upper);
 8065	}
 8066}
 8067
 8068void *netdev_lower_dev_get_private(struct net_device *dev,
 8069				   struct net_device *lower_dev)
 8070{
 8071	struct netdev_adjacent *lower;
 8072
 8073	if (!lower_dev)
 8074		return NULL;
 8075	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 8076	if (!lower)
 8077		return NULL;
 8078
 8079	return lower->private;
 8080}
 8081EXPORT_SYMBOL(netdev_lower_dev_get_private);
 8082
 8083
 8084/**
 8085 * netdev_lower_change - Dispatch event about lower device state change
 8086 * @lower_dev: device
 8087 * @lower_state_info: state to dispatch
 8088 *
 8089 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 8090 * The caller must hold the RTNL lock.
 
 
 8091 */
 8092void netdev_lower_state_changed(struct net_device *lower_dev,
 8093				void *lower_state_info)
 8094{
 8095	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 8096		.info.dev = lower_dev,
 8097	};
 8098
 8099	ASSERT_RTNL();
 8100	changelowerstate_info.lower_state_info = lower_state_info;
 8101	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 8102				      &changelowerstate_info.info);
 
 
 
 
 
 
 
 
 8103}
 8104EXPORT_SYMBOL(netdev_lower_state_changed);
 8105
 8106static void dev_change_rx_flags(struct net_device *dev, int flags)
 8107{
 8108	const struct net_device_ops *ops = dev->netdev_ops;
 8109
 8110	if (ops->ndo_change_rx_flags)
 8111		ops->ndo_change_rx_flags(dev, flags);
 8112}
 8113
 8114static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 8115{
 8116	unsigned int old_flags = dev->flags;
 8117	kuid_t uid;
 8118	kgid_t gid;
 8119
 8120	ASSERT_RTNL();
 8121
 8122	dev->flags |= IFF_PROMISC;
 8123	dev->promiscuity += inc;
 8124	if (dev->promiscuity == 0) {
 8125		/*
 8126		 * Avoid overflow.
 8127		 * If inc causes overflow, untouch promisc and return error.
 8128		 */
 8129		if (inc < 0)
 8130			dev->flags &= ~IFF_PROMISC;
 8131		else {
 8132			dev->promiscuity -= inc;
 8133			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
 8134				dev->name);
 8135			return -EOVERFLOW;
 8136		}
 8137	}
 8138	if (dev->flags != old_flags) {
 8139		pr_info("device %s %s promiscuous mode\n",
 8140			dev->name,
 8141			dev->flags & IFF_PROMISC ? "entered" : "left");
 8142		if (audit_enabled) {
 8143			current_uid_gid(&uid, &gid);
 8144			audit_log(audit_context(), GFP_ATOMIC,
 8145				  AUDIT_ANOM_PROMISCUOUS,
 8146				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
 8147				  dev->name, (dev->flags & IFF_PROMISC),
 8148				  (old_flags & IFF_PROMISC),
 8149				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
 8150				  from_kuid(&init_user_ns, uid),
 8151				  from_kgid(&init_user_ns, gid),
 8152				  audit_get_sessionid(current));
 8153		}
 8154
 8155		dev_change_rx_flags(dev, IFF_PROMISC);
 8156	}
 8157	if (notify)
 8158		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
 8159	return 0;
 8160}
 8161
 8162/**
 8163 *	dev_set_promiscuity	- update promiscuity count on a device
 8164 *	@dev: device
 8165 *	@inc: modifier
 8166 *
 8167 *	Add or remove promiscuity from a device. While the count in the device
 8168 *	remains above zero the interface remains promiscuous. Once it hits zero
 8169 *	the device reverts back to normal filtering operation. A negative inc
 8170 *	value is used to drop promiscuity on the device.
 8171 *	Return 0 if successful or a negative errno code on error.
 8172 */
 8173int dev_set_promiscuity(struct net_device *dev, int inc)
 8174{
 8175	unsigned int old_flags = dev->flags;
 8176	int err;
 8177
 8178	err = __dev_set_promiscuity(dev, inc, true);
 8179	if (err < 0)
 8180		return err;
 8181	if (dev->flags != old_flags)
 8182		dev_set_rx_mode(dev);
 8183	return err;
 8184}
 8185EXPORT_SYMBOL(dev_set_promiscuity);
 8186
 8187static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 
 
 
 
 
 
 
 
 
 
 
 
 
 8188{
 8189	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 8190
 8191	ASSERT_RTNL();
 8192
 8193	dev->flags |= IFF_ALLMULTI;
 8194	dev->allmulti += inc;
 8195	if (dev->allmulti == 0) {
 8196		/*
 8197		 * Avoid overflow.
 8198		 * If inc causes overflow, untouch allmulti and return error.
 8199		 */
 8200		if (inc < 0)
 8201			dev->flags &= ~IFF_ALLMULTI;
 8202		else {
 8203			dev->allmulti -= inc;
 8204			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
 8205				dev->name);
 8206			return -EOVERFLOW;
 8207		}
 8208	}
 8209	if (dev->flags ^ old_flags) {
 8210		dev_change_rx_flags(dev, IFF_ALLMULTI);
 8211		dev_set_rx_mode(dev);
 8212		if (notify)
 8213			__dev_notify_flags(dev, old_flags,
 8214					   dev->gflags ^ old_gflags);
 8215	}
 8216	return 0;
 8217}
 8218
 8219/**
 8220 *	dev_set_allmulti	- update allmulti count on a device
 8221 *	@dev: device
 8222 *	@inc: modifier
 8223 *
 8224 *	Add or remove reception of all multicast frames to a device. While the
 8225 *	count in the device remains above zero the interface remains listening
 8226 *	to all interfaces. Once it hits zero the device reverts back to normal
 8227 *	filtering operation. A negative @inc value is used to drop the counter
 8228 *	when releasing a resource needing all multicasts.
 8229 *	Return 0 if successful or a negative errno code on error.
 8230 */
 8231
 8232int dev_set_allmulti(struct net_device *dev, int inc)
 8233{
 8234	return __dev_set_allmulti(dev, inc, true);
 8235}
 8236EXPORT_SYMBOL(dev_set_allmulti);
 8237
 8238/*
 8239 *	Upload unicast and multicast address lists to device and
 8240 *	configure RX filtering. When the device doesn't support unicast
 8241 *	filtering it is put in promiscuous mode while unicast addresses
 8242 *	are present.
 8243 */
 8244void __dev_set_rx_mode(struct net_device *dev)
 8245{
 8246	const struct net_device_ops *ops = dev->netdev_ops;
 8247
 8248	/* dev_open will call this function so the list will stay sane. */
 8249	if (!(dev->flags&IFF_UP))
 8250		return;
 8251
 8252	if (!netif_device_present(dev))
 8253		return;
 8254
 8255	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 8256		/* Unicast addresses changes may only happen under the rtnl,
 8257		 * therefore calling __dev_set_promiscuity here is safe.
 8258		 */
 8259		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 8260			__dev_set_promiscuity(dev, 1, false);
 8261			dev->uc_promisc = true;
 8262		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 8263			__dev_set_promiscuity(dev, -1, false);
 8264			dev->uc_promisc = false;
 8265		}
 8266	}
 8267
 8268	if (ops->ndo_set_rx_mode)
 8269		ops->ndo_set_rx_mode(dev);
 8270}
 8271
 8272void dev_set_rx_mode(struct net_device *dev)
 8273{
 8274	netif_addr_lock_bh(dev);
 8275	__dev_set_rx_mode(dev);
 8276	netif_addr_unlock_bh(dev);
 8277}
 8278
 8279/**
 8280 *	dev_get_flags - get flags reported to userspace
 8281 *	@dev: device
 8282 *
 8283 *	Get the combination of flag bits exported through APIs to userspace.
 8284 */
 8285unsigned int dev_get_flags(const struct net_device *dev)
 8286{
 8287	unsigned int flags;
 8288
 8289	flags = (dev->flags & ~(IFF_PROMISC |
 8290				IFF_ALLMULTI |
 8291				IFF_RUNNING |
 8292				IFF_LOWER_UP |
 8293				IFF_DORMANT)) |
 8294		(dev->gflags & (IFF_PROMISC |
 8295				IFF_ALLMULTI));
 8296
 8297	if (netif_running(dev)) {
 8298		if (netif_oper_up(dev))
 8299			flags |= IFF_RUNNING;
 8300		if (netif_carrier_ok(dev))
 8301			flags |= IFF_LOWER_UP;
 8302		if (netif_dormant(dev))
 8303			flags |= IFF_DORMANT;
 8304	}
 8305
 8306	return flags;
 8307}
 8308EXPORT_SYMBOL(dev_get_flags);
 8309
 8310int __dev_change_flags(struct net_device *dev, unsigned int flags,
 8311		       struct netlink_ext_ack *extack)
 8312{
 8313	unsigned int old_flags = dev->flags;
 8314	int ret;
 8315
 8316	ASSERT_RTNL();
 8317
 8318	/*
 8319	 *	Set the flags on our device.
 8320	 */
 8321
 8322	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
 8323			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 8324			       IFF_AUTOMEDIA)) |
 8325		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
 8326				    IFF_ALLMULTI));
 8327
 8328	/*
 8329	 *	Load in the correct multicast list now the flags have changed.
 8330	 */
 8331
 8332	if ((old_flags ^ flags) & IFF_MULTICAST)
 8333		dev_change_rx_flags(dev, IFF_MULTICAST);
 8334
 8335	dev_set_rx_mode(dev);
 8336
 8337	/*
 8338	 *	Have we downed the interface. We handle IFF_UP ourselves
 8339	 *	according to user attempts to set it, rather than blindly
 8340	 *	setting it.
 8341	 */
 8342
 8343	ret = 0;
 8344	if ((old_flags ^ flags) & IFF_UP) {
 8345		if (old_flags & IFF_UP)
 8346			__dev_close(dev);
 8347		else
 8348			ret = __dev_open(dev, extack);
 8349	}
 8350
 8351	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 8352		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 8353		unsigned int old_flags = dev->flags;
 8354
 8355		dev->gflags ^= IFF_PROMISC;
 8356
 8357		if (__dev_set_promiscuity(dev, inc, false) >= 0)
 8358			if (dev->flags != old_flags)
 8359				dev_set_rx_mode(dev);
 8360	}
 8361
 8362	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 8363	 * is important. Some (broken) drivers set IFF_PROMISC, when
 8364	 * IFF_ALLMULTI is requested not asking us and not reporting.
 8365	 */
 8366	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 8367		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
 8368
 8369		dev->gflags ^= IFF_ALLMULTI;
 8370		__dev_set_allmulti(dev, inc, false);
 8371	}
 8372
 8373	return ret;
 8374}
 8375
 8376void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 8377			unsigned int gchanges)
 8378{
 8379	unsigned int changes = dev->flags ^ old_flags;
 8380
 8381	if (gchanges)
 8382		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
 8383
 8384	if (changes & IFF_UP) {
 8385		if (dev->flags & IFF_UP)
 8386			call_netdevice_notifiers(NETDEV_UP, dev);
 8387		else
 8388			call_netdevice_notifiers(NETDEV_DOWN, dev);
 8389	}
 8390
 8391	if (dev->flags & IFF_UP &&
 8392	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 8393		struct netdev_notifier_change_info change_info = {
 8394			.info = {
 8395				.dev = dev,
 8396			},
 8397			.flags_changed = changes,
 8398		};
 8399
 8400		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 8401	}
 8402}
 8403
 8404/**
 8405 *	dev_change_flags - change device settings
 8406 *	@dev: device
 8407 *	@flags: device state flags
 8408 *	@extack: netlink extended ack
 8409 *
 8410 *	Change settings on device based state flags. The flags are
 8411 *	in the userspace exported format.
 8412 */
 8413int dev_change_flags(struct net_device *dev, unsigned int flags,
 8414		     struct netlink_ext_ack *extack)
 8415{
 8416	int ret;
 8417	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 8418
 8419	ret = __dev_change_flags(dev, flags, extack);
 8420	if (ret < 0)
 8421		return ret;
 8422
 8423	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 8424	__dev_notify_flags(dev, old_flags, changes);
 
 
 
 8425	return ret;
 8426}
 8427EXPORT_SYMBOL(dev_change_flags);
 8428
 8429int __dev_set_mtu(struct net_device *dev, int new_mtu)
 8430{
 8431	const struct net_device_ops *ops = dev->netdev_ops;
 8432
 8433	if (ops->ndo_change_mtu)
 8434		return ops->ndo_change_mtu(dev, new_mtu);
 8435
 8436	/* Pairs with all the lockless reads of dev->mtu in the stack */
 8437	WRITE_ONCE(dev->mtu, new_mtu);
 8438	return 0;
 8439}
 8440EXPORT_SYMBOL(__dev_set_mtu);
 8441
 8442int dev_validate_mtu(struct net_device *dev, int new_mtu,
 8443		     struct netlink_ext_ack *extack)
 8444{
 8445	/* MTU must be positive, and in range */
 8446	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 8447		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 8448		return -EINVAL;
 8449	}
 8450
 8451	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 8452		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 8453		return -EINVAL;
 8454	}
 8455	return 0;
 8456}
 8457
 8458/**
 8459 *	dev_set_mtu_ext - Change maximum transfer unit
 8460 *	@dev: device
 8461 *	@new_mtu: new transfer unit
 8462 *	@extack: netlink extended ack
 8463 *
 8464 *	Change the maximum transfer size of the network device.
 8465 */
 8466int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 8467		    struct netlink_ext_ack *extack)
 8468{
 8469	int err, orig_mtu;
 
 8470
 8471	if (new_mtu == dev->mtu)
 8472		return 0;
 8473
 8474	err = dev_validate_mtu(dev, new_mtu, extack);
 8475	if (err)
 8476		return err;
 8477
 8478	if (!netif_device_present(dev))
 8479		return -ENODEV;
 8480
 8481	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 8482	err = notifier_to_errno(err);
 8483	if (err)
 8484		return err;
 8485
 8486	orig_mtu = dev->mtu;
 8487	err = __dev_set_mtu(dev, new_mtu);
 8488
 8489	if (!err) {
 8490		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8491						   orig_mtu);
 8492		err = notifier_to_errno(err);
 8493		if (err) {
 8494			/* setting mtu back and notifying everyone again,
 8495			 * so that they have a chance to revert changes.
 8496			 */
 8497			__dev_set_mtu(dev, orig_mtu);
 8498			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8499						     new_mtu);
 8500		}
 8501	}
 8502	return err;
 8503}
 8504
 8505int dev_set_mtu(struct net_device *dev, int new_mtu)
 8506{
 8507	struct netlink_ext_ack extack;
 8508	int err;
 8509
 8510	memset(&extack, 0, sizeof(extack));
 8511	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 8512	if (err && extack._msg)
 8513		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 8514	return err;
 8515}
 8516EXPORT_SYMBOL(dev_set_mtu);
 8517
 8518/**
 8519 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 8520 *	@dev: device
 8521 *	@new_len: new tx queue length
 8522 */
 8523int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 8524{
 8525	unsigned int orig_len = dev->tx_queue_len;
 8526	int res;
 8527
 8528	if (new_len != (unsigned int)new_len)
 8529		return -ERANGE;
 8530
 8531	if (new_len != orig_len) {
 8532		dev->tx_queue_len = new_len;
 8533		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 8534		res = notifier_to_errno(res);
 8535		if (res)
 8536			goto err_rollback;
 8537		res = dev_qdisc_change_tx_queue_len(dev);
 8538		if (res)
 8539			goto err_rollback;
 8540	}
 8541
 8542	return 0;
 8543
 8544err_rollback:
 8545	netdev_err(dev, "refused to change device tx_queue_len\n");
 8546	dev->tx_queue_len = orig_len;
 8547	return res;
 8548}
 8549
 8550/**
 8551 *	dev_set_group - Change group this device belongs to
 8552 *	@dev: device
 8553 *	@new_group: group this device should belong to
 8554 */
 8555void dev_set_group(struct net_device *dev, int new_group)
 8556{
 8557	dev->group = new_group;
 8558}
 8559EXPORT_SYMBOL(dev_set_group);
 8560
 8561/**
 8562 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 8563 *	@dev: device
 8564 *	@addr: new address
 8565 *	@extack: netlink extended ack
 8566 */
 8567int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 8568			      struct netlink_ext_ack *extack)
 8569{
 8570	struct netdev_notifier_pre_changeaddr_info info = {
 8571		.info.dev = dev,
 8572		.info.extack = extack,
 8573		.dev_addr = addr,
 8574	};
 8575	int rc;
 8576
 8577	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 8578	return notifier_to_errno(rc);
 8579}
 8580EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 8581
 8582/**
 8583 *	dev_set_mac_address - Change Media Access Control Address
 8584 *	@dev: device
 8585 *	@sa: new address
 8586 *	@extack: netlink extended ack
 8587 *
 8588 *	Change the hardware (MAC) address of the device
 8589 */
 8590int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 8591			struct netlink_ext_ack *extack)
 8592{
 8593	const struct net_device_ops *ops = dev->netdev_ops;
 8594	int err;
 8595
 8596	if (!ops->ndo_set_mac_address)
 8597		return -EOPNOTSUPP;
 8598	if (sa->sa_family != dev->type)
 8599		return -EINVAL;
 8600	if (!netif_device_present(dev))
 8601		return -ENODEV;
 8602	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 8603	if (err)
 8604		return err;
 8605	err = ops->ndo_set_mac_address(dev, sa);
 8606	if (err)
 8607		return err;
 8608	dev->addr_assign_type = NET_ADDR_SET;
 8609	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 8610	add_device_randomness(dev->dev_addr, dev->addr_len);
 8611	return 0;
 8612}
 8613EXPORT_SYMBOL(dev_set_mac_address);
 8614
 8615/**
 8616 *	dev_change_carrier - Change device carrier
 8617 *	@dev: device
 8618 *	@new_carrier: new value
 8619 *
 8620 *	Change device carrier
 8621 */
 8622int dev_change_carrier(struct net_device *dev, bool new_carrier)
 8623{
 8624	const struct net_device_ops *ops = dev->netdev_ops;
 
 8625
 8626	if (!ops->ndo_change_carrier)
 8627		return -EOPNOTSUPP;
 8628	if (!netif_device_present(dev))
 8629		return -ENODEV;
 8630	return ops->ndo_change_carrier(dev, new_carrier);
 8631}
 8632EXPORT_SYMBOL(dev_change_carrier);
 8633
 8634/**
 8635 *	dev_get_phys_port_id - Get device physical port ID
 8636 *	@dev: device
 8637 *	@ppid: port ID
 8638 *
 8639 *	Get device physical port ID
 8640 */
 8641int dev_get_phys_port_id(struct net_device *dev,
 8642			 struct netdev_phys_item_id *ppid)
 8643{
 8644	const struct net_device_ops *ops = dev->netdev_ops;
 8645
 8646	if (!ops->ndo_get_phys_port_id)
 8647		return -EOPNOTSUPP;
 8648	return ops->ndo_get_phys_port_id(dev, ppid);
 8649}
 8650EXPORT_SYMBOL(dev_get_phys_port_id);
 8651
 8652/**
 8653 *	dev_get_phys_port_name - Get device physical port name
 8654 *	@dev: device
 8655 *	@name: port name
 8656 *	@len: limit of bytes to copy to name
 8657 *
 8658 *	Get device physical port name
 8659 */
 8660int dev_get_phys_port_name(struct net_device *dev,
 8661			   char *name, size_t len)
 8662{
 8663	const struct net_device_ops *ops = dev->netdev_ops;
 8664	int err;
 8665
 8666	if (ops->ndo_get_phys_port_name) {
 8667		err = ops->ndo_get_phys_port_name(dev, name, len);
 8668		if (err != -EOPNOTSUPP)
 8669			return err;
 8670	}
 8671	return devlink_compat_phys_port_name_get(dev, name, len);
 8672}
 8673EXPORT_SYMBOL(dev_get_phys_port_name);
 8674
 8675/**
 8676 *	dev_get_port_parent_id - Get the device's port parent identifier
 8677 *	@dev: network device
 8678 *	@ppid: pointer to a storage for the port's parent identifier
 8679 *	@recurse: allow/disallow recursion to lower devices
 8680 *
 8681 *	Get the devices's port parent identifier
 8682 */
 8683int dev_get_port_parent_id(struct net_device *dev,
 8684			   struct netdev_phys_item_id *ppid,
 8685			   bool recurse)
 8686{
 8687	const struct net_device_ops *ops = dev->netdev_ops;
 8688	struct netdev_phys_item_id first = { };
 8689	struct net_device *lower_dev;
 8690	struct list_head *iter;
 8691	int err;
 8692
 8693	if (ops->ndo_get_port_parent_id) {
 8694		err = ops->ndo_get_port_parent_id(dev, ppid);
 8695		if (err != -EOPNOTSUPP)
 8696			return err;
 8697	}
 8698
 8699	err = devlink_compat_switch_id_get(dev, ppid);
 8700	if (!err || err != -EOPNOTSUPP)
 8701		return err;
 8702
 8703	if (!recurse)
 8704		return -EOPNOTSUPP;
 
 
 
 
 
 8705
 8706	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 8707		err = dev_get_port_parent_id(lower_dev, ppid, recurse);
 8708		if (err)
 8709			break;
 8710		if (!first.id_len)
 8711			first = *ppid;
 8712		else if (memcmp(&first, ppid, sizeof(*ppid)))
 8713			return -EOPNOTSUPP;
 8714	}
 8715
 8716	return err;
 8717}
 8718EXPORT_SYMBOL(dev_get_port_parent_id);
 8719
 8720/**
 8721 *	netdev_port_same_parent_id - Indicate if two network devices have
 8722 *	the same port parent identifier
 8723 *	@a: first network device
 8724 *	@b: second network device
 8725 */
 8726bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 8727{
 8728	struct netdev_phys_item_id a_id = { };
 8729	struct netdev_phys_item_id b_id = { };
 
 8730
 8731	if (dev_get_port_parent_id(a, &a_id, true) ||
 8732	    dev_get_port_parent_id(b, &b_id, true))
 8733		return false;
 8734
 8735	return netdev_phys_item_id_same(&a_id, &b_id);
 8736}
 8737EXPORT_SYMBOL(netdev_port_same_parent_id);
 8738
 8739/**
 8740 *	dev_change_proto_down - update protocol port state information
 8741 *	@dev: device
 8742 *	@proto_down: new value
 8743 *
 8744 *	This info can be used by switch drivers to set the phys state of the
 8745 *	port.
 8746 */
 8747int dev_change_proto_down(struct net_device *dev, bool proto_down)
 8748{
 8749	const struct net_device_ops *ops = dev->netdev_ops;
 8750
 8751	if (!ops->ndo_change_proto_down)
 
 8752		return -EOPNOTSUPP;
 8753	if (!netif_device_present(dev))
 8754		return -ENODEV;
 8755	return ops->ndo_change_proto_down(dev, proto_down);
 8756}
 8757EXPORT_SYMBOL(dev_change_proto_down);
 8758
 8759/**
 8760 *	dev_change_proto_down_generic - generic implementation for
 8761 * 	ndo_change_proto_down that sets carrier according to
 8762 * 	proto_down.
 8763 *
 8764 *	@dev: device
 8765 *	@proto_down: new value
 8766 */
 8767int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
 8768{
 8769	if (proto_down)
 8770		netif_carrier_off(dev);
 8771	else
 8772		netif_carrier_on(dev);
 8773	dev->proto_down = proto_down;
 8774	return 0;
 8775}
 8776EXPORT_SYMBOL(dev_change_proto_down_generic);
 8777
 8778/**
 8779 *	dev_change_proto_down_reason - proto down reason
 8780 *
 8781 *	@dev: device
 8782 *	@mask: proto down mask
 8783 *	@value: proto down value
 8784 */
 8785void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
 8786				  u32 value)
 8787{
 8788	int b;
 8789
 8790	if (!mask) {
 8791		dev->proto_down_reason = value;
 8792	} else {
 8793		for_each_set_bit(b, &mask, 32) {
 8794			if (value & (1 << b))
 8795				dev->proto_down_reason |= BIT(b);
 8796			else
 8797				dev->proto_down_reason &= ~BIT(b);
 8798		}
 8799	}
 8800}
 8801EXPORT_SYMBOL(dev_change_proto_down_reason);
 8802
  8803struct bpf_xdp_link {
  8804	struct bpf_link link; /* embedded generic link; container_of() on it recovers the bpf_xdp_link */
  8805	struct net_device *dev; /* protected by rtnl_lock, no refcnt held; set to NULL once the link is detached */
  8806	int flags; /* XDP flags supplied at link creation (attr->link_create.flags) */
  8807};
 
 
 8808
 8809static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
 8810{
 8811	if (flags & XDP_FLAGS_HW_MODE)
 8812		return XDP_MODE_HW;
 8813	if (flags & XDP_FLAGS_DRV_MODE)
 8814		return XDP_MODE_DRV;
 8815	if (flags & XDP_FLAGS_SKB_MODE)
 8816		return XDP_MODE_SKB;
 8817	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
 8818}
 8819
 8820static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
 8821{
 8822	switch (mode) {
 8823	case XDP_MODE_SKB:
 8824		return generic_xdp_install;
 8825	case XDP_MODE_DRV:
 8826	case XDP_MODE_HW:
 8827		return dev->netdev_ops->ndo_bpf;
 8828	default:
 8829		return NULL;
 8830	};
 8831}
 8832
 8833static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
 8834					 enum bpf_xdp_mode mode)
 8835{
 8836	return dev->xdp_state[mode].link;
 8837}
 8838
 8839static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 8840				     enum bpf_xdp_mode mode)
 8841{
 8842	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
 8843
 8844	if (link)
 8845		return link->link.prog;
 8846	return dev->xdp_state[mode].prog;
 8847}
 
 8848
 8849u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 8850{
 8851	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 8852
 8853	return prog ? prog->aux->id : 0;
 
 8854}
 8855
 8856static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
 8857			     struct bpf_xdp_link *link)
 8858{
 8859	dev->xdp_state[mode].link = link;
 8860	dev->xdp_state[mode].prog = NULL;
 8861}
 8862
 8863static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
 8864			     struct bpf_prog *prog)
 8865{
 8866	dev->xdp_state[mode].link = NULL;
 8867	dev->xdp_state[mode].prog = prog;
 8868}
 
 
 
 
 
 8869
 8870static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
 8871			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
 8872			   u32 flags, struct bpf_prog *prog)
 8873{
 8874	struct netdev_bpf xdp;
 8875	int err;
 
 8876
 8877	memset(&xdp, 0, sizeof(xdp));
 8878	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
 8879	xdp.extack = extack;
 8880	xdp.flags = flags;
 8881	xdp.prog = prog;
 8882
 8883	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
 8884	 * "moved" into driver), so they don't increment it on their own, but
 8885	 * they do decrement refcnt when program is detached or replaced.
 8886	 * Given net_device also owns link/prog, we need to bump refcnt here
 8887	 * to prevent drivers from underflowing it.
 8888	 */
 8889	if (prog)
 8890		bpf_prog_inc(prog);
 8891	err = bpf_op(dev, &xdp);
 8892	if (err) {
 8893		if (prog)
 8894			bpf_prog_put(prog);
 8895		return err;
 8896	}
 8897
 8898	if (mode != XDP_MODE_HW)
 8899		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
 8900
 8901	return 0;
 8902}
 8903
 8904static void dev_xdp_uninstall(struct net_device *dev)
 8905{
 8906	struct bpf_xdp_link *link;
 8907	struct bpf_prog *prog;
 8908	enum bpf_xdp_mode mode;
 8909	bpf_op_t bpf_op;
 8910
 8911	ASSERT_RTNL();
 8912
 8913	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
 8914		prog = dev_xdp_prog(dev, mode);
 8915		if (!prog)
 8916			continue;
 8917
 8918		bpf_op = dev_xdp_bpf_op(dev, mode);
 8919		if (!bpf_op)
 8920			continue;
 8921
 8922		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 8923
 8924		/* auto-detach link from net device */
 8925		link = dev_xdp_link(dev, mode);
 8926		if (link)
 8927			link->dev = NULL;
 8928		else
 8929			bpf_prog_put(prog);
 8930
 8931		dev_xdp_set_link(dev, mode, NULL);
 8932	}
 8933}
 
 8934
 8935static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
 8936			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
 8937			  struct bpf_prog *old_prog, u32 flags)
 8938{
 8939	struct bpf_prog *cur_prog;
 8940	enum bpf_xdp_mode mode;
 8941	bpf_op_t bpf_op;
 8942	int err;
 8943
 8944	ASSERT_RTNL();
 8945
 8946	/* either link or prog attachment, never both */
 8947	if (link && (new_prog || old_prog))
 8948		return -EINVAL;
 8949	/* link supports only XDP mode flags */
 8950	if (link && (flags & ~XDP_FLAGS_MODES)) {
 8951		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
 8952		return -EINVAL;
 8953	}
 8954	/* just one XDP mode bit should be set, zero defaults to SKB mode */
 8955	if (hweight32(flags & XDP_FLAGS_MODES) > 1) {
 8956		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
 8957		return -EINVAL;
 8958	}
 8959	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
 8960	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
 8961		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
 8962		return -EINVAL;
 8963	}
 8964
 8965	mode = dev_xdp_mode(dev, flags);
 8966	/* can't replace attached link */
 8967	if (dev_xdp_link(dev, mode)) {
 8968		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
 8969		return -EBUSY;
 8970	}
 8971
 8972	cur_prog = dev_xdp_prog(dev, mode);
 8973	/* can't replace attached prog with link */
 8974	if (link && cur_prog) {
 8975		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
 8976		return -EBUSY;
 8977	}
 8978	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
 8979		NL_SET_ERR_MSG(extack, "Active program does not match expected");
 8980		return -EEXIST;
 8981	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 8982
 8983	/* put effective new program into new_prog */
 8984	if (link)
 8985		new_prog = link->link.prog;
 8986
 8987	if (new_prog) {
 8988		bool offload = mode == XDP_MODE_HW;
 8989		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
 8990					       ? XDP_MODE_DRV : XDP_MODE_SKB;
 8991
 8992		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
 8993			NL_SET_ERR_MSG(extack, "XDP program already attached");
 8994			return -EBUSY;
 8995		}
 8996		if (!offload && dev_xdp_prog(dev, other_mode)) {
 8997			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
 8998			return -EEXIST;
 8999		}
 9000		if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
 9001			NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
 9002			return -EINVAL;
 9003		}
 9004		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
 9005			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
 9006			return -EINVAL;
 9007		}
 9008		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
 9009			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
 9010			return -EINVAL;
 9011		}
 9012	}
 9013
 9014	/* don't call drivers if the effective program didn't change */
 9015	if (new_prog != cur_prog) {
 9016		bpf_op = dev_xdp_bpf_op(dev, mode);
 9017		if (!bpf_op) {
 9018			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
 9019			return -EOPNOTSUPP;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 9020		}
 
 9021
 9022		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
 9023		if (err)
 9024			return err;
 9025	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 9026
 9027	if (link)
 9028		dev_xdp_set_link(dev, mode, link);
 9029	else
 9030		dev_xdp_set_prog(dev, mode, new_prog);
 9031	if (cur_prog)
 9032		bpf_prog_put(cur_prog);
 
 
 9033
 9034	return 0;
 9035}
 9036
 9037static int dev_xdp_attach_link(struct net_device *dev,
 9038			       struct netlink_ext_ack *extack,
 9039			       struct bpf_xdp_link *link)
 9040{
 9041	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
 9042}
 9043
 9044static int dev_xdp_detach_link(struct net_device *dev,
 9045			       struct netlink_ext_ack *extack,
 9046			       struct bpf_xdp_link *link)
 9047{
 9048	enum bpf_xdp_mode mode;
 9049	bpf_op_t bpf_op;
 9050
 9051	ASSERT_RTNL();
 9052
 9053	mode = dev_xdp_mode(dev, link->flags);
 9054	if (dev_xdp_link(dev, mode) != link)
 9055		return -EINVAL;
 9056
 9057	bpf_op = dev_xdp_bpf_op(dev, mode);
 9058	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9059	dev_xdp_set_link(dev, mode, NULL);
 9060	return 0;
 9061}
 9062
 9063static void bpf_xdp_link_release(struct bpf_link *link)
 9064{
 9065	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9066
 9067	rtnl_lock();
 9068
 9069	/* if racing with net_device's tear down, xdp_link->dev might be
 9070	 * already NULL, in which case link was already auto-detached
 9071	 */
 9072	if (xdp_link->dev) {
 9073		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
 9074		xdp_link->dev = NULL;
 9075	}
 9076
 9077	rtnl_unlock();
 9078}
 9079
 9080static int bpf_xdp_link_detach(struct bpf_link *link)
 9081{
 9082	bpf_xdp_link_release(link);
 9083	return 0;
 9084}
 9085
 9086static void bpf_xdp_link_dealloc(struct bpf_link *link)
 9087{
 9088	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9089
 9090	kfree(xdp_link);
 9091}
 9092
 9093static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
 9094				     struct seq_file *seq)
 9095{
 9096	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9097	u32 ifindex = 0;
 9098
 9099	rtnl_lock();
 9100	if (xdp_link->dev)
 9101		ifindex = xdp_link->dev->ifindex;
 9102	rtnl_unlock();
 9103
 9104	seq_printf(seq, "ifindex:\t%u\n", ifindex);
 9105}
 9106
 9107static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
 9108				       struct bpf_link_info *info)
 9109{
 9110	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9111	u32 ifindex = 0;
 9112
 9113	rtnl_lock();
 9114	if (xdp_link->dev)
 9115		ifindex = xdp_link->dev->ifindex;
 9116	rtnl_unlock();
 9117
 9118	info->xdp.ifindex = ifindex;
 9119	return 0;
 9120}
 9121
 9122static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 9123			       struct bpf_prog *old_prog)
 9124{
 9125	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9126	enum bpf_xdp_mode mode;
 9127	bpf_op_t bpf_op;
 9128	int err = 0;
 9129
 9130	rtnl_lock();
 9131
 9132	/* link might have been auto-released already, so fail */
 9133	if (!xdp_link->dev) {
 9134		err = -ENOLINK;
 9135		goto out_unlock;
 9136	}
 9137
 9138	if (old_prog && link->prog != old_prog) {
 9139		err = -EPERM;
 9140		goto out_unlock;
 9141	}
 9142	old_prog = link->prog;
 9143	if (old_prog == new_prog) {
 9144		/* no-op, don't disturb drivers */
 9145		bpf_prog_put(new_prog);
 9146		goto out_unlock;
 9147	}
 9148
 9149	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
 9150	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
 9151	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
 9152			      xdp_link->flags, new_prog);
 9153	if (err)
 9154		goto out_unlock;
 9155
 9156	old_prog = xchg(&link->prog, new_prog);
 9157	bpf_prog_put(old_prog);
 9158
 9159out_unlock:
 9160	rtnl_unlock();
 9161	return err;
 9162}
 9163
 9164static const struct bpf_link_ops bpf_xdp_link_lops = {
 9165	.release = bpf_xdp_link_release,
 9166	.dealloc = bpf_xdp_link_dealloc,
 9167	.detach = bpf_xdp_link_detach,
 9168	.show_fdinfo = bpf_xdp_link_show_fdinfo,
 9169	.fill_link_info = bpf_xdp_link_fill_link_info,
 9170	.update_prog = bpf_xdp_link_update,
 9171};
 9172
 9173int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 9174{
 9175	struct net *net = current->nsproxy->net_ns;
 9176	struct bpf_link_primer link_primer;
 9177	struct bpf_xdp_link *link;
 9178	struct net_device *dev;
 9179	int err, fd;
 9180
 9181	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
 9182	if (!dev)
 9183		return -EINVAL;
 9184
 9185	link = kzalloc(sizeof(*link), GFP_USER);
 9186	if (!link) {
 9187		err = -ENOMEM;
 9188		goto out_put_dev;
 9189	}
 9190
 9191	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
 9192	link->dev = dev;
 9193	link->flags = attr->link_create.flags;
 9194
 9195	err = bpf_link_prime(&link->link, &link_primer);
 9196	if (err) {
 9197		kfree(link);
 9198		goto out_put_dev;
 9199	}
 9200
 9201	rtnl_lock();
 9202	err = dev_xdp_attach_link(dev, NULL, link);
 9203	rtnl_unlock();
 9204
 9205	if (err) {
 9206		bpf_link_cleanup(&link_primer);
 9207		goto out_put_dev;
 9208	}
 9209
 9210	fd = bpf_link_settle(&link_primer);
 9211	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
 9212	dev_put(dev);
 9213	return fd;
 9214
 9215out_put_dev:
 9216	dev_put(dev);
 9217	return err;
 9218}
 9219
 9220/**
 9221 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 9222 *	@dev: device
 9223 *	@extack: netlink extended ack
 9224 *	@fd: new program fd or negative value to clear
 9225 *	@expected_fd: old program fd that userspace expects to replace or clear
 9226 *	@flags: xdp-related flags
 9227 *
 9228 *	Set or clear a bpf program for a device
 9229 */
 9230int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 9231		      int fd, int expected_fd, u32 flags)
 9232{
 9233	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
 9234	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
 9235	int err;
 9236
 9237	ASSERT_RTNL();
 9238
 9239	if (fd >= 0) {
 9240		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
 9241						 mode != XDP_MODE_SKB);
 9242		if (IS_ERR(new_prog))
 9243			return PTR_ERR(new_prog);
 9244	}
 9245
 9246	if (expected_fd >= 0) {
 9247		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
 9248						 mode != XDP_MODE_SKB);
 9249		if (IS_ERR(old_prog)) {
 9250			err = PTR_ERR(old_prog);
 9251			old_prog = NULL;
 9252			goto err_out;
 9253		}
 9254	}
 9255
 9256	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
 9257
 9258err_out:
 9259	if (err && new_prog)
 9260		bpf_prog_put(new_prog);
 9261	if (old_prog)
 9262		bpf_prog_put(old_prog);
 9263	return err;
 9264}
 9265
 9266/**
 9267 *	dev_new_index	-	allocate an ifindex
 9268 *	@net: the applicable net namespace
 9269 *
 9270 *	Returns a suitable unique value for a new device interface
 9271 *	number.  The caller must hold the rtnl semaphore or the
 9272 *	dev_base_lock to be sure it remains unique.
 9273 */
 9274static int dev_new_index(struct net *net)
 9275{
 9276	int ifindex = net->ifindex;
 9277
 9278	for (;;) {
 9279		if (++ifindex <= 0)
 9280			ifindex = 1;
 9281		if (!__dev_get_by_index(net, ifindex))
 9282			return net->ifindex = ifindex;
 9283	}
 9284}
 9285
/* Delayed registration/unregistration */
/* Devices queued by net_set_todo(); netdev_run_todo() takes a snapshot
 * of this list and finishes the unregistration of each entry.
 */
static LIST_HEAD(net_todo_list);
/* Woken by netdev_run_todo() each time a device on the todo list has
 * been fully processed.
 */
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 9289
 9290static void net_set_todo(struct net_device *dev)
 9291{
 9292	list_add_tail(&dev->todo_list, &net_todo_list);
 9293	dev_net(dev)->dev_unreg_count++;
 9294}
 9295
 9296static void rollback_registered_many(struct list_head *head)
 9297{
 9298	struct net_device *dev, *tmp;
 9299	LIST_HEAD(close_head);
 9300
 9301	BUG_ON(dev_boot_phase);
 9302	ASSERT_RTNL();
 9303
 9304	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
 9305		/* Some devices call without registering
 9306		 * for initialization unwind. Remove those
 9307		 * devices and proceed with the remaining.
 9308		 */
 9309		if (dev->reg_state == NETREG_UNINITIALIZED) {
 9310			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
 9311				 dev->name, dev);
 9312
 9313			WARN_ON(1);
 9314			list_del(&dev->unreg_list);
 9315			continue;
 9316		}
 9317		dev->dismantle = true;
 9318		BUG_ON(dev->reg_state != NETREG_REGISTERED);
 9319	}
 9320
 9321	/* If device is running, close it first. */
 9322	list_for_each_entry(dev, head, unreg_list)
 9323		list_add_tail(&dev->close_list, &close_head);
 9324	dev_close_many(&close_head, true);
 9325
 9326	list_for_each_entry(dev, head, unreg_list) {
 9327		/* And unlink it from device chain. */
 9328		unlist_netdevice(dev);
 9329
 9330		dev->reg_state = NETREG_UNREGISTERING;
 9331	}
 9332	flush_all_backlogs();
 9333
 9334	synchronize_net();
 9335
 9336	list_for_each_entry(dev, head, unreg_list) {
 9337		struct sk_buff *skb = NULL;
 9338
 9339		/* Shutdown queueing discipline. */
 9340		dev_shutdown(dev);
 9341
 9342		dev_xdp_uninstall(dev);
 9343
 9344		/* Notify protocols, that we are about to destroy
 9345		 * this device. They should clean all the things.
 9346		 */
 9347		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 9348
 9349		if (!dev->rtnl_link_ops ||
 9350		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 9351			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
 9352						     GFP_KERNEL, NULL, 0);
 9353
 9354		/*
 9355		 *	Flush the unicast and multicast chains
 9356		 */
 9357		dev_uc_flush(dev);
 9358		dev_mc_flush(dev);
 9359
 9360		netdev_name_node_alt_flush(dev);
 9361		netdev_name_node_free(dev->name_node);
 9362
 9363		if (dev->netdev_ops->ndo_uninit)
 9364			dev->netdev_ops->ndo_uninit(dev);
 9365
 9366		if (skb)
 9367			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 9368
 9369		/* Notifier chain MUST detach us all upper devices. */
 9370		WARN_ON(netdev_has_any_upper_dev(dev));
 9371		WARN_ON(netdev_has_any_lower_dev(dev));
 9372
 9373		/* Remove entries from kobject tree */
 9374		netdev_unregister_kobject(dev);
 9375#ifdef CONFIG_XPS
 9376		/* Remove XPS queueing entries */
 9377		netif_reset_xps_queues_gt(dev, 0);
 9378#endif
 9379	}
 9380
 
 
 
 
 9381	synchronize_net();
 9382
 9383	list_for_each_entry(dev, head, unreg_list)
 9384		dev_put(dev);
 9385}
 9386
 9387static void rollback_registered(struct net_device *dev)
 9388{
 9389	LIST_HEAD(single);
 9390
 9391	list_add(&dev->unreg_list, &single);
 9392	rollback_registered_many(&single);
 9393	list_del(&single);
 9394}
 9395
 9396static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
 9397	struct net_device *upper, netdev_features_t features)
 9398{
 9399	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9400	netdev_features_t feature;
 9401	int feature_bit;
 9402
 9403	for_each_netdev_feature(upper_disables, feature_bit) {
 9404		feature = __NETIF_F_BIT(feature_bit);
 9405		if (!(upper->wanted_features & feature)
 9406		    && (features & feature)) {
 9407			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
 9408				   &feature, upper->name);
 9409			features &= ~feature;
 9410		}
 9411	}
 9412
 9413	return features;
 9414}
 9415
 9416static void netdev_sync_lower_features(struct net_device *upper,
 9417	struct net_device *lower, netdev_features_t features)
 9418{
 9419	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9420	netdev_features_t feature;
 9421	int feature_bit;
 9422
 9423	for_each_netdev_feature(upper_disables, feature_bit) {
 9424		feature = __NETIF_F_BIT(feature_bit);
 9425		if (!(features & feature) && (lower->features & feature)) {
 9426			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 9427				   &feature, lower->name);
 9428			lower->wanted_features &= ~feature;
 9429			__netdev_update_features(lower);
 9430
 9431			if (unlikely(lower->features & feature))
 9432				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 9433					    &feature, lower->name);
 9434			else
 9435				netdev_features_change(lower);
 9436		}
 9437	}
 9438}
 9439
 9440static netdev_features_t netdev_fix_features(struct net_device *dev,
 9441	netdev_features_t features)
 9442{
 9443	/* Fix illegal checksum combinations */
 9444	if ((features & NETIF_F_HW_CSUM) &&
 9445	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
 9446		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
 9447		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
 9448	}
 9449
 
 
 
 
 
 
 
 
 9450	/* TSO requires that SG is present as well. */
 9451	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
 9452		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
 9453		features &= ~NETIF_F_ALL_TSO;
 9454	}
 9455
 9456	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
 9457					!(features & NETIF_F_IP_CSUM)) {
 9458		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
 9459		features &= ~NETIF_F_TSO;
 9460		features &= ~NETIF_F_TSO_ECN;
 9461	}
 9462
 9463	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
 9464					 !(features & NETIF_F_IPV6_CSUM)) {
 9465		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
 9466		features &= ~NETIF_F_TSO6;
 9467	}
 9468
 9469	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
 9470	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
 9471		features &= ~NETIF_F_TSO_MANGLEID;
 9472
 9473	/* TSO ECN requires that TSO is present as well. */
 9474	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
 9475		features &= ~NETIF_F_TSO_ECN;
 9476
 9477	/* Software GSO depends on SG. */
 9478	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
 9479		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
 9480		features &= ~NETIF_F_GSO;
 9481	}
 9482
 9483	/* GSO partial features require GSO partial be set */
 9484	if ((features & dev->gso_partial_features) &&
 9485	    !(features & NETIF_F_GSO_PARTIAL)) {
 9486		netdev_dbg(dev,
 9487			   "Dropping partially supported GSO features since no GSO partial.\n");
 9488		features &= ~dev->gso_partial_features;
 9489	}
 9490
 9491	if (!(features & NETIF_F_RXCSUM)) {
 9492		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
 9493		 * successfully merged by hardware must also have the
 9494		 * checksum verified by hardware.  If the user does not
 9495		 * want to enable RXCSUM, logically, we should disable GRO_HW.
 9496		 */
 9497		if (features & NETIF_F_GRO_HW) {
 9498			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
 9499			features &= ~NETIF_F_GRO_HW;
 9500		}
 9501	}
 9502
 9503	/* LRO/HW-GRO features cannot be combined with RX-FCS */
 9504	if (features & NETIF_F_RXFCS) {
 9505		if (features & NETIF_F_LRO) {
 9506			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
 9507			features &= ~NETIF_F_LRO;
 9508		}
 9509
 9510		if (features & NETIF_F_GRO_HW) {
 9511			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
 9512			features &= ~NETIF_F_GRO_HW;
 
 9513		}
 9514	}
 9515
 9516	return features;
 9517}
 9518
 9519int __netdev_update_features(struct net_device *dev)
 9520{
 9521	struct net_device *upper, *lower;
 9522	netdev_features_t features;
 9523	struct list_head *iter;
 9524	int err = -1;
 9525
 9526	ASSERT_RTNL();
 9527
 9528	features = netdev_get_wanted_features(dev);
 9529
 9530	if (dev->netdev_ops->ndo_fix_features)
 9531		features = dev->netdev_ops->ndo_fix_features(dev, features);
 9532
 9533	/* driver might be less strict about feature dependencies */
 9534	features = netdev_fix_features(dev, features);
 9535
 9536	/* some features can't be enabled if they're off an an upper device */
 9537	netdev_for_each_upper_dev_rcu(dev, upper, iter)
 9538		features = netdev_sync_upper_features(dev, upper, features);
 9539
 9540	if (dev->features == features)
 9541		goto sync_lower;
 9542
 9543	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
 9544		&dev->features, &features);
 9545
 9546	if (dev->netdev_ops->ndo_set_features)
 9547		err = dev->netdev_ops->ndo_set_features(dev, features);
 9548	else
 9549		err = 0;
 9550
 9551	if (unlikely(err < 0)) {
 9552		netdev_err(dev,
 9553			"set_features() failed (%d); wanted %pNF, left %pNF\n",
 9554			err, &features, &dev->features);
 9555		/* return non-0 since some features might have changed and
 9556		 * it's better to fire a spurious notification than miss it
 9557		 */
 9558		return -1;
 9559	}
 9560
 9561sync_lower:
 9562	/* some features must be disabled on lower devices when disabled
 9563	 * on an upper device (think: bonding master or bridge)
 9564	 */
 9565	netdev_for_each_lower_dev(dev, lower, iter)
 9566		netdev_sync_lower_features(dev, lower, features);
 9567
 9568	if (!err) {
 9569		netdev_features_t diff = features ^ dev->features;
 9570
 9571		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9572			/* udp_tunnel_{get,drop}_rx_info both need
 9573			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
 9574			 * device, or they won't do anything.
 9575			 * Thus we need to update dev->features
 9576			 * *before* calling udp_tunnel_get_rx_info,
 9577			 * but *after* calling udp_tunnel_drop_rx_info.
 9578			 */
 9579			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9580				dev->features = features;
 9581				udp_tunnel_get_rx_info(dev);
 9582			} else {
 9583				udp_tunnel_drop_rx_info(dev);
 9584			}
 9585		}
 9586
 9587		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9588			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9589				dev->features = features;
 9590				err |= vlan_get_rx_ctag_filter_info(dev);
 9591			} else {
 9592				vlan_drop_rx_ctag_filter_info(dev);
 9593			}
 9594		}
 9595
 9596		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
 9597			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
 9598				dev->features = features;
 9599				err |= vlan_get_rx_stag_filter_info(dev);
 9600			} else {
 9601				vlan_drop_rx_stag_filter_info(dev);
 9602			}
 9603		}
 9604
 9605		dev->features = features;
 9606	}
 9607
 9608	return err < 0 ? 0 : 1;
 9609}
 9610
 9611/**
 9612 *	netdev_update_features - recalculate device features
 9613 *	@dev: the device to check
 9614 *
 9615 *	Recalculate dev->features set and send notifications if it
 9616 *	has changed. Should be called after driver or hardware dependent
 9617 *	conditions might have changed that influence the features.
 9618 */
 9619void netdev_update_features(struct net_device *dev)
 9620{
 9621	if (__netdev_update_features(dev))
 9622		netdev_features_change(dev);
 9623}
 9624EXPORT_SYMBOL(netdev_update_features);
 9625
 9626/**
 9627 *	netdev_change_features - recalculate device features
 9628 *	@dev: the device to check
 9629 *
 9630 *	Recalculate dev->features set and send notifications even
 9631 *	if they have not changed. Should be called instead of
 9632 *	netdev_update_features() if also dev->vlan_features might
 9633 *	have changed to allow the changes to be propagated to stacked
 9634 *	VLAN devices.
 9635 */
 9636void netdev_change_features(struct net_device *dev)
 9637{
 9638	__netdev_update_features(dev);
 9639	netdev_features_change(dev);
 9640}
 9641EXPORT_SYMBOL(netdev_change_features);
 9642
 9643/**
 9644 *	netif_stacked_transfer_operstate -	transfer operstate
 9645 *	@rootdev: the root or lower level device to transfer state from
 9646 *	@dev: the device to transfer operstate to
 9647 *
 9648 *	Transfer operational state from root to device. This is normally
 9649 *	called when a stacking relationship exists between the root
 9650 *	device and the device(a leaf device).
 9651 */
 9652void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 9653					struct net_device *dev)
 9654{
 9655	if (rootdev->operstate == IF_OPER_DORMANT)
 9656		netif_dormant_on(dev);
 9657	else
 9658		netif_dormant_off(dev);
 9659
 9660	if (rootdev->operstate == IF_OPER_TESTING)
 9661		netif_testing_on(dev);
 9662	else
 9663		netif_testing_off(dev);
 9664
 9665	if (netif_carrier_ok(rootdev))
 9666		netif_carrier_on(dev);
 9667	else
 9668		netif_carrier_off(dev);
 9669}
 9670EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 9671
 
 9672static int netif_alloc_rx_queues(struct net_device *dev)
 9673{
 9674	unsigned int i, count = dev->num_rx_queues;
 9675	struct netdev_rx_queue *rx;
 9676	size_t sz = count * sizeof(*rx);
 9677	int err = 0;
 9678
 9679	BUG_ON(count < 1);
 9680
 9681	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9682	if (!rx)
 
 9683		return -ENOMEM;
 9684
 9685	dev->_rx = rx;
 9686
 9687	for (i = 0; i < count; i++) {
 9688		rx[i].dev = dev;
 9689
 9690		/* XDP RX-queue setup */
 9691		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
 9692		if (err < 0)
 9693			goto err_rxq_info;
 9694	}
 9695	return 0;
 9696
 9697err_rxq_info:
 9698	/* Rollback successful reg's and free other resources */
 9699	while (i--)
 9700		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 9701	kvfree(dev->_rx);
 9702	dev->_rx = NULL;
 9703	return err;
 9704}
 9705
 9706static void netif_free_rx_queues(struct net_device *dev)
 9707{
 9708	unsigned int i, count = dev->num_rx_queues;
 9709
 9710	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
 9711	if (!dev->_rx)
 9712		return;
 9713
 9714	for (i = 0; i < count; i++)
 9715		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
 9716
 9717	kvfree(dev->_rx);
 9718}
 
 9719
 9720static void netdev_init_one_queue(struct net_device *dev,
 9721				  struct netdev_queue *queue, void *_unused)
 9722{
 9723	/* Initialize queue lock */
 9724	spin_lock_init(&queue->_xmit_lock);
 9725	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
 9726	queue->xmit_lock_owner = -1;
 9727	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
 9728	queue->dev = dev;
 9729#ifdef CONFIG_BQL
 9730	dql_init(&queue->dql, HZ);
 9731#endif
 9732}
 9733
 9734static void netif_free_tx_queues(struct net_device *dev)
 9735{
 9736	kvfree(dev->_tx);
 9737}
 9738
 9739static int netif_alloc_netdev_queues(struct net_device *dev)
 9740{
 9741	unsigned int count = dev->num_tx_queues;
 9742	struct netdev_queue *tx;
 9743	size_t sz = count * sizeof(*tx);
 9744
 9745	if (count < 1 || count > 0xffff)
 9746		return -EINVAL;
 9747
 9748	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9749	if (!tx)
 
 9750		return -ENOMEM;
 9751
 9752	dev->_tx = tx;
 9753
 9754	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 9755	spin_lock_init(&dev->tx_global_lock);
 9756
 9757	return 0;
 9758}
 9759
 9760void netif_tx_stop_all_queues(struct net_device *dev)
 9761{
 9762	unsigned int i;
 9763
 9764	for (i = 0; i < dev->num_tx_queues; i++) {
 9765		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 9766
 9767		netif_tx_stop_queue(txq);
 9768	}
 9769}
 9770EXPORT_SYMBOL(netif_tx_stop_all_queues);
 9771
 9772/**
 9773 *	register_netdevice	- register a network device
 9774 *	@dev: device to register
 9775 *
 9776 *	Take a completed network device structure and add it to the kernel
 9777 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 9778 *	chain. 0 is returned on success. A negative errno code is returned
 9779 *	on a failure to set up the device, or if the name is a duplicate.
 9780 *
 9781 *	Callers must hold the rtnl semaphore. You may want
 9782 *	register_netdev() instead of this.
 9783 *
 9784 *	BUGS:
 9785 *	The locking appears insufficient to guarantee two parallel registers
 9786 *	will not get the same name.
 9787 */
 9788
 9789int register_netdevice(struct net_device *dev)
 9790{
 9791	int ret;
 9792	struct net *net = dev_net(dev);
 9793
 9794	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
 9795		     NETDEV_FEATURE_COUNT);
 9796	BUG_ON(dev_boot_phase);
 9797	ASSERT_RTNL();
 9798
 9799	might_sleep();
 9800
 9801	/* When net_device's are persistent, this will be fatal. */
 9802	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 9803	BUG_ON(!net);
 9804
 9805	ret = ethtool_check_ops(dev->ethtool_ops);
 9806	if (ret)
 9807		return ret;
 9808
 9809	spin_lock_init(&dev->addr_list_lock);
 9810	netdev_set_addr_lockdep_class(dev);
 9811
 9812	ret = dev_get_valid_name(net, dev, dev->name);
 
 
 9813	if (ret < 0)
 9814		goto out;
 9815
 9816	ret = -ENOMEM;
 9817	dev->name_node = netdev_name_node_head_alloc(dev);
 9818	if (!dev->name_node)
 9819		goto out;
 9820
 9821	/* Init, if this function is available */
 9822	if (dev->netdev_ops->ndo_init) {
 9823		ret = dev->netdev_ops->ndo_init(dev);
 9824		if (ret) {
 9825			if (ret > 0)
 9826				ret = -EIO;
 9827			goto err_free_name;
 9828		}
 9829	}
 9830
 9831	if (((dev->hw_features | dev->features) &
 9832	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
 9833	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
 9834	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
 9835		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
 9836		ret = -EINVAL;
 9837		goto err_uninit;
 9838	}
 9839
 9840	ret = -EBUSY;
 9841	if (!dev->ifindex)
 9842		dev->ifindex = dev_new_index(net);
 9843	else if (__dev_get_by_index(net, dev->ifindex))
 9844		goto err_uninit;
 9845
 9846	/* Transfer changeable features to wanted_features and enable
 9847	 * software offloads (GSO and GRO).
 9848	 */
 9849	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
 9850	dev->features |= NETIF_F_SOFT_FEATURES;
 9851
 9852	if (dev->netdev_ops->ndo_udp_tunnel_add) {
 9853		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9854		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9855	}
 9856
 9857	dev->wanted_features = dev->features & dev->hw_features;
 9858
 9859	if (!(dev->flags & IFF_LOOPBACK))
 
 9860		dev->hw_features |= NETIF_F_NOCACHE_COPY;
 9861
 9862	/* If IPv4 TCP segmentation offload is supported we should also
 9863	 * allow the device to enable segmenting the frame with the option
 9864	 * of ignoring a static IP ID value.  This doesn't enable the
 9865	 * feature itself but allows the user to enable it later.
 9866	 */
 9867	if (dev->hw_features & NETIF_F_TSO)
 9868		dev->hw_features |= NETIF_F_TSO_MANGLEID;
 9869	if (dev->vlan_features & NETIF_F_TSO)
 9870		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
 9871	if (dev->mpls_features & NETIF_F_TSO)
 9872		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
 9873	if (dev->hw_enc_features & NETIF_F_TSO)
 9874		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
 9875
 9876	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
 9877	 */
 9878	dev->vlan_features |= NETIF_F_HIGHDMA;
 9879
 9880	/* Make NETIF_F_SG inheritable to tunnel devices.
 9881	 */
 9882	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
 9883
 9884	/* Make NETIF_F_SG inheritable to MPLS.
 9885	 */
 9886	dev->mpls_features |= NETIF_F_SG;
 9887
 9888	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
 9889	ret = notifier_to_errno(ret);
 9890	if (ret)
 9891		goto err_uninit;
 9892
 9893	ret = netdev_register_kobject(dev);
 9894	if (ret) {
 9895		dev->reg_state = NETREG_UNREGISTERED;
 9896		goto err_uninit;
 9897	}
 9898	dev->reg_state = NETREG_REGISTERED;
 9899
 9900	__netdev_update_features(dev);
 9901
 9902	/*
 9903	 *	Default initial state at registry is that the
 9904	 *	device is present.
 9905	 */
 9906
 9907	set_bit(__LINK_STATE_PRESENT, &dev->state);
 9908
 9909	linkwatch_init_dev(dev);
 9910
 9911	dev_init_scheduler(dev);
 9912	dev_hold(dev);
 9913	list_netdevice(dev);
 9914	add_device_randomness(dev->dev_addr, dev->addr_len);
 9915
 9916	/* If the device has permanent device address, driver should
 9917	 * set dev_addr and also addr_assign_type should be set to
 9918	 * NET_ADDR_PERM (default value).
 9919	 */
 9920	if (dev->addr_assign_type == NET_ADDR_PERM)
 9921		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
 9922
 9923	/* Notify protocols, that a new device appeared. */
 9924	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
 9925	ret = notifier_to_errno(ret);
 9926	if (ret) {
 9927		rollback_registered(dev);
 9928		rcu_barrier();
 9929
 9930		dev->reg_state = NETREG_UNREGISTERED;
 9931		/* We should put the kobject that hold in
 9932		 * netdev_unregister_kobject(), otherwise
 9933		 * the net device cannot be freed when
 9934		 * driver calls free_netdev(), because the
 9935		 * kobject is being hold.
 9936		 */
 9937		kobject_put(&dev->dev.kobj);
 9938	}
 9939	/*
 9940	 *	Prevent userspace races by waiting until the network
 9941	 *	device is fully setup before sending notifications.
 9942	 */
 9943	if (!dev->rtnl_link_ops ||
 9944	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 9945		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
 9946
 9947out:
 9948	return ret;
 9949
 9950err_uninit:
 9951	if (dev->netdev_ops->ndo_uninit)
 9952		dev->netdev_ops->ndo_uninit(dev);
 9953	if (dev->priv_destructor)
 9954		dev->priv_destructor(dev);
 9955err_free_name:
 9956	netdev_name_node_free(dev->name_node);
 9957	goto out;
 9958}
 9959EXPORT_SYMBOL(register_netdevice);
 9960
 9961/**
 9962 *	init_dummy_netdev	- init a dummy network device for NAPI
 9963 *	@dev: device to init
 9964 *
 9965 *	This takes a network device structure and initialize the minimum
 9966 *	amount of fields so it can be used to schedule NAPI polls without
 9967 *	registering a full blown interface. This is to be used by drivers
 9968 *	that need to tie several hardware interfaces to a single NAPI
 9969 *	poll scheduler due to HW limitations.
 9970 */
 9971int init_dummy_netdev(struct net_device *dev)
 9972{
 9973	/* Clear everything. Note we don't initialize spinlocks
 9974	 * are they aren't supposed to be taken by any of the
 9975	 * NAPI code and this dummy netdev is supposed to be
 9976	 * only ever used for NAPI polls
 9977	 */
 9978	memset(dev, 0, sizeof(struct net_device));
 9979
 9980	/* make sure we BUG if trying to hit standard
 9981	 * register/unregister code path
 9982	 */
 9983	dev->reg_state = NETREG_DUMMY;
 9984
 9985	/* NAPI wants this */
 9986	INIT_LIST_HEAD(&dev->napi_list);
 9987
 9988	/* a dummy interface is started by default */
 9989	set_bit(__LINK_STATE_PRESENT, &dev->state);
 9990	set_bit(__LINK_STATE_START, &dev->state);
 9991
 9992	/* napi_busy_loop stats accounting wants this */
 9993	dev_net_set(dev, &init_net);
 9994
 9995	/* Note : We dont allocate pcpu_refcnt for dummy devices,
 9996	 * because users of this 'device' dont need to change
 9997	 * its refcount.
 9998	 */
 9999
10000	return 0;
10001}
10002EXPORT_SYMBOL_GPL(init_dummy_netdev);
10003
10004
10005/**
10006 *	register_netdev	- register a network device
10007 *	@dev: device to register
10008 *
10009 *	Take a completed network device structure and add it to the kernel
10010 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10011 *	chain. 0 is returned on success. A negative errno code is returned
10012 *	on a failure to set up the device, or if the name is a duplicate.
10013 *
10014 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10015 *	and expands the device name if you passed a format string to
10016 *	alloc_netdev.
10017 */
10018int register_netdev(struct net_device *dev)
10019{
10020	int err;
10021
10022	if (rtnl_lock_killable())
10023		return -EINTR;
10024	err = register_netdevice(dev);
10025	rtnl_unlock();
10026	return err;
10027}
10028EXPORT_SYMBOL(register_netdev);
10029
10030int netdev_refcnt_read(const struct net_device *dev)
10031{
10032	int i, refcnt = 0;
10033
10034	for_each_possible_cpu(i)
10035		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10036	return refcnt;
10037}
10038EXPORT_SYMBOL(netdev_refcnt_read);
10039
10040/**
10041 * netdev_wait_allrefs - wait until all references are gone.
10042 * @dev: target net_device
10043 *
10044 * This is called when unregistering network devices.
10045 *
10046 * Any protocol or device that holds a reference should register
10047 * for netdevice notification, and cleanup and put back the
10048 * reference if they receive an UNREGISTER event.
10049 * We can get stuck here if buggy protocols don't correctly
10050 * call dev_put.
10051 */
10052static void netdev_wait_allrefs(struct net_device *dev)
10053{
10054	unsigned long rebroadcast_time, warning_time;
10055	int refcnt;
10056
10057	linkwatch_forget_dev(dev);
10058
10059	rebroadcast_time = warning_time = jiffies;
10060	refcnt = netdev_refcnt_read(dev);
10061
10062	while (refcnt != 0) {
10063		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10064			rtnl_lock();
10065
10066			/* Rebroadcast unregister notification */
10067			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10068
10069			__rtnl_unlock();
10070			rcu_barrier();
10071			rtnl_lock();
10072
10073			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10074				     &dev->state)) {
10075				/* We must not have linkwatch events
10076				 * pending on unregister. If this
10077				 * happens, we simply run the queue
10078				 * unscheduled, resulting in a noop
10079				 * for this device.
10080				 */
10081				linkwatch_run_queue();
10082			}
10083
10084			__rtnl_unlock();
10085
10086			rebroadcast_time = jiffies;
10087		}
10088
10089		msleep(250);
10090
10091		refcnt = netdev_refcnt_read(dev);
10092
10093		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
10094			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10095				 dev->name, refcnt);
10096			warning_time = jiffies;
10097		}
10098	}
10099}
10100
10101/* The sequence is:
10102 *
10103 *	rtnl_lock();
10104 *	...
10105 *	register_netdevice(x1);
10106 *	register_netdevice(x2);
10107 *	...
10108 *	unregister_netdevice(y1);
10109 *	unregister_netdevice(y2);
10110 *      ...
10111 *	rtnl_unlock();
10112 *	free_netdev(y1);
10113 *	free_netdev(y2);
10114 *
10115 * We are invoked by rtnl_unlock().
10116 * This allows us to deal with problems:
10117 * 1) We can delete sysfs objects which invoke hotplug
10118 *    without deadlocking with linkwatch via keventd.
10119 * 2) Since we run with the RTNL semaphore not held, we can sleep
10120 *    safely in order to wait for the netdev refcnt to drop to zero.
10121 *
10122 * We must not return until all unregister events added during
10123 * the interval the lock was held have been completed.
10124 */
10125void netdev_run_todo(void)
10126{
10127	struct list_head list;
10128#ifdef CONFIG_LOCKDEP
10129	struct list_head unlink_list;
10130
10131	list_replace_init(&net_unlink_list, &unlink_list);
10132
10133	while (!list_empty(&unlink_list)) {
10134		struct net_device *dev = list_first_entry(&unlink_list,
10135							  struct net_device,
10136							  unlink_list);
10137		list_del(&dev->unlink_list);
10138		dev->nested_level = dev->lower_level - 1;
10139	}
10140#endif
10141
10142	/* Snapshot list, allow later requests */
10143	list_replace_init(&net_todo_list, &list);
10144
10145	__rtnl_unlock();
10146
10147
10148	/* Wait for rcu callbacks to finish before next phase */
 
10149	if (!list_empty(&list))
10150		rcu_barrier();
10151
10152	while (!list_empty(&list)) {
10153		struct net_device *dev
10154			= list_first_entry(&list, struct net_device, todo_list);
10155		list_del(&dev->todo_list);
10156
10157		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10158			pr_err("network todo '%s' but state %d\n",
10159			       dev->name, dev->reg_state);
10160			dump_stack();
10161			continue;
10162		}
10163
10164		dev->reg_state = NETREG_UNREGISTERED;
10165
 
 
10166		netdev_wait_allrefs(dev);
10167
10168		/* paranoia */
10169		BUG_ON(netdev_refcnt_read(dev));
10170		BUG_ON(!list_empty(&dev->ptype_all));
10171		BUG_ON(!list_empty(&dev->ptype_specific));
10172		WARN_ON(rcu_access_pointer(dev->ip_ptr));
10173		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10174#if IS_ENABLED(CONFIG_DECNET)
10175		WARN_ON(dev->dn_ptr);
10176#endif
10177		if (dev->priv_destructor)
10178			dev->priv_destructor(dev);
10179		if (dev->needs_free_netdev)
10180			free_netdev(dev);
10181
10182		/* Report a network device has been unregistered */
10183		rtnl_lock();
10184		dev_net(dev)->dev_unreg_count--;
10185		__rtnl_unlock();
10186		wake_up(&netdev_unregistering_wq);
10187
10188		/* Free network device */
10189		kobject_put(&dev->dev.kobj);
10190	}
10191}
10192
10193/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10194 * all the same fields in the same order as net_device_stats, with only
10195 * the type differing, but rtnl_link_stats64 may have additional fields
10196 * at the end for newer counters.
10197 */
10198void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10199			     const struct net_device_stats *netdev_stats)
10200{
10201#if BITS_PER_LONG == 64
10202	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
10203	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
10204	/* zero out counters that only exist in rtnl_link_stats64 */
10205	memset((char *)stats64 + sizeof(*netdev_stats), 0,
10206	       sizeof(*stats64) - sizeof(*netdev_stats));
10207#else
10208	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
10209	const unsigned long *src = (const unsigned long *)netdev_stats;
10210	u64 *dst = (u64 *)stats64;
10211
10212	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
 
10213	for (i = 0; i < n; i++)
10214		dst[i] = src[i];
10215	/* zero out counters that only exist in rtnl_link_stats64 */
10216	memset((char *)stats64 + n * sizeof(u64), 0,
10217	       sizeof(*stats64) - n * sizeof(u64));
10218#endif
10219}
10220EXPORT_SYMBOL(netdev_stats_to_stats64);
10221
10222/**
10223 *	dev_get_stats	- get network device statistics
10224 *	@dev: device to get statistics from
10225 *	@storage: place to store stats
10226 *
10227 *	Get network statistics from device. Return @storage.
10228 *	The device driver may provide its own method by setting
10229 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10230 *	otherwise the internal statistics structure is used.
10231 */
10232struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10233					struct rtnl_link_stats64 *storage)
10234{
10235	const struct net_device_ops *ops = dev->netdev_ops;
10236
10237	if (ops->ndo_get_stats64) {
10238		memset(storage, 0, sizeof(*storage));
10239		ops->ndo_get_stats64(dev, storage);
10240	} else if (ops->ndo_get_stats) {
10241		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10242	} else {
10243		netdev_stats_to_stats64(storage, &dev->stats);
10244	}
10245	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
10246	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
10247	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
10248	return storage;
10249}
10250EXPORT_SYMBOL(dev_get_stats);
10251
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, attached to noop_qdisc and published in
 * dev->ingress_queue first; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* publish only once fully set up */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
10269
10270static const struct ethtool_ops default_ethtool_ops;
10271
10272void netdev_set_default_ethtool_ops(struct net_device *dev,
10273				    const struct ethtool_ops *ops)
10274{
10275	if (dev->ethtool_ops == &default_ethtool_ops)
10276		dev->ethtool_ops = ops;
10277}
10278EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10279
10280void netdev_freemem(struct net_device *dev)
10281{
10282	char *addr = (char *)dev - dev->padded;
10283
10284	kvfree(addr);
10285}
10286
10287/**
10288 * alloc_netdev_mqs - allocate network device
10289 * @sizeof_priv: size of private data to allocate space for
10290 * @name: device name format string
10291 * @name_assign_type: origin of device name
10292 * @setup: callback to initialize device
10293 * @txqs: the number of TX subqueues to allocate
10294 * @rxqs: the number of RX subqueues to allocate
10295 *
10296 * Allocates a struct net_device with private data area for driver use
10297 * and performs basic initialization.  Also allocates subqueue structs
10298 * for each queue on the device.
10299 */
10300struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10301		unsigned char name_assign_type,
10302		void (*setup)(struct net_device *),
10303		unsigned int txqs, unsigned int rxqs)
10304{
10305	struct net_device *dev;
10306	unsigned int alloc_size;
10307	struct net_device *p;
10308
10309	BUG_ON(strlen(name) >= sizeof(dev->name));
10310
10311	if (txqs < 1) {
10312		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10313		return NULL;
10314	}
10315
 
10316	if (rxqs < 1) {
10317		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10318		return NULL;
10319	}
 
10320
10321	alloc_size = sizeof(struct net_device);
10322	if (sizeof_priv) {
10323		/* ensure 32-byte alignment of private area */
10324		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10325		alloc_size += sizeof_priv;
10326	}
10327	/* ensure 32-byte alignment of whole construct */
10328	alloc_size += NETDEV_ALIGN - 1;
10329
10330	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
10331	if (!p)
 
10332		return NULL;
 
10333
10334	dev = PTR_ALIGN(p, NETDEV_ALIGN);
10335	dev->padded = (char *)dev - (char *)p;
10336
10337	dev->pcpu_refcnt = alloc_percpu(int);
10338	if (!dev->pcpu_refcnt)
10339		goto free_dev;
10340
10341	if (dev_addr_init(dev))
10342		goto free_pcpu;
10343
10344	dev_mc_init(dev);
10345	dev_uc_init(dev);
10346
10347	dev_net_set(dev, &init_net);
10348
10349	dev->gso_max_size = GSO_MAX_SIZE;
10350	dev->gso_max_segs = GSO_MAX_SEGS;
10351	dev->upper_level = 1;
10352	dev->lower_level = 1;
10353#ifdef CONFIG_LOCKDEP
10354	dev->nested_level = 0;
10355	INIT_LIST_HEAD(&dev->unlink_list);
10356#endif
10357
10358	INIT_LIST_HEAD(&dev->napi_list);
10359	INIT_LIST_HEAD(&dev->unreg_list);
10360	INIT_LIST_HEAD(&dev->close_list);
10361	INIT_LIST_HEAD(&dev->link_watch_list);
10362	INIT_LIST_HEAD(&dev->adj_list.upper);
10363	INIT_LIST_HEAD(&dev->adj_list.lower);
10364	INIT_LIST_HEAD(&dev->ptype_all);
10365	INIT_LIST_HEAD(&dev->ptype_specific);
10366	INIT_LIST_HEAD(&dev->net_notifier_list);
10367#ifdef CONFIG_NET_SCHED
10368	hash_init(dev->qdisc_hash);
10369#endif
10370	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10371	setup(dev);
10372
10373	if (!dev->tx_queue_len) {
10374		dev->priv_flags |= IFF_NO_QUEUE;
10375		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10376	}
10377
10378	dev->num_tx_queues = txqs;
10379	dev->real_num_tx_queues = txqs;
10380	if (netif_alloc_netdev_queues(dev))
10381		goto free_all;
10382
 
10383	dev->num_rx_queues = rxqs;
10384	dev->real_num_rx_queues = rxqs;
10385	if (netif_alloc_rx_queues(dev))
10386		goto free_all;
 
10387
10388	strcpy(dev->name, name);
10389	dev->name_assign_type = name_assign_type;
10390	dev->group = INIT_NETDEV_GROUP;
10391	if (!dev->ethtool_ops)
10392		dev->ethtool_ops = &default_ethtool_ops;
10393
10394	nf_hook_ingress_init(dev);
10395
10396	return dev;
10397
10398free_all:
10399	free_netdev(dev);
10400	return NULL;
10401
10402free_pcpu:
10403	free_percpu(dev->pcpu_refcnt);
10404free_dev:
10405	netdev_freemem(dev);
 
 
 
 
 
10406	return NULL;
10407}
10408EXPORT_SYMBOL(alloc_netdev_mqs);
10409
10410/**
10411 * free_netdev - free network device
10412 * @dev: device
10413 *
10414 * This function does the last stage of destroying an allocated device
10415 * interface. The reference to the device object is released. If this
10416 * is the last reference then it will be freed.Must be called in process
10417 * context.
10418 */
10419void free_netdev(struct net_device *dev)
10420{
10421	struct napi_struct *p, *n;
10422
10423	might_sleep();
10424	netif_free_tx_queues(dev);
10425	netif_free_rx_queues(dev);
 
 
 
10426
10427	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10428
10429	/* Flush device addresses */
10430	dev_addr_flush(dev);
10431
10432	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10433		netif_napi_del(p);
10434
10435	free_percpu(dev->pcpu_refcnt);
10436	dev->pcpu_refcnt = NULL;
10437	free_percpu(dev->xdp_bulkq);
10438	dev->xdp_bulkq = NULL;
10439
10440	/*  Compatibility with error handling in drivers */
10441	if (dev->reg_state == NETREG_UNINITIALIZED) {
10442		netdev_freemem(dev);
10443		return;
10444	}
10445
10446	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10447	dev->reg_state = NETREG_RELEASED;
10448
10449	/* will free via device release */
10450	put_device(&dev->dev);
10451}
10452EXPORT_SYMBOL(free_netdev);
10453
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait until packets that are being received right now are done
 *	with; packets arriving later are not held back.  The expedited
 *	RCU grace period is used when the rtnl lock is held.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
10469
10470/**
10471 *	unregister_netdevice_queue - remove device from the kernel
10472 *	@dev: device
10473 *	@head: list
10474 *
10475 *	This function shuts down a device interface and removes it
10476 *	from the kernel tables.
10477 *	If head not NULL, device is queued to be unregistered later.
10478 *
10479 *	Callers must hold the rtnl semaphore.  You may want
10480 *	unregister_netdev() instead of this.
10481 */
10482
10483void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10484{
10485	ASSERT_RTNL();
10486
10487	if (head) {
10488		list_move_tail(&dev->unreg_list, head);
10489	} else {
10490		rollback_registered(dev);
10491		/* Finish processing unregister after unlock */
10492		net_set_todo(dev);
10493	}
10494}
10495EXPORT_SYMBOL(unregister_netdevice_queue);
10496
10497/**
10498 *	unregister_netdevice_many - unregister many devices
10499 *	@head: list of devices
10500 *
10501 *  Note: As most callers use a stack allocated list_head,
10502 *  we force a list_del() to make sure stack wont be corrupted later.
10503 */
10504void unregister_netdevice_many(struct list_head *head)
10505{
10506	struct net_device *dev;
10507
10508	if (!list_empty(head)) {
10509		rollback_registered_many(head);
10510		list_for_each_entry(dev, head, unreg_list)
10511			net_set_todo(dev);
10512		list_del(head);
10513	}
10514}
10515EXPORT_SYMBOL(unregister_netdevice_many);
10516
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel
 *	tables.
 *
 *	Convenience wrapper that takes the rtnl semaphore around
 *	unregister_netdevice().  In general this is the function to
 *	use rather than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
10535
10536/**
10537 *	dev_change_net_namespace - move device to different nethost namespace
10538 *	@dev: device
10539 *	@net: network namespace
10540 *	@pat: If not NULL name pattern to try if the current device name
10541 *	      is already taken in the destination network namespace.
10542 *
10543 *	This function shuts down a device interface and moves it
10544 *	to a new network namespace. On success 0 is returned, on
10545 *	a failure a netagive errno code is returned.
10546 *
10547 *	Callers must hold the rtnl semaphore.
10548 */
10549
10550int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
10551{
10552	struct net *net_old = dev_net(dev);
10553	int err, new_nsid, new_ifindex;
10554
10555	ASSERT_RTNL();
10556
10557	/* Don't allow namespace local devices to be moved. */
10558	err = -EINVAL;
10559	if (dev->features & NETIF_F_NETNS_LOCAL)
10560		goto out;
10561
10562	/* Ensure the device has been registrered */
 
10563	if (dev->reg_state != NETREG_REGISTERED)
10564		goto out;
10565
10566	/* Get out if there is nothing todo */
10567	err = 0;
10568	if (net_eq(net_old, net))
10569		goto out;
10570
10571	/* Pick the destination device name, and ensure
10572	 * we can use it in the destination network namespace.
10573	 */
10574	err = -EEXIST;
10575	if (__dev_get_by_name(net, dev->name)) {
10576		/* We get here if we can't use the current device name */
10577		if (!pat)
10578			goto out;
10579		err = dev_get_valid_name(net, dev, pat);
10580		if (err < 0)
10581			goto out;
10582	}
10583
10584	/*
10585	 * And now a mini version of register_netdevice unregister_netdevice.
10586	 */
10587
10588	/* If device is running close it first. */
10589	dev_close(dev);
10590
10591	/* And unlink it from device chain */
 
10592	unlist_netdevice(dev);
10593
10594	synchronize_net();
10595
10596	/* Shutdown queueing discipline. */
10597	dev_shutdown(dev);
10598
10599	/* Notify protocols, that we are about to destroy
10600	 * this device. They should clean all the things.
10601	 *
10602	 * Note that dev->reg_state stays at NETREG_REGISTERED.
10603	 * This is wanted because this way 8021q and macvlan know
10604	 * the device is just moving and can keep their slaves up.
10605	 */
10606	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10607	rcu_barrier();
10608
10609	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10610	/* If there is an ifindex conflict assign a new one */
10611	if (__dev_get_by_index(net, dev->ifindex))
10612		new_ifindex = dev_new_index(net);
10613	else
10614		new_ifindex = dev->ifindex;
10615
10616	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
10617			    new_ifindex);
10618
10619	/*
10620	 *	Flush the unicast and multicast chains
10621	 */
10622	dev_uc_flush(dev);
10623	dev_mc_flush(dev);
10624
10625	/* Send a netdev-removed uevent to the old namespace */
10626	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10627	netdev_adjacent_del_links(dev);
10628
10629	/* Move per-net netdevice notifiers that are following the netdevice */
10630	move_netdevice_notifiers_dev_net(dev, net);
10631
10632	/* Actually switch the network namespace */
10633	dev_net_set(dev, net);
10634	dev->ifindex = new_ifindex;
10635
10636	/* Send a netdev-add uevent to the new namespace */
10637	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10638	netdev_adjacent_add_links(dev);
 
 
 
 
10639
10640	/* Fixup kobjects */
10641	err = device_rename(&dev->dev, dev->name);
10642	WARN_ON(err);
10643
10644	/* Adapt owner in case owning user namespace of target network
10645	 * namespace is different from the original one.
10646	 */
10647	err = netdev_change_owner(dev, net_old, net);
10648	WARN_ON(err);
10649
10650	/* Add the device back in the hashes */
10651	list_netdevice(dev);
10652
10653	/* Notify protocols, that a new device appeared. */
10654	call_netdevice_notifiers(NETDEV_REGISTER, dev);
10655
10656	/*
10657	 *	Prevent userspace races by waiting until the network
10658	 *	device is fully setup before sending notifications.
10659	 */
10660	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10661
10662	synchronize_net();
10663	err = 0;
10664out:
10665	return err;
10666}
10667EXPORT_SYMBOL_GPL(dev_change_net_namespace);
10668
10669static int dev_cpu_dead(unsigned int oldcpu)
 
 
10670{
10671	struct sk_buff **list_skb;
10672	struct sk_buff *skb;
10673	unsigned int cpu;
10674	struct softnet_data *sd, *oldsd, *remsd = NULL;
 
 
 
10675
10676	local_irq_disable();
10677	cpu = smp_processor_id();
10678	sd = &per_cpu(softnet_data, cpu);
10679	oldsd = &per_cpu(softnet_data, oldcpu);
10680
10681	/* Find end of our completion_queue. */
10682	list_skb = &sd->completion_queue;
10683	while (*list_skb)
10684		list_skb = &(*list_skb)->next;
10685	/* Append completion queue from offline CPU. */
10686	*list_skb = oldsd->completion_queue;
10687	oldsd->completion_queue = NULL;
10688
10689	/* Append output queue from offline CPU. */
10690	if (oldsd->output_queue) {
10691		*sd->output_queue_tailp = oldsd->output_queue;
10692		sd->output_queue_tailp = oldsd->output_queue_tailp;
10693		oldsd->output_queue = NULL;
10694		oldsd->output_queue_tailp = &oldsd->output_queue;
10695	}
10696	/* Append NAPI poll list from offline CPU, with one exception :
10697	 * process_backlog() must be called by cpu owning percpu backlog.
10698	 * We properly handle process_queue & input_pkt_queue later.
10699	 */
10700	while (!list_empty(&oldsd->poll_list)) {
10701		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
10702							    struct napi_struct,
10703							    poll_list);
10704
10705		list_del_init(&napi->poll_list);
10706		if (napi->poll == process_backlog)
10707			napi->state = 0;
10708		else
10709			____napi_schedule(sd, napi);
10710	}
10711
10712	raise_softirq_irqoff(NET_TX_SOFTIRQ);
10713	local_irq_enable();
10714
10715#ifdef CONFIG_RPS
10716	remsd = oldsd->rps_ipi_list;
10717	oldsd->rps_ipi_list = NULL;
10718#endif
10719	/* send out pending IPI's on offline CPU */
10720	net_rps_send_ipi(remsd);
10721
10722	/* Process offline CPU's input_pkt_queue */
10723	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
10724		netif_rx_ni(skb);
10725		input_queue_head_incr(oldsd);
10726	}
10727	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10728		netif_rx_ni(skb);
10729		input_queue_head_incr(oldsd);
10730	}
10731
10732	return 0;
10733}
10734
 
10735/**
10736 *	netdev_increment_features - increment feature set by one
10737 *	@all: current feature set
10738 *	@one: new feature set
10739 *	@mask: mask feature set
10740 *
10741 *	Computes a new feature set after adding a device with feature set
10742 *	@one to the master device with current feature set @all.  Will not
10743 *	enable anything that is off in @mask. Returns the new feature set.
10744 */
10745netdev_features_t netdev_increment_features(netdev_features_t all,
10746	netdev_features_t one, netdev_features_t mask)
10747{
10748	if (mask & NETIF_F_HW_CSUM)
10749		mask |= NETIF_F_CSUM_MASK;
10750	mask |= NETIF_F_VLAN_CHALLENGED;
10751
10752	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
10753	all &= one | ~NETIF_F_ALL_FOR_ALL;
10754
10755	/* If one device supports hw checksumming, set for all. */
10756	if (all & NETIF_F_HW_CSUM)
10757		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
10758
10759	return all;
10760}
10761EXPORT_SYMBOL(netdev_increment_features);
10762
10763static struct hlist_head * __net_init netdev_create_hash(void)
10764{
10765	int i;
10766	struct hlist_head *hash;
10767
10768	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
10769	if (hash != NULL)
10770		for (i = 0; i < NETDEV_HASHENTRIES; i++)
10771			INIT_HLIST_HEAD(&hash[i]);
10772
10773	return hash;
10774}
10775
10776/* Initialize per network namespace state */
10777static int __net_init netdev_init(struct net *net)
10778{
10779	BUILD_BUG_ON(GRO_HASH_BUCKETS >
10780		     8 * sizeof_field(struct napi_struct, gro_bitmask));
10781
10782	if (net != &init_net)
10783		INIT_LIST_HEAD(&net->dev_base_head);
10784
10785	net->dev_name_head = netdev_create_hash();
10786	if (net->dev_name_head == NULL)
10787		goto err_name;
10788
10789	net->dev_index_head = netdev_create_hash();
10790	if (net->dev_index_head == NULL)
10791		goto err_idx;
10792
10793	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
10794
10795	return 0;
10796
10797err_idx:
10798	kfree(net->dev_name_head);
10799err_name:
10800	return -ENOMEM;
10801}
10802
10803/**
10804 *	netdev_drivername - network driver for the device
10805 *	@dev: network device
10806 *
10807 *	Determine network driver for device.
10808 */
10809const char *netdev_drivername(const struct net_device *dev)
10810{
10811	const struct device_driver *driver;
10812	const struct device *parent;
10813	const char *empty = "";
10814
10815	parent = dev->dev.parent;
10816	if (!parent)
10817		return empty;
10818
10819	driver = parent->driver;
10820	if (driver && driver->name)
10821		return driver->name;
10822	return empty;
10823}
10824
10825static void __netdev_printk(const char *level, const struct net_device *dev,
10826			    struct va_format *vaf)
10827{
10828	if (dev && dev->dev.parent) {
10829		dev_printk_emit(level[1] - '0',
10830				dev->dev.parent,
10831				"%s %s %s%s: %pV",
10832				dev_driver_string(dev->dev.parent),
10833				dev_name(dev->dev.parent),
10834				netdev_name(dev), netdev_reg_state(dev),
10835				vaf);
10836	} else if (dev) {
10837		printk("%s%s%s: %pV",
10838		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
10839	} else {
10840		printk("%s(NULL net_device): %pV", level, vaf);
10841	}
10842}
 
10843
10844void netdev_printk(const char *level, const struct net_device *dev,
10845		   const char *format, ...)
10846{
10847	struct va_format vaf;
10848	va_list args;
 
10849
10850	va_start(args, format);
10851
10852	vaf.fmt = format;
10853	vaf.va = &args;
10854
10855	__netdev_printk(level, dev, &vaf);
 
10856
10857	va_end(args);
10858}
10859EXPORT_SYMBOL(netdev_printk);
10860
10861#define define_netdev_printk_level(func, level)			\
10862void func(const struct net_device *dev, const char *fmt, ...)	\
10863{								\
 
10864	struct va_format vaf;					\
10865	va_list args;						\
10866								\
10867	va_start(args, fmt);					\
10868								\
10869	vaf.fmt = fmt;						\
10870	vaf.va = &args;						\
10871								\
10872	__netdev_printk(level, dev, &vaf);			\
 
10873								\
10874	va_end(args);						\
10875}								\
10876EXPORT_SYMBOL(func);
10877
10878define_netdev_printk_level(netdev_emerg, KERN_EMERG);
10879define_netdev_printk_level(netdev_alert, KERN_ALERT);
10880define_netdev_printk_level(netdev_crit, KERN_CRIT);
10881define_netdev_printk_level(netdev_err, KERN_ERR);
10882define_netdev_printk_level(netdev_warn, KERN_WARNING);
10883define_netdev_printk_level(netdev_notice, KERN_NOTICE);
10884define_netdev_printk_level(netdev_info, KERN_INFO);
10885
10886static void __net_exit netdev_exit(struct net *net)
10887{
10888	kfree(net->dev_name_head);
10889	kfree(net->dev_index_head);
10890	if (net != &init_net)
10891		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
10892}
10893
10894static struct pernet_operations __net_initdata netdev_net_ops = {
10895	.init = netdev_init,
10896	.exit = netdev_exit,
10897};
10898
10899static void __net_exit default_device_exit(struct net *net)
10900{
10901	struct net_device *dev, *aux;
10902	/*
10903	 * Push all migratable network devices back to the
10904	 * initial network namespace
10905	 */
10906	rtnl_lock();
10907	for_each_netdev_safe(net, dev, aux) {
10908		int err;
10909		char fb_name[IFNAMSIZ];
10910
10911		/* Ignore unmoveable devices (i.e. loopback) */
10912		if (dev->features & NETIF_F_NETNS_LOCAL)
10913			continue;
10914
10915		/* Leave virtual devices for the generic cleanup */
10916		if (dev->rtnl_link_ops)
10917			continue;
10918
10919		/* Push remaining network devices to init_net */
10920		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
10921		if (__dev_get_by_name(&init_net, fb_name))
10922			snprintf(fb_name, IFNAMSIZ, "dev%%d");
10923		err = dev_change_net_namespace(dev, &init_net, fb_name);
10924		if (err) {
10925			pr_emerg("%s: failed to move %s to init_net: %d\n",
10926				 __func__, dev->name, err);
10927			BUG();
10928		}
10929	}
10930	rtnl_unlock();
10931}
10932
10933static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
10934{
10935	/* Return with the rtnl_lock held when there are no network
10936	 * devices unregistering in any network namespace in net_list.
10937	 */
10938	struct net *net;
10939	bool unregistering;
10940	DEFINE_WAIT_FUNC(wait, woken_wake_function);
10941
10942	add_wait_queue(&netdev_unregistering_wq, &wait);
10943	for (;;) {
10944		unregistering = false;
10945		rtnl_lock();
10946		list_for_each_entry(net, net_list, exit_list) {
10947			if (net->dev_unreg_count > 0) {
10948				unregistering = true;
10949				break;
10950			}
10951		}
10952		if (!unregistering)
10953			break;
10954		__rtnl_unlock();
10955
10956		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
10957	}
10958	remove_wait_queue(&netdev_unregistering_wq, &wait);
10959}
10960
10961static void __net_exit default_device_exit_batch(struct list_head *net_list)
10962{
10963	/* At exit all network devices most be removed from a network
10964	 * namespace.  Do this in the reverse order of registration.
10965	 * Do this across as many network namespaces as possible to
10966	 * improve batching efficiency.
10967	 */
10968	struct net_device *dev;
10969	struct net *net;
10970	LIST_HEAD(dev_kill_list);
10971
10972	/* To prevent network device cleanup code from dereferencing
10973	 * loopback devices or network devices that have been freed
10974	 * wait here for all pending unregistrations to complete,
10975	 * before unregistring the loopback device and allowing the
10976	 * network namespace be freed.
10977	 *
10978	 * The netdev todo list containing all network devices
10979	 * unregistrations that happen in default_device_exit_batch
10980	 * will run in the rtnl_unlock() at the end of
10981	 * default_device_exit_batch.
10982	 */
10983	rtnl_lock_unregistering(net_list);
10984	list_for_each_entry(net, net_list, exit_list) {
10985		for_each_netdev_reverse(net, dev) {
10986			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
10987				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
10988			else
10989				unregister_netdevice_queue(dev, &dev_kill_list);
10990		}
10991	}
10992	unregister_netdevice_many(&dev_kill_list);
 
10993	rtnl_unlock();
10994}
10995
10996static struct pernet_operations __net_initdata default_device_ops = {
10997	.exit = default_device_exit,
10998	.exit_batch = default_device_exit_batch,
10999};
11000
11001/*
11002 *	Initialize the DEV module. At boot time this walks the device list and
11003 *	unhooks any devices that fail to initialise (normally hardware not
11004 *	present) and leaves us with a valid list of present and active devices.
11005 *
11006 */
11007
11008/*
11009 *       This is called single threaded during boot, so no need
11010 *       to take the rtnl semaphore.
11011 */
11012static int __init net_dev_init(void)
11013{
11014	int i, rc = -ENOMEM;
11015
11016	BUG_ON(!dev_boot_phase);
11017
11018	if (dev_proc_init())
11019		goto out;
11020
11021	if (netdev_kobject_init())
11022		goto out;
11023
11024	INIT_LIST_HEAD(&ptype_all);
11025	for (i = 0; i < PTYPE_HASH_SIZE; i++)
11026		INIT_LIST_HEAD(&ptype_base[i]);
11027
11028	INIT_LIST_HEAD(&offload_base);
11029
11030	if (register_pernet_subsys(&netdev_net_ops))
11031		goto out;
11032
11033	/*
11034	 *	Initialise the packet receive queues.
11035	 */
11036
11037	for_each_possible_cpu(i) {
11038		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11039		struct softnet_data *sd = &per_cpu(softnet_data, i);
11040
11041		INIT_WORK(flush, flush_backlog);
11042
11043		skb_queue_head_init(&sd->input_pkt_queue);
11044		skb_queue_head_init(&sd->process_queue);
11045#ifdef CONFIG_XFRM_OFFLOAD
11046		skb_queue_head_init(&sd->xfrm_backlog);
11047#endif
11048		INIT_LIST_HEAD(&sd->poll_list);
 
11049		sd->output_queue_tailp = &sd->output_queue;
11050#ifdef CONFIG_RPS
11051		sd->csd.func = rps_trigger_softirq;
11052		sd->csd.info = sd;
 
11053		sd->cpu = i;
11054#endif
11055
11056		init_gro_hash(&sd->backlog);
11057		sd->backlog.poll = process_backlog;
11058		sd->backlog.weight = weight_p;
 
 
11059	}
11060
11061	dev_boot_phase = 0;
11062
11063	/* The loopback device is special if any other network devices
11064	 * is present in a network namespace the loopback device must
11065	 * be present. Since we now dynamically allocate and free the
11066	 * loopback device ensure this invariant is maintained by
11067	 * keeping the loopback device as the first device on the
11068	 * list of network devices.  Ensuring the loopback devices
11069	 * is the first device that appears and the last network device
11070	 * that disappears.
11071	 */
11072	if (register_pernet_device(&loopback_net_ops))
11073		goto out;
11074
11075	if (register_pernet_device(&default_device_ops))
11076		goto out;
11077
11078	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11079	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11080
11081	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11082				       NULL, dev_cpu_dead);
11083	WARN_ON(rc < 0);
11084	rc = 0;
11085out:
11086	return rc;
11087}
11088
11089subsys_initcall(net_dev_init);